Tarun Pothulapati f05f5bd1c3 [ws-rollout] Add prometheus init check
When given an unusable `--prometheus-url`, we start the
rollout without verifying whether Prometheus is reachable. This
is a problem because we will be unable to get the metrics from Prometheus,
and hence the rollout will be reverted later, wasting time
unnecessarily.

This can be prevented by performing a simple check to see whether
Prometheus is reachable. The `up` query is used instead of the
key metrics because we can't be sure that those metrics exist.

Signed-off-by: Tarun Pothulapati <tarun@gitpod.io>
2023-01-30 22:04:38 +01:00

152 lines
5.2 KiB
Go

// Copyright (c) 2022 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License-AGPL.txt in the project root for license information.
package cmd
import (
"context"
"fmt"
"os"
"time"
"github.com/gitpod-io/gitpod/common-go/baseserver"
"github.com/gitpod-io/gitpod/common-go/log"
"github.com/gitpod-io/gitpod/workspace-rollout-job/pkg/analysis"
"github.com/gitpod-io/gitpod/workspace-rollout-job/pkg/rollout"
"github.com/gitpod-io/gitpod/workspace-rollout-job/pkg/wsbridge"
"github.com/spf13/cobra"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
)
// Version is the version string attached to the root command and handed to
// the base server.
// NOTE(review): presumably injected at build time (e.g. via -ldflags) — confirm.
var Version string

// conf holds the values of the command-line flags bound in Execute.
var conf config
// config collects every tunable of the rollout job. Each field is bound to a
// command-line flag in Execute.
type config struct {
	// Names of the cluster being drained and of the cluster taking over.
	oldCluster, newCluster string
	// Address of the Prometheus service that is queried for metrics.
	prometheusURL string

	// Wait before updating the new cluster's score, and wait before analyzing
	// the metrics. (analsysWaitDuration keeps its existing spelling because
	// other code refers to it by this name.)
	rollOutWaitDuration, analsysWaitDuration time.Duration

	// Score moved from the old to the new cluster per step, and the score
	// below which missing data is still considered okay.
	rolloutStepScore, okayScoreUntilNoData int32

	// Target percentage of positive metrics.
	targetPositivePercentage int
}
// rootCmd is the root command of workspace-rollout-job. Its RunE validates the
// configuration, starts the base server that exposes the job's Prometheus
// metrics, verifies the initial conditions (old cluster at score 100, new
// cluster at score 0, Prometheus reachable) and then runs the rollout.
//
// Setup failures are reported through log.Fatal, which is expected to
// terminate the process; the return statements that follow are kept as a
// fallback in case it does not.
var rootCmd = &cobra.Command{
	Short: "Rollout from old to a new cluster while monitoring metrics",
	RunE: func(cmd *cobra.Command, args []string) error {
		log.Info("Starting workspace-rollout-job")
		ctx := context.Background()

		if conf.rolloutStepScore <= 0 {
			return fmt.Errorf("rollout step score must be greater than 0")
		}

		// Get kubeconfig. Named kubeConfig so it does not shadow the config type.
		kubeConfig, err := getKubeConfig()
		if err != nil {
			log.WithError(err).Fatal("failed to retrieve kube config")
			return err
		}

		serverOpts := []baseserver.Option{
			baseserver.WithVersion(Version),
		}
		srv, err := baseserver.New("workspace-rollout-job", serverOpts...)
		if err != nil {
			log.WithError(err).Fatal("failed to initialize server")
			return err
		}

		// Run in a separate routine as this is not the main purpose.
		// This is used to expose prometheus metrics.
		go func() {
			// The error is scoped to this goroutine on purpose: assigning to
			// the enclosing function's err would race with the checks below.
			if err := srv.ListenAndServe(); err != nil {
				log.WithError(err).Fatal("failed to listen and serve")
				os.Exit(1)
			}
		}()
		rollout.RegisterMetrics(srv.MetricsRegistry())

		// 30304 is the port where ws-manager-bridge will be accessible
		wsManagerBridgeClient, err := wsbridge.NewWsManagerBridgeClient(ctx, kubeConfig, 30304)
		if err != nil {
			log.WithError(err).Fatal("failed to create a ws-manager-bridge client")
			return err
		}

		// Check if the old cluster has a score of 100. A score mismatch is
		// turned into an explicit error so the failure is never reported with
		// a nil error.
		oldScore, err := wsManagerBridgeClient.GetScore(ctx, conf.oldCluster)
		if err == nil && oldScore != 100 {
			err = fmt.Errorf("old cluster %q has score %v, expected 100", conf.oldCluster, oldScore)
		}
		if err != nil {
			log.WithError(err).Fatal("init condition does not satisfy")
			return err
		}

		// Check if the new cluster has a score of 0.
		// TODO: Check if the new cluster has no constraints.
		newScore, err := wsManagerBridgeClient.GetScore(ctx, conf.newCluster)
		if err == nil && newScore != 0 {
			err = fmt.Errorf("new cluster %q has score %v, expected 0", conf.newCluster, newScore)
		}
		if err != nil {
			log.WithError(err).Fatal("init condition does not satisfy")
			return err
		}

		// Check if prometheus is reachable before starting, so that an
		// unusable URL fails here instead of during the rollout.
		if err := analysis.CheckPrometheusReachable(ctx, conf.prometheusURL); err != nil {
			log.WithError(err).Fatal("init: prometheus is not reachable")
			return err
		}

		// NOTE(review): 30305 is presumably the port used to reach Prometheus —
		// confirm against NewWorkspaceKeyMetricsAnalyzer.
		prometheusAnalyzer, err := analysis.NewWorkspaceKeyMetricsAnalyzer(ctx, kubeConfig, conf.prometheusURL, conf.targetPositivePercentage, 30305)
		if err != nil {
			log.WithError(err).Fatal("failed to create a prometheus client")
			return err
		}

		job := rollout.New(conf.oldCluster, conf.newCluster, conf.rollOutWaitDuration, conf.analsysWaitDuration, conf.rolloutStepScore, conf.okayScoreUntilNoData, prometheusAnalyzer, wsManagerBridgeClient)
		return job.Start(ctx)
	},
}
// getKubeConfig returns the Kubernetes client configuration. The in-cluster
// configuration is tried first; if that fails, the configuration is built from
// the default kubeconfig file instead. An error is returned only when the
// fallback fails as well.
func getKubeConfig() (*rest.Config, error) {
	if inCluster, err := rest.InClusterConfig(); err == nil {
		return inCluster, nil
	}

	// In-cluster lookup failed: fall back to the default kubeconfig file.
	path := clientcmd.NewDefaultClientConfigLoadingRules().GetDefaultFilename()
	fromFile, err := clientcmd.BuildConfigFromFlags("", path)
	if err != nil {
		return nil, err
	}
	return fromFile, nil
}
// Execute binds the command-line flags to conf, marks the mandatory flags as
// required, and runs rootCmd. This is called by main.main(). It only needs to
// happen once to the rootCmd.
//
// Any failure — an unknown flag name while marking flags required, or an error
// returned by the command — is written to standard error and the process exits
// with status 1.
func Execute() {
	flags := rootCmd.Flags()
	flags.StringVar(&conf.oldCluster, "old-cluster", "", "Name of the old cluster with score 100")
	flags.StringVar(&conf.newCluster, "new-cluster", "", "Name of the new cluster with score 0")
	flags.StringVar(&conf.prometheusURL, "prometheus-url", "", "URL of Prometheus Service")
	flags.DurationVar(&conf.rollOutWaitDuration, "rollout-wait-duration", 50*time.Second, "Duration to wait before updating the score of the new cluster")
	flags.DurationVar(&conf.analsysWaitDuration, "analysis-wait-duration", 1*time.Second, "Duration to wait before analyzing the metrics")
	flags.Int32Var(&conf.rolloutStepScore, "rollout-step-score", 10, "Score to be added to the new cluster, and decreased from the old cluster")
	flags.Int32Var(&conf.okayScoreUntilNoData, "okay-score-until-no-data", 60, "If the score is below this value, and there is no data, the rollout score will be considered okay")
	flags.IntVar(&conf.targetPositivePercentage, "target-positive-percentage", 95, "Target percentage of positive metrics")

	// A failure here means the flag name does not match a registered flag;
	// ignoring it would silently drop the "required" constraint.
	for _, name := range []string{"old-cluster", "new-cluster", "prometheus-url"} {
		if err := rootCmd.MarkFlagRequired(name); err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
	}

	rootCmd.Version = Version
	if err := rootCmd.Execute(); err != nil {
		// Errors belong on stderr so they do not mix with regular output.
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}