mirror of
https://github.com/gitpod-io/gitpod.git
synced 2025-12-08 17:36:30 +00:00
# `No Data` Variant: This commit adds a new variant that the `Analysis.MoveForward` function can return. This variant indicates that there isn't enough data to make a concrete decision about the rollout. It is coupled with `Rollout.OkayScoreUntilNoData` to keep moving the rollout forward up to a specific score even when there is no data, so that data can actually be generated. If no data is present even after `OkayScoreUntilNoData` is reached, we roll back, as we would not be making an informed rollout. If the analysis is positive, we move forward. If it is negative, we roll back. # Target Based Metric Analysis: This commit also adds a new `ErrorRatioAnalyzer`, through which we calculate the success percentage as ((totalRequests - errorRequests) / totalRequests) * 100 and compare it with the target percentage provided by the user. This means users can specify a target percentage (e.g. 99%) at which a new cluster can be considered safe. If the measured percentage is lower than that target, we roll back. Signed-off-by: Tarun Pothulapati <tarun@gitpod.io>
137 lines
4.8 KiB
Go
137 lines
4.8 KiB
Go
// Copyright (c) 2022 Gitpod GmbH. All rights reserved.
|
|
// Licensed under the GNU Affero General Public License (AGPL).
|
|
// See License-AGPL.txt in the project root for license information.
|
|
|
|
package cmd
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"time"
|
|
|
|
"github.com/gitpod-io/gitpod/common-go/baseserver"
|
|
"github.com/gitpod-io/gitpod/common-go/log"
|
|
"github.com/gitpod-io/gitpod/workspace-rollout-job/pkg/analysis"
|
|
"github.com/gitpod-io/gitpod/workspace-rollout-job/pkg/rollout"
|
|
"github.com/gitpod-io/gitpod/workspace-rollout-job/pkg/wsbridge"
|
|
"github.com/spf13/cobra"
|
|
"k8s.io/client-go/rest"
|
|
"k8s.io/client-go/tools/clientcmd"
|
|
)
|
|
|
|
// version is the version string handed to the base server through
// baseserver.WithVersion.
const version string = "0.0.0"
|
|
|
|
// config collects the values of the command line flags of the rollout job.
type config struct {
	// oldCluster is the name of the old cluster with score 100 (--old-cluster).
	oldCluster string
	// newCluster is the name of the new cluster with score 0 (--new-cluster).
	newCluster string
	// prometheusService identifies the Prometheus resource in the format
	// <namespace>/<kind>/<name> (--prometheus-resource).
	prometheusService string
	// rollOutWaitDuration is the duration to wait before updating the score
	// of the new cluster (--rollout-wait-duration).
	rollOutWaitDuration time.Duration
	// analsysWaitDuration is the duration to wait before analyzing the
	// metrics (--analysis-wait-duration).
	// NOTE(review): the field name is misspelled ("analsys"); it is kept
	// because other code refers to it by this name.
	analsysWaitDuration time.Duration
	// rolloutStepScore is the score added to the new cluster and removed
	// from the old cluster on each step (--rollout-step-score).
	rolloutStepScore int32
	// okayScoreUntilNoData is the score below which missing data is still
	// considered okay (--okay-score-until-no-data).
	okayScoreUntilNoData int32
	// targetPositivePercentage is the target percentage of positive metrics
	// (--target-positive-percentage).
	targetPositivePercentage int
}
|
|
|
|
var (
|
|
conf config
|
|
)
|
|
|
|
var rootCmd = &cobra.Command{
|
|
Short: "Rollout from old to a new cluster while monitoring metrics",
|
|
Run: func(cmd *cobra.Command, args []string) {
|
|
log.Info("Starting workspace-rollout-job")
|
|
ctx := context.Background()
|
|
var err error
|
|
|
|
// Get kubeconfig
|
|
config, err := getKubeConfig()
|
|
if err != nil {
|
|
log.WithError(err).Fatal("failed to retrieve kube config")
|
|
}
|
|
|
|
serverOpts := []baseserver.Option{
|
|
baseserver.WithVersion(version),
|
|
}
|
|
|
|
srv, err := baseserver.New("workspace-rollout-job", serverOpts...)
|
|
if err != nil {
|
|
log.WithError(err).Fatal("failed to initialize server")
|
|
return
|
|
}
|
|
|
|
// Run in a separate routine as this is not the main purpose
|
|
go srv.ListenAndServe()
|
|
if err != nil {
|
|
log.WithError(err).Fatal("failed to listen and serve")
|
|
return
|
|
}
|
|
|
|
rollout.RegisterMetrics(srv.MetricsRegistry())
|
|
|
|
// 30304 is the port where ws-manager-bridge will be accessible
|
|
wsManagerBridgeClient, err := wsbridge.NewWsManagerBridgeClient(context.Background(), config, 30304)
|
|
if err != nil {
|
|
log.WithError(err).Fatal("failed to create a ws-manager-bridge client")
|
|
return
|
|
}
|
|
|
|
// Check if the old cluster has a 100 score.
|
|
if score, err := wsManagerBridgeClient.GetScore(ctx, conf.oldCluster); err != nil || score != 100 {
|
|
log.WithError(err).Fatal("init condition does not satisfy")
|
|
}
|
|
|
|
// Check if the new cluster has a 0 zero score.
|
|
// TODO: Check if the new cluster has no constraints.
|
|
if score, err := wsManagerBridgeClient.GetScore(ctx, conf.newCluster); err != nil || score != 0 {
|
|
log.WithError(err).Fatal("init condition does not satisfy")
|
|
}
|
|
|
|
// Start the rollout process
|
|
prometheusAnalyzer, err := analysis.NewErrorRatioAnalyzer(ctx, config, conf.prometheusService, conf.targetPositivePercentage, 30305)
|
|
if err != nil {
|
|
log.WithError(err).Fatal("failed to create a prometheus client")
|
|
return
|
|
}
|
|
|
|
job := rollout.New(conf.oldCluster, conf.newCluster, conf.rollOutWaitDuration, conf.analsysWaitDuration, conf.rolloutStepScore, conf.okayScoreUntilNoData, prometheusAnalyzer, wsManagerBridgeClient)
|
|
job.Start(ctx)
|
|
},
|
|
}
|
|
|
|
func getKubeConfig() (*rest.Config, error) {
|
|
var config *rest.Config
|
|
config, err := rest.InClusterConfig()
|
|
if err != nil {
|
|
kubeConfig := clientcmd.NewDefaultClientConfigLoadingRules().GetDefaultFilename()
|
|
config, err = clientcmd.BuildConfigFromFlags("", kubeConfig)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
return config, nil
|
|
}
|
|
|
|
// Execute adds all child commands to the root command and sets flags appropriately.
|
|
// This is called by main.main(). It only needs to happen once to the rootCmd.
|
|
func Execute() {
|
|
rootCmd.Flags().StringVar(&conf.oldCluster, "old-cluster", "", "Name of the old cluster with score 100")
|
|
rootCmd.Flags().StringVar(&conf.newCluster, "new-cluster", "", "Name of the new cluster with score 0")
|
|
rootCmd.Flags().StringVar(&conf.prometheusService, "prometheus-resource", "", "Please set in the format <namespace>/<kind>/<name>")
|
|
rootCmd.Flags().DurationVar(&conf.rollOutWaitDuration, "rollout-wait-duration", 20*time.Second, "Duration to wait before updating the score of the new cluster")
|
|
rootCmd.Flags().DurationVar(&conf.analsysWaitDuration, "analysis-wait-duration", 1*time.Second, "Duration to wait before analyzing the metrics")
|
|
rootCmd.Flags().Int32Var(&conf.rolloutStepScore, "rollout-step-score", 10, "Score to be added to the new cluster, and decreased to the old cluster")
|
|
rootCmd.Flags().Int32Var(&conf.okayScoreUntilNoData, "okay-score-until-no-data", 60, "If the score is below this value, and there is no data, the rollout score will be considered okay")
|
|
rootCmd.Flags().IntVar(&conf.targetPositivePercentage, "target-positive-percentage", 95, "Target percentage of positive metrics")
|
|
|
|
rootCmd.MarkFlagRequired("old-cluster")
|
|
rootCmd.MarkFlagRequired("new-cluster")
|
|
rootCmd.MarkFlagRequired("prometheus-resource")
|
|
if err := rootCmd.Execute(); err != nil {
|
|
fmt.Println(err)
|
|
os.Exit(1)
|
|
}
|
|
}
|