Tarun Pothulapati ed79a89d6b analysis: Add new No Data Variant & Target Based Metric Analysis
\# `No Data` Variant

This commit adds a new variant that the `Analysis.MoveForward`
function can return. This variant is used to indicate that there
isn't enough data to make a concrete decision about the rollout.

This is then coupled with `Rollout.OkayScoreUntilNoData` to
move the rollout forward until a specific point even when there is
no data, so that data can actually be created. If no data
is present even after `OkayScoreUntilNoData` is reached, then
we roll back, as we aren't making an informed rollout. If the analysis
is positive, we move forward. If it is negative, we roll back.

\# Target Based Metric Analysis

In this commit, we add a new `ErrorRatioAnalyzer` through which we
calculate the success percentage as
((totalRequests - errorRequests) / totalRequests) * 100 and compare it
with the target percentage provided by the user. This means users can
specify a target percentage (e.g. 99%) at which a new cluster
can be considered safe. We roll back if the success percentage is less than that.

Signed-off-by: Tarun Pothulapati <tarun@gitpod.io>
2023-01-23 18:01:31 +01:00

137 lines
4.8 KiB
Go

// Copyright (c) 2022 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License-AGPL.txt in the project root for license information.
package cmd
import (
"context"
"fmt"
"os"
"time"
"github.com/gitpod-io/gitpod/common-go/baseserver"
"github.com/gitpod-io/gitpod/common-go/log"
"github.com/gitpod-io/gitpod/workspace-rollout-job/pkg/analysis"
"github.com/gitpod-io/gitpod/workspace-rollout-job/pkg/rollout"
"github.com/gitpod-io/gitpod/workspace-rollout-job/pkg/wsbridge"
"github.com/spf13/cobra"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
)
// version is the version string handed to the base server via
// baseserver.WithVersion.
const version string = "0.0.0"
// config collects the command-line settings of the rollout job. The fields
// are filled in from the flags registered in Execute.
type config struct {
	// oldCluster is the name of the old cluster (expected to start at score
	// 100); newCluster is the name of the new cluster (expected to start at
	// score 0).
	oldCluster, newCluster string

	// prometheusService identifies the Prometheus resource in the format
	// <namespace>/<kind>/<name>.
	prometheusService string

	// rollOutWaitDuration is the duration to wait before updating the score
	// of the new cluster.
	rollOutWaitDuration time.Duration

	// analsysWaitDuration is the duration to wait before analyzing the
	// metrics. The spelling ("analsys") is kept because other code in this
	// file refers to the field by this name.
	analsysWaitDuration time.Duration

	// rolloutStepScore is the score added to the new cluster, and removed
	// from the old cluster.
	rolloutStepScore int32

	// okayScoreUntilNoData: if the score is below this value and there is no
	// data, the rollout score is considered okay.
	okayScoreUntilNoData int32

	// targetPositivePercentage is the target percentage of positive metrics.
	targetPositivePercentage int
}
// conf is the package-level configuration that the command-line flags are
// bound to and that the root command reads when it runs.
var conf config
// rootCmd is the root command of the workspace-rollout-job.
//
// Its Run function:
//  1. loads the Kubernetes client configuration,
//  2. starts the base server in the background and registers the rollout
//     metrics on its registry,
//  3. creates a ws-manager-bridge client and verifies that the old cluster
//     has score 100 and the new cluster has score 0,
//  4. creates an error-ratio analyzer and starts the rollout job.
//
// Every setup failure is logged with Fatal.
var rootCmd = &cobra.Command{
	Short: "Rollout from old to a new cluster while monitoring metrics",
	Run: func(cmd *cobra.Command, args []string) {
		log.Info("Starting workspace-rollout-job")
		ctx := context.Background()

		// Get kubeconfig. The local is deliberately not called "config" so
		// that it does not shadow the package-level config type.
		kubeConfig, err := getKubeConfig()
		if err != nil {
			log.WithError(err).Fatal("failed to retrieve kube config")
			return
		}

		serverOpts := []baseserver.Option{
			baseserver.WithVersion(version),
		}
		srv, err := baseserver.New("workspace-rollout-job", serverOpts...)
		if err != nil {
			log.WithError(err).Fatal("failed to initialize server")
			return
		}

		// Run in a separate routine as this is not the main purpose.
		// The error has to be inspected inside the goroutine: checking a
		// variable right after the `go` statement only ever sees a stale
		// value. A server failure is reported but does not abort the rollout.
		go func() {
			if serveErr := srv.ListenAndServe(); serveErr != nil {
				log.WithError(serveErr).Error("failed to listen and serve")
			}
		}()

		rollout.RegisterMetrics(srv.MetricsRegistry())

		// 30304 is the port where ws-manager-bridge will be accessible
		wsManagerBridgeClient, err := wsbridge.NewWsManagerBridgeClient(ctx, kubeConfig, 30304)
		if err != nil {
			log.WithError(err).Fatal("failed to create a ws-manager-bridge client")
			return
		}

		// Check if the old cluster has a 100 score. A lookup failure and an
		// unexpected score are reported separately so the log always carries
		// a meaningful error.
		oldScore, err := wsManagerBridgeClient.GetScore(ctx, conf.oldCluster)
		if err != nil {
			log.WithError(err).Fatal("init condition does not satisfy")
			return
		}
		if oldScore != 100 {
			log.WithError(fmt.Errorf("old cluster %q has score %v, expected 100", conf.oldCluster, oldScore)).Fatal("init condition does not satisfy")
			return
		}

		// Check if the new cluster has a zero score.
		// TODO: Check if the new cluster has no constraints.
		newScore, err := wsManagerBridgeClient.GetScore(ctx, conf.newCluster)
		if err != nil {
			log.WithError(err).Fatal("init condition does not satisfy")
			return
		}
		if newScore != 0 {
			log.WithError(fmt.Errorf("new cluster %q has score %v, expected 0", conf.newCluster, newScore)).Fatal("init condition does not satisfy")
			return
		}

		// Start the rollout process
		// NOTE(review): 30305 is passed to the analyzer as a port; its exact
		// role is defined in the analysis package — confirm there.
		prometheusAnalyzer, err := analysis.NewErrorRatioAnalyzer(ctx, kubeConfig, conf.prometheusService, conf.targetPositivePercentage, 30305)
		if err != nil {
			log.WithError(err).Fatal("failed to create a prometheus client")
			return
		}

		job := rollout.New(conf.oldCluster, conf.newCluster, conf.rollOutWaitDuration, conf.analsysWaitDuration, conf.rolloutStepScore, conf.okayScoreUntilNoData, prometheusAnalyzer, wsManagerBridgeClient)
		job.Start(ctx)
	},
}
// getKubeConfig returns a Kubernetes REST client configuration.
//
// It first tries rest.InClusterConfig. If that fails, it builds the
// configuration from the default kubeconfig file reported by the client-go
// default loading rules. The error of that second attempt is returned when
// both fail.
func getKubeConfig() (*rest.Config, error) {
	if inCluster, err := rest.InClusterConfig(); err == nil {
		return inCluster, nil
	}

	// In-cluster lookup failed: fall back to the kubeconfig file.
	kubeConfigPath := clientcmd.NewDefaultClientConfigLoadingRules().GetDefaultFilename()
	fromFile, err := clientcmd.BuildConfigFromFlags("", kubeConfigPath)
	if err != nil {
		return nil, err
	}
	return fromFile, nil
}
// Execute registers the command-line flags on rootCmd, marks the mandatory
// ones as required and runs the command.
// This is called by main.main(). It only needs to happen once to the rootCmd.
//
// If a flag cannot be marked as required, or the command returns an error,
// the error is printed to stderr and the process exits with status 1.
func Execute() {
	flags := rootCmd.Flags()
	flags.StringVar(&conf.oldCluster, "old-cluster", "", "Name of the old cluster with score 100")
	flags.StringVar(&conf.newCluster, "new-cluster", "", "Name of the new cluster with score 0")
	flags.StringVar(&conf.prometheusService, "prometheus-resource", "", "Please set in the format <namespace>/<kind>/<name>")
	flags.DurationVar(&conf.rollOutWaitDuration, "rollout-wait-duration", 20*time.Second, "Duration to wait before updating the score of the new cluster")
	flags.DurationVar(&conf.analsysWaitDuration, "analysis-wait-duration", 1*time.Second, "Duration to wait before analyzing the metrics")
	flags.Int32Var(&conf.rolloutStepScore, "rollout-step-score", 10, "Score to be added to the new cluster, and decreased to the old cluster")
	flags.Int32Var(&conf.okayScoreUntilNoData, "okay-score-until-no-data", 60, "If the score is below this value, and there is no data, the rollout score will be considered okay")
	flags.IntVar(&conf.targetPositivePercentage, "target-positive-percentage", 95, "Target percentage of positive metrics")

	// MarkFlagRequired returns an error; surface it instead of silently
	// running with a flag that is not actually enforced.
	for _, name := range []string{"old-cluster", "new-cluster", "prometheus-resource"} {
		if err := rootCmd.MarkFlagRequired(name); err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
	}

	if err := rootCmd.Execute(); err != nil {
		// Errors belong on stderr so they do not mix with regular output.
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}