Wouter Verlaek 25220bb30b
[ws-manager-mk2] Maintenance mode (#16702)
* [ws-manager-mk2] Maintenance mode reconciler

* [ws-manager-mk2] Check for maintenance mode

* [ws-manager-mk2] Default to maintenance mode on startup

* [ws-manager-mk2] Disable maintenance on unmarshal failure
2023-03-09 17:25:45 +01:00

286 lines
9.8 KiB
Go

// Copyright (c) 2022 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License-AGPL.txt in the project root for license information.
package main
import (
"bytes"
"encoding/json"
"flag"
"fmt"
"net"
"os"
// The blank import of k8s.io/client-go/plugin/pkg/client/auth below loads all
// Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) to ensure that
// exec-entrypoint and run can make use of them.
"google.golang.org/grpc"
"google.golang.org/grpc/credentials"
"google.golang.org/grpc/credentials/insecure"
_ "k8s.io/client-go/plugin/pkg/client/auth"
"k8s.io/client-go/rest"
grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
"github.com/prometheus/client_golang/prometheus"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/cache"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
"sigs.k8s.io/controller-runtime/pkg/metrics"
common_grpc "github.com/gitpod-io/gitpod/common-go/grpc"
"github.com/gitpod-io/gitpod/common-go/log"
"github.com/gitpod-io/gitpod/common-go/pprof"
"github.com/gitpod-io/gitpod/common-go/tracing"
imgbldr "github.com/gitpod-io/gitpod/image-builder/api"
regapi "github.com/gitpod-io/gitpod/registry-facade/api"
wsmanapi "github.com/gitpod-io/gitpod/ws-manager/api"
config "github.com/gitpod-io/gitpod/ws-manager/api/config"
workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
"github.com/gitpod-io/gitpod/ws-manager-mk2/controllers"
"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/activity"
"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/maintenance"
imgproxy "github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/proxy"
"github.com/gitpod-io/gitpod/ws-manager-mk2/service"
//+kubebuilder:scaffold:imports
)
var (
// scheme is the runtime scheme that init populates and that main passes to
// the controller manager.
scheme = runtime.NewScheme()
// setupLog is the named logger used by main while wiring up and starting
// the manager.
setupLog = ctrl.Log.WithName("setup")
)
// init adds the client-go built-in types and the workspacev1 CRD types to the
// package-level scheme. Each registration is wrapped in utilruntime.Must, so a
// registration error aborts the process at start-up.
//
// The statement shape is kept as-is because the kubebuilder scaffold marker
// below is the insertion point for further AddToScheme calls.
func init() {
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
utilruntime.Must(workspacev1.AddToScheme(scheme))
//+kubebuilder:scaffold:scheme
}
// main wires up and runs the ws-manager-mk2 controller manager.
//
// It parses the command line flags, initialises logging and tracing, loads the
// service configuration, creates the controller-runtime manager, constructs the
// maintenance, workspace and timeout reconcilers, starts the gRPC service,
// registers the health/readiness checks and finally blocks in mgr.Start until
// the signal handler context ends or the manager fails.
//
// Any setup failure is logged through setupLog and terminates the process with
// exit code 1.
func main() {
	var (
		enableLeaderElection bool
		configFN             string
	)
	flag.BoolVar(&enableLeaderElection, "leader-elect", false,
		"Enable leader election for controller manager. "+
			"Enabling this will ensure there is only one active controller manager.")
	flag.StringVar(&configFN, "config", "", "Path to the config file")
	opts := zap.Options{
		Development: true,
	}
	opts.BindFlags(flag.CommandLine)
	flag.Parse()

	ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))

	promrep := &tracing.PromReporter{
		Operations: map[string]tracing.SpanMetricMapping{
			"StartWorkspace": {
				Name:    "wsman_start_workspace",
				Help:    "time it takes to service a StartWorkspace request",
				Buckets: prometheus.LinearBuckets(0, 500, 10), // 10 buckets, each 500ms wide
			},
		},
	}
	closer := tracing.Init("ws-manager-mk2", tracing.WithPrometheusReporter(promrep))
	if closer != nil {
		// Covers the regular return path (mgr.Start returning without error).
		defer closer.Close()
	}

	// fail logs the error and terminates the process with exit code 1.
	// os.Exit does not run deferred functions, so the tracer is closed
	// explicitly here; otherwise it would never be closed on a failure path.
	fail := func(err error, msg string, keysAndValues ...interface{}) {
		setupLog.Error(err, msg, keysAndValues...)
		if closer != nil {
			// Best effort: a close error is ignored, just like in the deferred call.
			closer.Close()
		}
		os.Exit(1)
	}

	cfg, err := getConfig(configFN)
	if err != nil {
		fail(err, "unable to read config")
	}

	if cfg.PProf.Addr != "" {
		go pprof.Serve(cfg.PProf.Addr)
	}

	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
		Scheme:                 scheme,
		MetricsBindAddress:     cfg.Prometheus.Addr,
		Port:                   9443,
		HealthProbeBindAddress: cfg.Health.Addr,
		LeaderElection:         enableLeaderElection,
		LeaderElectionID:       "ws-manager-mk2-leader.gitpod.io",
		Namespace:              cfg.Manager.Namespace,
		NewCache: func(conf *rest.Config, opts cache.Options) (cache.Cache, error) {
			// Only watch the maintenance mode ConfigMap: ConfigMaps are
			// restricted to those labelled controllers.LabelMaintenance=true.
			opts.SelectorsByObject = cache.SelectorsByObject{
				&corev1.ConfigMap{}: cache.ObjectSelector{
					Label: labels.SelectorFromSet(labels.Set{controllers.LabelMaintenance: "true"}),
				},
			}
			return cache.New(conf, opts)
		},
	})
	if err != nil {
		fail(err, "unable to start manager")
	}

	// The locals below deliberately avoid the names "maintenance" and
	// "activity" so that they do not shadow the imported packages.
	maintenanceReconciler, err := controllers.NewMaintenanceReconciler(mgr.GetClient())
	if err != nil {
		fail(err, "unable to create maintenance controller", "controller", "Maintenance")
	}

	reconciler, err := controllers.NewWorkspaceReconciler(mgr.GetClient(), mgr.GetScheme(), &cfg.Manager, metrics.Registry, maintenanceReconciler)
	if err != nil {
		fail(err, "unable to create controller", "controller", "Workspace")
	}

	wsActivity := &activity.WorkspaceActivity{}
	timeoutReconciler, err := controllers.NewTimeoutReconciler(mgr.GetClient(), cfg.Manager, wsActivity)
	if err != nil {
		fail(err, "unable to create timeout controller", "controller", "Timeout")
	}

	wsmanService, err := setupGRPCService(cfg, mgr.GetClient(), wsActivity, maintenanceReconciler)
	if err != nil {
		fail(err, "unable to start manager service")
	}

	// The gRPC service must be hooked into the workspace reconciler before the
	// reconciler is registered with the manager.
	reconciler.OnReconcile = wsmanService.OnWorkspaceReconcile
	if err = reconciler.SetupWithManager(mgr); err != nil {
		fail(err, "unable to setup workspace controller with manager", "controller", "Workspace")
	}
	if err = timeoutReconciler.SetupWithManager(mgr); err != nil {
		fail(err, "unable to setup timeout controller with manager", "controller", "Timeout")
	}
	if err = maintenanceReconciler.SetupWithManager(mgr); err != nil {
		fail(err, "unable to setup maintenance controller with manager", "controller", "Maintenance")
	}

	// if err = (&workspacev1.Workspace{}).SetupWebhookWithManager(mgr); err != nil {
	// 	setupLog.Error(err, "unable to create webhook", "webhook", "Workspace")
	// 	os.Exit(1)
	// }
	//+kubebuilder:scaffold:builder

	if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
		fail(err, "unable to set up health check")
	}
	if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
		fail(err, "unable to set up ready check")
	}

	setupLog.Info("starting manager")
	if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
		fail(err, "problem running manager")
	}
}
// setupGRPCService builds the ws-manager gRPC server and starts serving it on
// cfg.RPCServer.Addr in a background goroutine.
//
// The server is set up with Prometheus metrics and rate-limiting interceptors,
// and with client-auth TLS when CA, certificate and private key are all
// configured (otherwise it is served unsecured and a warning is logged). When
// cfg.ImageBuilderProxy.TargetAddr is set, an image-builder proxy service is
// registered as well. The workspace manager service and the registry-facade
// spec provider are always registered.
//
// It returns the workspace manager server so that the caller can hook it into
// the reconcilers. A non-nil error is returned when TLS material cannot be
// loaded, the image builder cannot be dialed or the listener cannot be opened;
// these conditions are reported to the caller instead of terminating the
// process from within this helper.
func setupGRPCService(cfg *config.ServiceConfiguration, k8s client.Client, activity *activity.WorkspaceActivity, maintenance maintenance.Maintenance) (*service.WorkspaceManagerServer, error) {
	// TODO(cw): remove use of common-go/log

	if len(cfg.RPCServer.RateLimits) > 0 {
		log.WithField("ratelimits", cfg.RPCServer.RateLimits).Info("imposing rate limits on the gRPC interface")
	}
	ratelimits := common_grpc.NewRatelimitingInterceptor(cfg.RPCServer.RateLimits)

	grpcMetrics := grpc_prometheus.NewServerMetrics()
	grpcMetrics.EnableHandlingTimeHistogram()
	metrics.Registry.MustRegister(grpcMetrics)

	grpcOpts := common_grpc.ServerOptionsWithInterceptors(
		[]grpc.StreamServerInterceptor{grpcMetrics.StreamServerInterceptor()},
		[]grpc.UnaryServerInterceptor{grpcMetrics.UnaryServerInterceptor(), ratelimits.UnaryInterceptor()},
	)
	if cfg.RPCServer.TLS.CA != "" && cfg.RPCServer.TLS.Certificate != "" && cfg.RPCServer.TLS.PrivateKey != "" {
		tlsConfig, err := common_grpc.ClientAuthTLSConfig(
			cfg.RPCServer.TLS.CA, cfg.RPCServer.TLS.Certificate, cfg.RPCServer.TLS.PrivateKey,
			common_grpc.WithSetClientCAs(true),
			common_grpc.WithServerName("ws-manager"),
		)
		if err != nil {
			return nil, fmt.Errorf("cannot load ws-manager certs: %w", err)
		}

		grpcOpts = append(grpcOpts, grpc.Creds(credentials.NewTLS(tlsConfig)))
	} else {
		log.Warn("no TLS configured - gRPC server will be unsecured")
	}

	grpcServer := grpc.NewServer(grpcOpts...)

	// imgBuilderConn is remembered so it can be released if a later setup step fails.
	var imgBuilderConn *grpc.ClientConn
	if cfg.ImageBuilderProxy.TargetAddr != "" {
		creds := insecure.NewCredentials()
		if cfg.ImageBuilderProxy.TLS.CA != "" && cfg.ImageBuilderProxy.TLS.Certificate != "" && cfg.ImageBuilderProxy.TLS.PrivateKey != "" {
			tlsConfig, err := common_grpc.ClientAuthTLSConfig(
				cfg.ImageBuilderProxy.TLS.CA, cfg.ImageBuilderProxy.TLS.Certificate, cfg.ImageBuilderProxy.TLS.PrivateKey,
				common_grpc.WithSetRootCAs(true),
				common_grpc.WithServerName("image-builder-mk3"),
			)
			if err != nil {
				return nil, fmt.Errorf("cannot load image-builder-mk3 TLS certs: %w", err)
			}
			log.Info("Loaded TLS for image builder")
			creds = credentials.NewTLS(tlsConfig)
		}
		// Note: never use block here, because image-builder connects to ws-manager,
		//       and if we blocked here, ws-manager wouldn't come up, hence we couldn't connect to ws-manager.
		conn, err := grpc.Dial(cfg.ImageBuilderProxy.TargetAddr, grpc.WithTransportCredentials(creds))
		if err != nil {
			return nil, fmt.Errorf("failed to connect to image builder: %w", err)
		}
		imgBuilderConn = conn
		imgbldr.RegisterImageBuilderServer(grpcServer, imgproxy.ImageBuilder{D: imgbldr.NewImageBuilderClient(conn)})
	}

	srv := service.NewWorkspaceManagerServer(k8s, &cfg.Manager, metrics.Registry, activity, maintenance)

	wsmanapi.RegisterWorkspaceManagerServer(grpcServer, srv)
	regapi.RegisterSpecProviderServer(grpcServer, &service.WorkspaceImageSpecProvider{
		Client:    k8s,
		Namespace: cfg.Manager.Namespace,
	})

	// Pre-initialise the metrics of the instance that the interceptors use and
	// that is registered on metrics.Registry. This has to happen after all
	// services are registered, otherwise their methods are not covered.
	// (grpc_prometheus.Register would act on the package's default metrics,
	// which this server does not use.)
	grpcMetrics.InitializeMetrics(grpcServer)

	lis, err := net.Listen("tcp", cfg.RPCServer.Addr)
	if err != nil {
		if imgBuilderConn != nil {
			// Best effort clean-up; the listen error is the one worth reporting.
			_ = imgBuilderConn.Close()
		}
		return nil, fmt.Errorf("cannot start RPC server on %s: %w", cfg.RPCServer.Addr, err)
	}
	go func() {
		err := grpcServer.Serve(lis)
		if err != nil {
			log.WithError(err).Error("gRPC service failed")
		}
	}()
	log.WithField("addr", cfg.RPCServer.Addr).Info("started gRPC server")

	return srv, nil
}
// getConfig loads the service configuration from the JSON file at path fn.
//
// The file is read completely and decoded strictly: a field in the file that
// is unknown to config.ServiceConfiguration makes decoding fail. Read and
// decode failures are returned as wrapped errors.
func getConfig(fn string) (*config.ServiceConfiguration, error) {
	raw, readErr := os.ReadFile(fn)
	if readErr != nil {
		return nil, fmt.Errorf("cannot read configuration. Maybe missing --config?: %w", readErr)
	}

	decoder := json.NewDecoder(bytes.NewReader(raw))
	decoder.DisallowUnknownFields()

	result := new(config.ServiceConfiguration)
	if decodeErr := decoder.Decode(result); decodeErr != nil {
		return nil, fmt.Errorf("cannot decode configuration from %s: %w", fn, decodeErr)
	}
	return result, nil
}