// Copyright (c) 2023 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License.AGPL.txt in the project root for license information.

package cmd

import (
	"context"
	"crypto/tls"
	"fmt"
	"net"
	"net/http"
	"strconv"
	"strings"
	"time"

	"github.com/bombsimon/logrusr/v2"
	workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
	"github.com/spf13/cobra"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/types"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
	_ "k8s.io/client-go/plugin/pkg/client/auth"
	"k8s.io/client-go/util/retry"
	"k8s.io/utils/ptr"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/builder"
	"sigs.k8s.io/controller-runtime/pkg/cache"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/event"
	"sigs.k8s.io/controller-runtime/pkg/healthz"
	"sigs.k8s.io/controller-runtime/pkg/manager"
	metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
	"sigs.k8s.io/controller-runtime/pkg/predicate"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"
	"sigs.k8s.io/controller-runtime/pkg/webhook"

	"github.com/gitpod-io/gitpod/common-go/log"
)

const (
	registryFacadeLabel = "gitpod.io/registry-facade_ready_ns_%v"
	wsdaemonLabel       = "gitpod.io/ws-daemon_ready_ns_%v"

	registryFacade = "registry-facade"
	wsDaemon       = "ws-daemon"

	// Taint keys for different components
	registryFacadeTaintKey = "gitpod.io/registry-facade-not-ready"
	wsDaemonTaintKey       = "gitpod.io/ws-daemon-not-ready"

	workspacesRegularLabel  = "gitpod.io/workload_workspace_regular"
	workspacesHeadlessLabel = "gitpod.io/workload_workspace_headless"
)

var defaultRequeueTime = time.Second * 10

// runCmd represents the run command
var runCmd = &cobra.Command{
	Use:   "run",
	Short: "Starts the node labeler",
	Run: func(cmd *cobra.Command, args []string) {
		ctrl.SetLogger(logrusr.New(log.Log))

		mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
			Scheme:                 scheme,
			HealthProbeBindAddress: ":8086",
			Metrics:                metricsserver.Options{BindAddress: "127.0.0.1:9500"},
			Cache: cache.Options{
				DefaultNamespaces: map[string]cache.Config{
					namespace: {},
				},
				// The default sync period is 10h. If node-labeler is restarted and no change
				// happens, we could waste (at least) 20m on a node that will never run
				// workspaces, plus the additional nodes cluster-autoscaler adds to compensate.
				SyncPeriod: ptr.To(2 * time.Minute),
			},
			WebhookServer: webhook.NewServer(webhook.Options{
				Port: 9443,
			}),
			LeaderElection:   true,
			LeaderElectionID: "node-labeler.gitpod.io",
		})
		if err != nil {
			log.WithError(err).Fatal("unable to start node-labeler")
		}

		r := &PodReconciler{
			mgr.GetClient(),
		}

		componentPredicate, err := predicate.LabelSelectorPredicate(metav1.LabelSelector{
			MatchExpressions: []metav1.LabelSelectorRequirement{{
				Key:      "component",
				Operator: metav1.LabelSelectorOpIn,
				Values:   []string{"ws-daemon", "registry-facade"},
			}},
		})
		if err != nil {
			log.WithError(err).Fatal("unable to create predicate")
		}

		err = ctrl.NewControllerManagedBy(mgr).
			Named("pod-watcher").
			For(&corev1.Pod{}, builder.WithPredicates(componentPredicate)).
			WithOptions(controller.Options{MaxConcurrentReconciles: 1}).
			Complete(r)
		if err != nil {
			log.WithError(err).Fatal("unable to bind controller watch event handler")
		}

		nr := &NodeReconciler{
			mgr.GetClient(),
		}

		err = ctrl.NewControllerManagedBy(mgr).
			Named("node-watcher").
			For(&corev1.Node{}, builder.WithPredicates(nr.nodeFilter())).
			WithOptions(controller.Options{MaxConcurrentReconciles: 1}).
			Complete(nr)
		if err != nil {
			log.WithError(err).Fatal("unable to bind controller watch event handler")
		}

		go func() {
			<-mgr.Elected()
			if err := nr.reconcileAll(context.Background()); err != nil {
				log.WithError(err).Fatal("failed to reconcile all nodes")
			}
		}()

		if err := mgr.GetFieldIndexer().IndexField(context.Background(), &workspacev1.Workspace{}, "status.runtime.nodeName", func(o client.Object) []string {
			ws := o.(*workspacev1.Workspace)
			if ws.Status.Runtime == nil {
				return nil
			}
			return []string{ws.Status.Runtime.NodeName}
		}); err != nil {
			log.WithError(err).Fatal("unable to create workspace indexer")
			return
		}

		if err := mgr.GetFieldIndexer().IndexField(context.Background(), &corev1.Pod{}, "spec.nodeName", func(o client.Object) []string {
			pod := o.(*corev1.Pod)
			if pod.Spec.NodeName == "" {
				return nil
			}
			return []string{pod.Spec.NodeName}
		}); err != nil {
			log.WithError(err).Fatal("unable to create pod indexer")
			return
		}

		nsac, err := NewNodeScaledownAnnotationController(mgr.GetClient())
		if err != nil {
			log.WithError(err).Fatal("unable to create node scaledown annotation controller")
		}
		err = nsac.SetupWithManager(mgr)
		if err != nil {
			log.WithError(err).Fatal("unable to bind node scaledown annotation controller")
		}

		err = mgr.Add(manager.RunnableFunc(func(ctx context.Context) error {
			<-ctx.Done()
			log.Info("Received shutdown signal - stopping NodeScaledownAnnotationController")
			nsac.Stop()
			return nil
		}))
		if err != nil {
			log.WithError(err).Fatal("couldn't properly clean up node scaledown annotation controller")
		}

		err = mgr.AddHealthzCheck("healthz", healthz.Ping)
		if err != nil {
			log.WithError(err).Fatal("unable to set up health check")
		}

		err = mgr.AddReadyzCheck("readyz", healthz.Ping)
		if err != nil {
			log.WithError(err).Fatal("unable to set up ready check")
		}

		log.Info("starting node-labeler")
		err = mgr.Start(ctrl.SetupSignalHandler())
		if err != nil {
			log.WithError(err).Fatal("problem running node-labeler")
		}

		log.Info("Received SIGINT - shutting down")
	},
}

func init() {
	utilruntime.Must(clientgoscheme.AddToScheme(scheme))
	utilruntime.Must(workspacev1.AddToScheme(scheme))

	rootCmd.AddCommand(runCmd)
}

var (
	scheme = runtime.NewScheme()
)

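// PodReconciler watches registry-facade and ws-daemon pods and keeps the
// corresponding node taints in sync with their health.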
type PodReconciler struct {
	client.Client
}

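// Reconcile maps a registry-facade or ws-daemon pod to the node it runs on and
// adds or removes the component's not-ready taint based on the pod's health.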
func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
	var pod corev1.Pod
	err := r.Get(ctx, req.NamespacedName, &pod)
	if err != nil {
		if !errors.IsNotFound(err) {
			log.WithError(err).Error("unable to fetch pod")
		}

		return ctrl.Result{}, client.IgnoreNotFound(err)
	}

	nodeName := pod.Spec.NodeName
	if nodeName == "" {
		return reconcile.Result{RequeueAfter: defaultRequeueTime}, nil
	}

	var taintKey string
	switch {
	case strings.HasPrefix(pod.Name, registryFacade):
		taintKey = registryFacadeTaintKey
	case strings.HasPrefix(pod.Name, wsDaemon):
		taintKey = wsDaemonTaintKey
	default:
		// nothing to do
		return reconcile.Result{}, nil
	}

	healthy, err := checkPodHealth(pod)
	if err != nil {
		log.WithError(err).Error("cannot check pod health")
		return reconcile.Result{RequeueAfter: defaultRequeueTime}, nil
	}

	var node corev1.Node
	err = r.Get(ctx, types.NamespacedName{Name: nodeName}, &node)
	if err != nil {
		if !errors.IsNotFound(err) {
			log.WithError(err).Error("cannot get node")
		}
		return reconcile.Result{}, client.IgnoreNotFound(err)
	}

	// the taint should exist exactly when the pod is unhealthy.
	if isNodeTaintExists(taintKey, node) != healthy {
		// nothing to do, the taint is already in the desired state.
		return reconcile.Result{}, nil
	}

	err = updateNodeTaint(taintKey, !healthy, nodeName, r)
	if err != nil {
		log.WithError(err).
			WithField("taintKey", taintKey).
			WithField("add", !healthy).
			WithField("nodeName", nodeName).
			Error("cannot update node taint")
		return reconcile.Result{RequeueAfter: defaultRequeueTime}, nil
	}

	return reconcile.Result{}, nil
}

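// checkPodHealth reports whether a registry-facade or ws-daemon pod is ready
// to serve: the pod must not be terminating, must be Ready, and its service
// port must be reachable. registry-facade additionally gets a manifest probe.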
func checkPodHealth(pod corev1.Pod) (bool, error) {
	var (
		ipAddress string
		port      string
	)
	switch {
	case strings.HasPrefix(pod.Name, registryFacade):
		ipAddress = pod.Status.HostIP
		port = strconv.Itoa(registryFacadePort)
	case strings.HasPrefix(pod.Name, wsDaemon):
		ipAddress = pod.Status.PodIP
		port = strconv.Itoa(wsdaemonPort)
	default:
		// nothing to do
		return true, nil
	}

	if !pod.ObjectMeta.DeletionTimestamp.IsZero() {
		// the pod is being removed, so the taint has to be added to the node
		return false, nil
	}

	if !IsPodReady(pod) {
		// not ready. Wait until the next update.
		return false, nil
	}

	err := checkTCPPortIsReachable(ipAddress, port)
	if err != nil {
		log.WithField("host", ipAddress).WithField("port", port).WithField("pod", pod.Name).WithError(err).Error("checking if TCP port is open")
		return false, nil
	}

	if strings.HasPrefix(pod.Name, registryFacade) {
		err = checkRegistryFacade(ipAddress, port)
		if err != nil {
			log.WithError(err).Error("checking registry-facade")
			return false, nil
		}
	}

	return true, nil
}

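// NodeReconciler initializes readiness labels and taints on workspace nodes
// when they appear in the cluster.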
type NodeReconciler struct {
	client.Client
}

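// nodeFilter only lets node creation events for workspace nodes through;
// update and delete events are ignored.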
func (r *NodeReconciler) nodeFilter() predicate.Predicate {
	return predicate.Funcs{
		CreateFunc: func(e event.CreateEvent) bool {
			node, ok := e.Object.(*corev1.Node)
			if !ok {
				return false
			}
			return isWorkspaceNode(*node)
		},
		UpdateFunc: func(e event.UpdateEvent) bool {
			return false
		},
		DeleteFunc: func(e event.DeleteEvent) bool {
			return false
		},
	}
}

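// reconcileAll runs one reconciliation pass over every workspace node. It is
// invoked once this instance wins leader election.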
func (r *NodeReconciler) reconcileAll(ctx context.Context) error {
	log.Info("start reconciling all nodes")

	var nodes corev1.NodeList
	if err := r.List(ctx, &nodes); err != nil {
		return fmt.Errorf("failed to list nodes: %w", err)
	}

	for _, node := range nodes.Items {
		if node.Labels == nil {
			continue
		}
		if !isWorkspaceNode(node) {
			continue
		}

		err := updateNodeLabel(node.Name, r.Client)
		if err != nil {
			log.WithError(err).WithField("node", node.Name).Error("failed to initialize labels on node")
		}
		if _, err := r.Reconcile(ctx, reconcile.Request{NamespacedName: types.NamespacedName{Name: node.Name}}); err != nil {
			log.WithError(err).WithField("node", node.Name).Error("failed to reconcile node")
		}
	}

	log.Info("finished reconciling all nodes")
	return nil
}

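// Reconcile ensures a node's readiness labels are set and that the
// registry-facade and ws-daemon taints match the health of the pods currently
// scheduled on it.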
func (r *NodeReconciler) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
	var node corev1.Node
	err := r.Get(ctx, req.NamespacedName, &node)
	if err != nil {
		if !errors.IsNotFound(err) {
			log.WithError(err).Error("unable to fetch node")
		}
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}

	var podList corev1.PodList
	err = r.List(ctx, &podList, client.MatchingFields{
		"spec.nodeName": node.Name,
	})
	if err != nil {
		return reconcile.Result{}, fmt.Errorf("cannot list pods: %w", err)
	}

	err = updateNodeLabel(node.Name, r.Client)
	if err != nil {
		log.WithError(err).WithField("node", node.Name).Error("failed to initialize labels on node")
	}

	isWsdaemonTaintExists := isNodeTaintExists(wsDaemonTaintKey, node)
	isRegistryFacadeTaintExists := isNodeTaintExists(registryFacadeTaintKey, node)
	isWsDaemonReady, isRegistryFacadeReady := false, false
	for _, pod := range podList.Items {
		if strings.HasPrefix(pod.Name, wsDaemon) {
			isWsDaemonReady, err = checkPodHealth(pod)
			if err != nil {
				log.WithError(err).Error("checking pod health")
			}
		}
		if strings.HasPrefix(pod.Name, registryFacade) {
			isRegistryFacadeReady, err = checkPodHealth(pod)
			if err != nil {
				log.WithError(err).Error("checking pod health")
			}
		}
	}

	// a taint is out of sync when its presence matches the pod's readiness:
	// ready pods must not be tainted, unready pods must be.
	if isWsDaemonReady == isWsdaemonTaintExists {
		if err := updateNodeTaint(wsDaemonTaintKey, !isWsDaemonReady, node.Name, r); err != nil {
			log.WithError(err).WithField("node", node.Name).Error("cannot update ws-daemon taint")
		}
	}
	if isRegistryFacadeReady == isRegistryFacadeTaintExists {
		if err := updateNodeTaint(registryFacadeTaintKey, !isRegistryFacadeReady, node.Name, r); err != nil {
			log.WithError(err).WithField("node", node.Name).Error("cannot update registry-facade taint")
		}
	}
	return reconcile.Result{}, nil
}

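// NodeScaledownAnnotationController tracks the number of workspaces per node
// and toggles the cluster-autoscaler scale-down-disabled annotation so that
// nodes with running workspaces are not scaled down.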
type NodeScaledownAnnotationController struct {
	client.Client

	nodesToReconcile chan string
	stopChan         chan struct{}
}

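// NewNodeScaledownAnnotationController creates a controller with a buffered
// reconciliation queue and a stop channel for its background workers.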
func NewNodeScaledownAnnotationController(client client.Client) (*NodeScaledownAnnotationController, error) {
	controller := &NodeScaledownAnnotationController{
		Client:           client,
		nodesToReconcile: make(chan string, 1000),
		stopChan:         make(chan struct{}),
	}

	return controller, nil
}

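// SetupWithManager starts the queue worker and the periodic full
// reconciliation, and registers the controller for Workspace events.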
func (c *NodeScaledownAnnotationController) SetupWithManager(mgr ctrl.Manager) error {
	go c.reconciliationWorker()
	go c.periodicReconciliation()

	return ctrl.NewControllerManagedBy(mgr).
		Named("node-scaledown-annotation-controller").
		For(&workspacev1.Workspace{}).
		WithEventFilter(c.workspaceFilter()).
		Complete(c)
}

// periodicReconciliation periodically reconciles all nodes in the cluster
func (c *NodeScaledownAnnotationController) periodicReconciliation() {
	ticker := time.NewTicker(5 * time.Minute)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			log.Info("starting periodic full reconciliation")
			ctx := context.Background()
			if _, err := c.reconcileAllNodes(ctx); err != nil {
				log.WithError(err).Error("periodic reconciliation failed")
			}
		case <-c.stopChan:
			log.Info("stopping periodic full reconciliation")
			return
		}
	}
}

// reconciliationWorker consumes nodesToReconcile and reconciles each node
func (c *NodeScaledownAnnotationController) reconciliationWorker() {
	log.Info("reconciliation worker started")
	for {
		select {
		case nodeName := <-c.nodesToReconcile:
			ctx := context.Background()
			if err := c.reconcileNode(ctx, nodeName); err != nil {
				log.WithError(err).WithField("node", nodeName).Error("failed to reconcile node from queue")
			}
		case <-c.stopChan:
			log.Info("reconciliation worker stopping")
			return
		}
	}
}

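// workspaceFilter admits workspace events that can change a node's workspace
// count; when a workspace moves between nodes, the old node is queued as well.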
func (c *NodeScaledownAnnotationController) workspaceFilter() predicate.Predicate {
	return predicate.Funcs{
		CreateFunc: func(e event.CreateEvent) bool {
			ws := e.Object.(*workspacev1.Workspace)
			if ws.Status.Runtime == nil {
				log.WithField("workspace", ws.Name).Info("workspace not ready yet")
				return false
			}

			return ws.Status.Runtime.NodeName != ""
		},
		UpdateFunc: func(e event.UpdateEvent) bool {
			wsOld := e.ObjectOld.(*workspacev1.Workspace)
			ws := e.ObjectNew.(*workspacev1.Workspace)
			// if we haven't seen runtime info before and now it's there, let's reconcile.
			// similarly, if the node name changed, we need to reconcile the old node as well.
			if (wsOld.Status.Runtime == nil && ws.Status.Runtime != nil && ws.Status.Runtime.NodeName != "") || // we just got runtime info
				(wsOld.Status.Runtime != nil && ws.Status.Runtime != nil && wsOld.Status.Runtime.NodeName != ws.Status.Runtime.NodeName) { // node name changed
				if wsOld.Status.Runtime != nil && wsOld.Status.Runtime.NodeName != "" {
					c.queueNodeForReconciliation(wsOld.Status.Runtime.NodeName)
				}
				return true
			}

			return false
		},
		DeleteFunc: func(e event.DeleteEvent) bool {
			ws := e.Object.(*workspacev1.Workspace)
			if ws.Status.Runtime != nil && ws.Status.Runtime.NodeName != "" {
				c.queueNodeForReconciliation(ws.Status.Runtime.NodeName)
				return true
			}
			return false
		},
	}
}

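// queueNodeForReconciliation enqueues a node without blocking; if the queue is
// full the node is dropped and picked up by the next periodic reconciliation.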
func (c *NodeScaledownAnnotationController) queueNodeForReconciliation(nodeName string) {
	select {
	case c.nodesToReconcile <- nodeName:
		log.WithField("node", nodeName).Info("queued node for reconciliation")
	default:
		log.WithField("node", nodeName).Warn("reconciliation queue full")
	}
}

// Reconcile queues the workspace's node for a recount of its workspaces.
func (c *NodeScaledownAnnotationController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	log.WithField("request", req.NamespacedName.String()).Info("NodeScaledownAnnotationController reconciling")

	var ws workspacev1.Workspace
	if err := c.Get(ctx, req.NamespacedName, &ws); err != nil {
		if !errors.IsNotFound(err) {
			log.WithError(err).WithField("workspace", req.NamespacedName).Error("unable to fetch Workspace")
			return ctrl.Result{}, err
		}
		return ctrl.Result{}, nil
	}

	if ws.Status.Runtime != nil && ws.Status.Runtime.NodeName != "" {
		c.queueNodeForReconciliation(ws.Status.Runtime.NodeName)
		return ctrl.Result{}, nil
	}

	log.WithField("runtime", ws.Status.Runtime).Warn("reconciling object with no Runtime/NodeName, which wasn't filtered out by workspaceFilter")
	return ctrl.Result{}, nil
}

// Stop stops the controller's background workers. It should be called when
// the controller is shutting down.
func (c *NodeScaledownAnnotationController) Stop() {
	close(c.stopChan)
}

func (c *NodeScaledownAnnotationController) reconcileAllNodes(ctx context.Context) (ctrl.Result, error) {
	var nodes corev1.NodeList
	if err := c.List(ctx, &nodes); err != nil {
		log.WithError(err).Error("failed to list nodes")
		return ctrl.Result{}, err
	}

	for _, node := range nodes.Items {
		c.queueNodeForReconciliation(node.Name)
	}

	return ctrl.Result{}, nil
}

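// reconcileNode recounts the workspaces running on a node and updates its
// scale-down annotation accordingly.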
func (c *NodeScaledownAnnotationController) reconcileNode(ctx context.Context, nodeName string) error {
	var workspaceList workspacev1.WorkspaceList
	if err := c.List(ctx, &workspaceList, client.MatchingFields{
		"status.runtime.nodeName": nodeName,
	}); err != nil {
		return fmt.Errorf("failed to list workspaces: %w", err)
	}

	count := len(workspaceList.Items)
	log.WithField("node", nodeName).WithField("count", count).Info("acting on workspaces")

	return c.updateNodeAnnotation(ctx, nodeName, count)
}

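// updateNodeAnnotation sets or removes the
// cluster-autoscaler.kubernetes.io/scale-down-disabled annotation depending on
// whether any workspaces are left on the node, retrying on update conflicts.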
func (c *NodeScaledownAnnotationController) updateNodeAnnotation(ctx context.Context, nodeName string, count int) error {
	return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
		ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
		defer cancel()

		var node corev1.Node
		err := c.Get(ctx, types.NamespacedName{Name: nodeName}, &node)
		if err != nil {
			return fmt.Errorf("obtaining node %s: %w", nodeName, err)
		}

		shouldDisableScaleDown := count > 0
		currentlyDisabled := false
		if val, exists := node.Annotations["cluster-autoscaler.kubernetes.io/scale-down-disabled"]; exists {
			currentlyDisabled = val == "true"
		}

		// Only update if the state needs to change
		if shouldDisableScaleDown != currentlyDisabled {
			if node.Annotations == nil {
				node.Annotations = make(map[string]string)
			}

			if shouldDisableScaleDown {
				node.Annotations["cluster-autoscaler.kubernetes.io/scale-down-disabled"] = "true"
				log.WithField("nodeName", nodeName).Info("disabling scale-down for node")
			} else {
				delete(node.Annotations, "cluster-autoscaler.kubernetes.io/scale-down-disabled")
				log.WithField("nodeName", nodeName).Info("enabling scale-down for node")
			}

			return c.Update(ctx, &node)
		}

		return nil
	})
}

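// updateNodeTaint adds (add == true) or removes (add == false) the given
// NoSchedule taint on the node, retrying on update conflicts.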
func updateNodeTaint(taintKey string, add bool, nodeName string, client client.Client) error {
	return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()

		var node corev1.Node
		err := client.Get(ctx, types.NamespacedName{Name: nodeName}, &node)
		if err != nil {
			if !errors.IsNotFound(err) {
				return err
			}
			return nil
		}

		// Create or remove the taint
		if add {
			// Add the taint if it doesn't exist yet
			taintExists := false
			for _, taint := range node.Spec.Taints {
				if taint.Key == taintKey {
					taintExists = true
					break
				}
			}
			if !taintExists {
				node.Spec.Taints = append(node.Spec.Taints, corev1.Taint{
					Key:    taintKey,
					Value:  "true",
					Effect: corev1.TaintEffectNoSchedule,
				})
				log.WithField("taint", taintKey).WithField("node", nodeName).Info("adding taint to node")
			}
		} else {
			// Remove the taint if it exists
			newTaints := make([]corev1.Taint, 0, len(node.Spec.Taints))
			for _, taint := range node.Spec.Taints {
				if taint.Key != taintKey {
					newTaints = append(newTaints, taint)
				}
			}
			if len(newTaints) != len(node.Spec.Taints) {
				node.Spec.Taints = newTaints
				log.WithField("taint", taintKey).WithField("node", nodeName).Info("removing taint from node")
			}
		}

		err = client.Update(ctx, &node)
		if err != nil {
			return err
		}

		return nil
	})
}

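// isNodeTaintExists reports whether the node carries a taint with the given key.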
func isNodeTaintExists(taintKey string, node corev1.Node) bool {
	for _, taint := range node.Spec.Taints {
		if taint.Key == taintKey {
			return true
		}
	}
	return false
}

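// checkTCPPortIsReachable dials host:port with a one-second timeout.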
func checkTCPPortIsReachable(host string, port string) error {
	conn, err := net.DialTimeout("tcp", net.JoinHostPort(host, port), 1*time.Second)
	if err != nil {
		return err
	}
	defer conn.Close()

	return nil
}

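// checkRegistryFacade requests the manifest of a dummy image from
// registry-facade; a 404 means the facade is up and answering requests.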
func checkRegistryFacade(host, port string) error {
	transport := newDefaultTransport()
	transport.TLSClientConfig = &tls.Config{
		InsecureSkipVerify: true,
	}

	client := &http.Client{
		Transport: transport,
	}

	ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
	defer cancel()

	dummyURL := fmt.Sprintf("https://%v:%v/v2/remote/not-a-valid-image/manifests/latest", host, port)
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, dummyURL, nil)
	if err != nil {
		return fmt.Errorf("building HTTP request: %v", err)
	}

	req.Header.Set("Accept", "application/vnd.oci.image.manifest.v1+json, application/vnd.oci.image.index.v1+json")
	resp, err := client.Do(req)
	if err != nil {
		return fmt.Errorf("unexpected error during HTTP request: %v", err)
	}
	resp.Body.Close()

	if resp.StatusCode == http.StatusNotFound {
		return nil
	}

	return fmt.Errorf("registry-facade is not ready yet")
}

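// newDefaultTransport returns a short-lived, keep-alive-free HTTP transport
// suited for one-off health probes.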
func newDefaultTransport() *http.Transport {
	return &http.Transport{
		DialContext: (&net.Dialer{
			Timeout:   1 * time.Second,
			DualStack: false,
		}).DialContext,
		MaxIdleConns:          0,
		MaxIdleConnsPerHost:   1,
		IdleConnTimeout:       5 * time.Second,
		ExpectContinueTimeout: 5 * time.Second,
		DisableKeepAlives:     true,
	}
}

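// isWorkspaceNode reports whether the node is labeled to run regular or
// headless workspaces.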
func isWorkspaceNode(node corev1.Node) bool {
	_, isRegularWorkspaceNode := node.Labels[workspacesRegularLabel]
	_, isHeadlessWorkspaceNode := node.Labels[workspacesHeadlessLabel]
	return isRegularWorkspaceNode || isHeadlessWorkspaceNode
}

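// updateNodeLabel marks the node's per-namespace registry-facade and ws-daemon
// readiness labels as "true", retrying on update conflicts.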
func updateNodeLabel(nodeName string, client client.Client) error {
	return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()

		var node corev1.Node
		err := client.Get(ctx, types.NamespacedName{Name: nodeName}, &node)
		if err != nil {
			return err
		}

		registryFacadeLabelForNamespace := fmt.Sprintf(registryFacadeLabel, namespace)
		wsDaemonLabelForNamespace := fmt.Sprintf(wsdaemonLabel, namespace)

		needUpdate := false

		if node.Labels == nil {
			node.Labels = make(map[string]string)
		}

		if v := node.Labels[registryFacadeLabelForNamespace]; v != "true" {
			needUpdate = true
		}
		if v := node.Labels[wsDaemonLabelForNamespace]; v != "true" {
			needUpdate = true
		}

		if !needUpdate {
			return nil
		}

		node.Labels[registryFacadeLabelForNamespace] = "true"
		node.Labels[wsDaemonLabelForNamespace] = "true"

		err = client.Update(ctx, &node)
		if err != nil {
			return err
		}

		return nil
	})
}