gitpod/components/ws-manager-mk2/controllers/workspace_controller.go

// Copyright (c) 2022 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License-AGPL.txt in the project root for license information.
package controllers

import (
    "context"
    "fmt"
    "time"

    corev1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/errors"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/runtime"
    ctrl "sigs.k8s.io/controller-runtime"
    "sigs.k8s.io/controller-runtime/pkg/client"
    "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
    "sigs.k8s.io/controller-runtime/pkg/log"

    wsk8s "github.com/gitpod-io/gitpod/common-go/kubernetes"
    config "github.com/gitpod-io/gitpod/ws-manager/api/config"
    workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"

    "github.com/prometheus/client_golang/prometheus"
)

const (
    metricsNamespace          = "gitpod"
    metricsWorkspaceSubsystem = "ws_manager_mk2"

    // kubernetesOperationTimeout is the time we give Kubernetes operations in general.
    kubernetesOperationTimeout = 5 * time.Second
)
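
// NewWorkspaceReconciler creates a WorkspaceReconciler and registers its
// controller metrics with the given Prometheus registerer.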
func NewWorkspaceReconciler(c client.Client, scheme *runtime.Scheme, cfg *config.Configuration, reg prometheus.Registerer) (*WorkspaceReconciler, error) {
    reconciler := &WorkspaceReconciler{
        Client: c,
        Scheme: scheme,
        Config: cfg,
    }

    metrics, err := newControllerMetrics(reconciler)
    if err != nil {
        return nil, err
    }
    reg.MustRegister(metrics)
    reconciler.metrics = metrics

    return reconciler, nil
}

// WorkspaceReconciler reconciles a Workspace object
type WorkspaceReconciler struct {
    client.Client
    Scheme *runtime.Scheme
    Config *config.Configuration

    metrics *controllerMetrics
    // OnReconcile, if set, is called at the end of every successful
    // reconciliation with the freshly updated workspace.
    OnReconcile func(ctx context.Context, ws *workspacev1.Workspace)
}

//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=workspaces,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=workspaces/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=workspaces/finalizers,verbs=update
//+kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=pods/status,verbs=get
// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state. It
// compares the state specified by the Workspace object against the actual
// cluster state, and then performs operations to make the cluster state
// reflect the state specified by the user.
//
// For more details, check Reconcile and its Result here:
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.11.0/pkg/reconcile
func (r *WorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
    log := log.FromContext(ctx)

    var workspace workspacev1.Workspace
    if err := r.Get(ctx, req.NamespacedName, &workspace); err != nil {
        if !errors.IsNotFound(err) {
            log.Error(err, "unable to fetch workspace")
        }
        // we'll ignore not-found errors, since they can't be fixed by an immediate
        // requeue (we'll need to wait for a new notification), and we can get them
        // on deleted requests.
        return ctrl.Result{}, client.IgnoreNotFound(err)
    }

    if workspace.Status.Conditions == nil {
        workspace.Status.Conditions = []metav1.Condition{}
    }

    log.Info("reconciling workspace", "ws", req.NamespacedName)

    var workspacePods corev1.PodList
    err := r.List(ctx, &workspacePods, client.InNamespace(req.Namespace), client.MatchingFields{wsOwnerKey: req.Name})
    if err != nil {
        log.Error(err, "unable to list workspace pods")
        return ctrl.Result{}, err
    }

    err = updateWorkspaceStatus(ctx, &workspace, workspacePods, r.Config)
    if err != nil {
        return ctrl.Result{}, err
    }

    r.updateMetrics(ctx, &workspace)

    result, err := r.actOnStatus(ctx, &workspace, workspacePods)
    if err != nil {
        return result, err
    }

    err = r.Status().Update(ctx, &workspace)
    if err != nil {
        log.Error(err, "unable to update workspace status")
        return ctrl.Result{Requeue: true}, err
    }

    if r.OnReconcile != nil {
        r.OnReconcile(ctx, &workspace)
    }

    return ctrl.Result{}, nil
}
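
// actOnStatus performs the actions required to converge on the workspace's
// current status: it creates the workspace pod if none exists yet, deletes
// the pod when the workspace failed, timed out, was stopped by request, or
// failed content initialization, and removes finalizers once the workspace
// has fully stopped.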
func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *workspacev1.Workspace, workspacePods corev1.PodList) (ctrl.Result, error) {
    log := log.FromContext(ctx)

    if len(workspacePods.Items) == 0 {
        // if there isn't a workspace pod and we're not currently deleting this workspace,
        // create one.
        switch {
        case workspace.Status.PodStarts == 0:
            sctx, err := newStartWorkspaceContext(ctx, r.Config, workspace)
            if err != nil {
                log.Error(err, "unable to create startWorkspace context")
                return ctrl.Result{Requeue: true}, err
            }

            pod, err := r.createWorkspacePod(sctx)
            if err != nil {
                log.Error(err, "unable to produce workspace pod")
                return ctrl.Result{}, err
            }

            if err := ctrl.SetControllerReference(workspace, pod, r.Scheme); err != nil {
                return ctrl.Result{}, err
            }

            err = r.Create(ctx, pod)
            if errors.IsAlreadyExists(err) {
                // pod exists, we're good
            } else if err != nil {
                log.Error(err, "unable to create Pod for Workspace", "pod", pod)
                return ctrl.Result{Requeue: true}, err
            } else {
                // TODO(cw): replicate the startup mechanism where pods can fail to be scheduled,
                //           need to be deleted and re-created
                workspace.Status.PodStarts++
            }
            r.metrics.rememberWorkspace(workspace)

        case workspace.Status.Phase == workspacev1.WorkspacePhaseStopped:
            // Done stopping workspace - remove finalizer.
            if controllerutil.ContainsFinalizer(workspace, workspacev1.GitpodFinalizerName) {
                controllerutil.RemoveFinalizer(workspace, workspacev1.GitpodFinalizerName)
                if err := r.Update(ctx, workspace); err != nil {
                    return ctrl.Result{}, client.IgnoreNotFound(err)
                }
            }

            // Workspace might have already been in a deleting state,
            // but not guaranteed, so try deleting anyway.
            err := r.Client.Delete(ctx, workspace)
            return ctrl.Result{}, client.IgnoreNotFound(err)
        }

        return ctrl.Result{}, nil
    }

    // all actions below assume there is a pod
    pod := &workspacePods.Items[0]

    switch {
    // if there is a pod, and it's failed, delete it
    case wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)) && !isPodBeingDeleted(pod):
        err := r.Client.Delete(ctx, pod)
        if errors.IsNotFound(err) {
            // pod is gone - nothing to do here
        } else {
            return ctrl.Result{Requeue: true}, err
        }

    // if the pod was stopped by request, delete it with the requested grace period
    case wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionStoppedByRequest)) && !isPodBeingDeleted(pod):
        // the condition's message carries the requested grace period as a Go duration string
        var gracePeriodSeconds *int64
        if c := wsk8s.GetCondition(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionStoppedByRequest)); c != nil {
            if dt, err := time.ParseDuration(c.Message); err == nil {
                s := int64(dt.Seconds())
                gracePeriodSeconds = &s
            }
        }
        err := r.Client.Delete(ctx, pod, &client.DeleteOptions{
            GracePeriodSeconds: gracePeriodSeconds,
        })
        if errors.IsNotFound(err) {
            // pod is gone - nothing to do here
        } else {
            return ctrl.Result{Requeue: true}, err
        }

    // if the workspace timed out, delete its pod
    case wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionTimeout)) && !isPodBeingDeleted(pod):
        err := r.Client.Delete(ctx, pod)
        if errors.IsNotFound(err) {
            // pod is gone - nothing to do here
        } else {
            return ctrl.Result{Requeue: true}, err
        }

    // if the content initialization failed, delete the pod
    case wsk8s.ConditionWithStatusAndReason(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady), false, "InitializationFailure") && !isPodBeingDeleted(pod):
        err := r.Client.Delete(ctx, pod)
        if errors.IsNotFound(err) {
            // pod is gone - nothing to do here
        } else {
            return ctrl.Result{Requeue: true}, err
        }

    case isWorkspaceBeingDeleted(workspace) && !isPodBeingDeleted(pod):
        // Workspace was requested to be deleted, propagate by deleting the Pod.
        // The Pod deletion will then trigger workspace disposal steps.
        err := r.Client.Delete(ctx, pod)
        if errors.IsNotFound(err) {
            // pod is gone - nothing to do here
        } else {
            return ctrl.Result{Requeue: true}, err
        }

    // we've disposed already - try to remove the finalizer and call it a day
    case workspace.Status.Phase == workspacev1.WorkspacePhaseStopped:
        hadFinalizer := controllerutil.ContainsFinalizer(pod, workspacev1.GitpodFinalizerName)
        controllerutil.RemoveFinalizer(pod, workspacev1.GitpodFinalizerName)
        if err := r.Client.Update(ctx, pod); err != nil {
            return ctrl.Result{}, fmt.Errorf("failed to remove gitpod finalizer: %w", err)
        }
        if hadFinalizer {
            // Requeue to remove the workspace resource once the pod is gone.
            return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
        }
    }

    return ctrl.Result{}, nil
}
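
// updateMetrics records metrics for the workspace's current phase and
// conditions, such as startup time, restore and backup counts, and failure
// counters. Stopped workspaces are removed from the metrics cache.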
func (r *WorkspaceReconciler) updateMetrics(ctx context.Context, workspace *workspacev1.Workspace) {
    log := log.FromContext(ctx)

    phase := workspace.Status.Phase
    if !r.metrics.shouldUpdate(&log, workspace) {
        return
    }

    switch {
    case phase == workspacev1.WorkspacePhasePending ||
        phase == workspacev1.WorkspacePhaseCreating ||
        phase == workspacev1.WorkspacePhaseInitializing:

        if wsk8s.ConditionWithStatusAndReason(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady), false, "InitializationFailure") {
            r.metrics.countTotalRestoreFailures(&log, workspace)
            r.metrics.countWorkspaceStartFailures(&log, workspace)
        }

        if wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)) {
            r.metrics.countWorkspaceStartFailures(&log, workspace)
        }

    case phase == workspacev1.WorkspacePhaseRunning:
        r.metrics.recordWorkspaceStartupTime(&log, workspace)
        if wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady)) {
            r.metrics.countTotalRestores(&log, workspace)
        }

    case phase == workspacev1.WorkspacePhaseStopped:
        if wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionBackupFailure)) {
            r.metrics.countTotalBackups(&log, workspace)
            r.metrics.countTotalBackupFailures(&log, workspace)
        }

        if wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionBackupComplete)) {
            r.metrics.countTotalBackups(&log, workspace)
        }

        r.metrics.countWorkspaceStop(&log, workspace)
        r.metrics.forgetWorkspace(workspace)
        return
    }

    r.metrics.rememberWorkspace(workspace)
}

var (
    // wsOwnerKey is the field index under which pods are indexed by the
    // name of their owning Workspace.
    wsOwnerKey = ".metadata.controller"
    apiGVStr   = workspacev1.GroupVersion.String()
)

// SetupWithManager sets up the controller with the Manager.
func (r *WorkspaceReconciler) SetupWithManager(mgr ctrl.Manager) error {
    idx := func(rawObj client.Object) []string {
        // grab the pod object, extract the owner...
        pod := rawObj.(*corev1.Pod)
        owner := metav1.GetControllerOf(pod)
        if owner == nil {
            return nil
        }
        // ...make sure it's a workspace...
        if owner.APIVersion != apiGVStr || owner.Kind != "Workspace" {
            return nil
        }
        // ...and if so, return it
        return []string{owner.Name}
    }
    err := mgr.GetFieldIndexer().IndexField(context.Background(), &corev1.Pod{}, wsOwnerKey, idx)
    if err != nil {
        return err
    }

    return ctrl.NewControllerManagedBy(mgr).
        For(&workspacev1.Workspace{}).
        Owns(&corev1.Pod{}).
        Complete(r)
}