// Copyright (c) 2022 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License-AGPL.txt in the project root for license information.

package controllers

import (
	"context"
	"fmt"
	"time"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/util/retry"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/log"

	wsk8s "github.com/gitpod-io/gitpod/common-go/kubernetes"
	"github.com/gitpod-io/gitpod/common-go/util"
	wsactivity "github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/activity"
	config "github.com/gitpod-io/gitpod/ws-manager/api/config"
	workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
)

func NewTimeoutReconciler(c client.Client, cfg config.Configuration, activity *wsactivity.WorkspaceActivity) (*TimeoutReconciler, error) {
	if cfg.HeartbeatInterval == 0 {
		return nil, fmt.Errorf("invalid heartbeat interval, must not be 0")
	}
	reconcileInterval := time.Duration(cfg.HeartbeatInterval)
	// The reconcile interval is half the heartbeat interval, to catch timed-out workspaces in time.
	// See https://en.wikipedia.org/wiki/Nyquist%E2%80%93Shannon_sampling_theorem for why this is needed.
	reconcileInterval /= 2

	return &TimeoutReconciler{
		Client:            c,
		Config:            cfg,
		activity:          activity,
		reconcileInterval: reconcileInterval,
		ctrlStartTime:     time.Now().UTC(),
	}, nil
}

// TimeoutReconciler reconciles workspace timeouts. This is a separate reconciler, as it
// always requeues events for existing workspaces such that timeouts are checked at (at least)
// a specified interval. The reconcile loop should therefore be light-weight, as it repeatedly
// reconciles all workspaces in the cluster.
type TimeoutReconciler struct {
	client.Client

	Config            config.Configuration
	activity          *wsactivity.WorkspaceActivity
	reconcileInterval time.Duration
	ctrlStartTime     time.Time
}

//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=workspaces,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=workspaces/status,verbs=get;update;patch

// Reconcile checks the given workspace for timing out. When done, a new event is
// requeued automatically to ensure the workspace gets reconciled at least every reconcileInterval.
func (r *TimeoutReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error) {
	log := log.FromContext(ctx).WithValues("ws", req.NamespacedName)

	var workspace workspacev1.Workspace
	if err := r.Get(ctx, req.NamespacedName, &workspace); err != nil {
		if !apierrors.IsNotFound(err) {
			log.Error(err, "unable to fetch workspace")
		}
		// We ignore not-found errors, since they can't be fixed by an immediate
		// requeue (we'll need to wait for a new notification), and we can get them
		// on deleted requests.
		// On any other error, let the controller requeue an event with exponential
		// backoff.
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}

	if wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionTimeout)) {
		// The workspace has already been marked as timed out.
		// Return and don't requeue another reconciliation.
		return ctrl.Result{}, nil
	}

	// The workspace hasn't timed out yet. After this point, we always
	// want to requeue a reconciliation after the configured interval.
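	// For example, with a (hypothetical) HeartbeatInterval of 30s, r.reconcileInterval
	// is 15s, so every not-yet-timed-out workspace is re-checked at least every 15s,
	// even if no other events arrive for it.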
	defer func() {
		result.RequeueAfter = r.reconcileInterval
	}()

	timedout := r.isWorkspaceTimedOut(&workspace)
	if timedout == "" {
		// Hasn't timed out.
		return ctrl.Result{}, nil
	}

	// Workspace timed out, set the Timeout condition.
	log.Info("Workspace timed out", "reason", timedout)
	if err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
		err := r.Get(ctx, types.NamespacedName{Name: workspace.Name, Namespace: workspace.Namespace}, &workspace)
		if err != nil {
			return err
		}

		workspace.Status.Conditions = wsk8s.AddUniqueCondition(workspace.Status.Conditions, metav1.Condition{
			Type:               string(workspacev1.WorkspaceConditionTimeout),
			Status:             metav1.ConditionTrue,
			LastTransitionTime: metav1.Now(),
			Reason:             "TimedOut",
			Message:            timedout,
		})
		return r.Status().Update(ctx, &workspace)
	}); err != nil {
		log.Error(err, "Failed to update workspace status with Timeout condition")
		return ctrl.Result{}, err
	}

	return ctrl.Result{}, nil
}

type timeoutActivity string

const (
	activityInit               timeoutActivity = "initialization"
	activityStartup            timeoutActivity = "startup"
	activityCreatingContainers timeoutActivity = "creating containers"
	activityPullingImages      timeoutActivity = "pulling images"
	activityRunningHeadless    timeoutActivity = "running the headless workspace"
	activityNone               timeoutActivity = "period of inactivity"
	activityMaxLifetime        timeoutActivity = "maximum lifetime"
	activityClosed             timeoutActivity = "after being closed"
	activityInterrupted        timeoutActivity = "workspace interruption"
	activityStopping           timeoutActivity = "stopping"
	activityBackup             timeoutActivity = "backup"
)

// isWorkspaceTimedOut determines if a workspace is timed out, based on the manager configuration and the state the pod is in.
// This function does NOT use the Timeout condition, but rather is used to set that condition in the first place.
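// The returned reason is a human-readable message, e.g. (with illustrative, not default,
// durations) "workspace timed out after initialization (00h45m) took longer than 00h30m".
// An empty string means the workspace has not timed out.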
func (r *TimeoutReconciler) isWorkspaceTimedOut(ws *workspacev1.Workspace) (reason string) {
	timeouts := r.Config.Timeouts
	phase := ws.Status.Phase

	decide := func(start time.Time, timeout util.Duration, activity timeoutActivity) string {
		td := time.Duration(timeout)
		inactivity := time.Since(start)
		if inactivity < td {
			return ""
		}

		return fmt.Sprintf("workspace timed out after %s (%s) took longer than %s", activity, formatDuration(inactivity), formatDuration(td))
	}

	start := ws.ObjectMeta.CreationTimestamp.Time
	lastActivity := r.activity.GetLastActivity(ws.Name)
	isClosed := wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionClosed))

	switch phase {
	case workspacev1.WorkspacePhasePending:
		return decide(start, timeouts.Initialization, activityInit)

	case workspacev1.WorkspacePhaseInitializing:
		return decide(start, timeouts.TotalStartup, activityStartup)

	case workspacev1.WorkspacePhaseCreating:
		activity := activityCreatingContainers
		// TODO:
		// if status.Conditions.PullingImages == api.WorkspaceConditionBool_TRUE {
		// 	activity = activityPullingImages
		// }
		return decide(start, timeouts.TotalStartup, activity)

	case workspacev1.WorkspacePhaseRunning:
		// The first check is always for the max lifetime.
		if msg := decide(start, timeouts.MaxLifetime, activityMaxLifetime); msg != "" {
			return msg
		}

		timeout := timeouts.RegularWorkspace
		if customTimeout := ws.Spec.Timeout.Time; customTimeout != nil {
			timeout = util.Duration(customTimeout.Duration)
		}
		activity := activityNone
		if ws.Status.Headless {
			timeout = timeouts.HeadlessWorkspace
			lastActivity = &start
			activity = activityRunningHeadless
		} else if lastActivity == nil {
			// The workspace is up and running, but the user has never produced any activity, OR the controller
			// has restarted and not yet received a heartbeat for this workspace (since heartbeats are stored
			// in-memory and reset on restart).
			// First, check whether the controller has restarted during this workspace's lifetime.
			// If it has, use the FirstUserActivity condition to determine whether there had already been any user activity
			// before the controller restart.
			// If the controller started before the workspace, then the user hasn't produced any activity yet.
			if r.ctrlStartTime.After(start) && wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionFirstUserActivity)) {
				// The controller restarted during this workspace's lifetime, and the workspace had activity before the restart,
				// so the last activity has been lost on restart. Therefore, "reset" the timeout and measure only since the controller startup time.
				start = r.ctrlStartTime
			} else {
				// This workspace hasn't had any user activity yet (also not before a potential controller restart).
				// So check for a startup timeout, and measure since workspace creation time.
				timeout = timeouts.TotalStartup
			}
			return decide(start, timeout, activityNone)
		} else if isClosed {
			return decide(*lastActivity, timeouts.AfterClose, activityClosed)
		}
		return decide(*lastActivity, timeout, activity)

	case workspacev1.WorkspacePhaseStopping:
		if isWorkspaceBeingDeleted(ws) && !wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionBackupComplete)) {
			// Beware: we apply the ContentFinalization timeout only to workspaces which are currently being deleted.
			// We basically don't expect a workspace to be in content finalization before it's been deleted.
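			// (Illustrative example, not a shipped default: with ContentFinalization set to 1h,
			// a deleted workspace whose backup hasn't completed within 1h of its
			// DeletionTimestamp is reported as timed out during backup.)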
			return decide(ws.DeletionTimestamp.Time, timeouts.ContentFinalization, activityBackup)
		} else if !isWorkspaceBeingDeleted(ws) {
			// Workspaces that are stopping but have not been deleted never time out.
			return ""
		} else {
			return decide(ws.DeletionTimestamp.Time, timeouts.Stopping, activityStopping)
		}

	default:
		// The only other phase we can be in is Stopped, which is pointless to time out.
		return ""
	}
}

// formatDuration renders a duration rounded to the minute, e.g. 150*time.Minute as "02h30m".
func formatDuration(d time.Duration) string {
	d = d.Round(time.Minute)
	h := d / time.Hour
	d -= h * time.Hour
	m := d / time.Minute
	return fmt.Sprintf("%02dh%02dm", h, m)
}

// SetupWithManager sets up the controller with the Manager.
func (r *TimeoutReconciler) SetupWithManager(mgr ctrl.Manager) error {
	maxConcurrentReconciles := r.Config.TimeoutMaxConcurrentReconciles
	if maxConcurrentReconciles <= 0 {
		maxConcurrentReconciles = 1
	}

	return ctrl.NewControllerManagedBy(mgr).
		WithOptions(controller.Options{MaxConcurrentReconciles: maxConcurrentReconciles}).
		For(&workspacev1.Workspace{}).
		Complete(r)
}
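// A minimal sketch of how this reconciler could be wired into a controller manager.
// The names mgr and cfg, and constructing WorkspaceActivity via its zero value, are
// assumptions for illustration and are not defined in this file:
//
//	activity := &wsactivity.WorkspaceActivity{}
//	timeoutReconciler, err := NewTimeoutReconciler(mgr.GetClient(), cfg, activity)
//	if err != nil {
//		return err // e.g. cfg.HeartbeatInterval was 0
//	}
//	if err := timeoutReconciler.SetupWithManager(mgr); err != nil {
//		return err
//	}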