Kyle Brennan 0d1d0bd9f3
[ws-manager-mk2] logging improvements (#19635)
* [ws-manager-mk2] logging improvements

* Record phase transitions, rather than recording the phase each time we reconcile
* Add OWI to log instances

Also, avoid logging workspace name in workspace_types.go, as it contains org and repo info.

* Fix logger WithValues

* Include OWI with reconciling workspace messages

* Include OWI with timeout logs

* Cleanup

* Rely on OWI on the context

Thanks, @WVerlaek!
2024-04-18 20:52:56 +02:00
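
A note on the "Rely on OWI on the context" item: it follows the standard logr/controller-runtime pattern of attaching fields to a logger once and carrying that logger on the context, so downstream code does not have to rebuild the fields. A minimal, self-contained sketch of that pattern (the doWork helper and the literal OWI value are illustrative, not part of this change):

package main

import (
	"context"

	"github.com/go-logr/logr"
	"github.com/go-logr/logr/funcr"
)

func doWork(ctx context.Context) {
	// Downstream code recovers the enriched logger from the context;
	// every message automatically carries the "owi" field.
	log := logr.FromContextOrDiscard(ctx)
	log.Info("reconciling workspace")
}

func main() {
	logger := funcr.New(func(prefix, args string) { println(prefix, args) }, funcr.Options{})
	// Attach identifying fields once, then store the logger on the context.
	logger = logger.WithValues("owi", "owner/workspace/instance")
	doWork(logr.NewContext(context.Background(), logger))
}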


// Copyright (c) 2022 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License-AGPL.txt in the project root for license information.

package controllers

import (
	"context"
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/tools/record"
	"k8s.io/client-go/util/retry"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/predicate"

	k8s "github.com/gitpod-io/gitpod/common-go/kubernetes"
	"github.com/gitpod-io/gitpod/common-go/util"
	"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/activity"
	"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/constants"
	"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/maintenance"
	config "github.com/gitpod-io/gitpod/ws-manager/api/config"
	workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
	"github.com/go-logr/logr"
)

func NewTimeoutReconciler(c client.Client, recorder record.EventRecorder, cfg config.Configuration, maintenance maintenance.Maintenance) (*TimeoutReconciler, error) {
	if cfg.HeartbeatInterval == 0 {
		return nil, fmt.Errorf("invalid heartbeat interval, must not be 0")
	}
	reconcileInterval := time.Duration(cfg.HeartbeatInterval)
	// Reconcile interval is half the heartbeat interval to catch timed out workspaces in time.
	// See https://en.wikipedia.org/wiki/Nyquist%E2%80%93Shannon_sampling_theorem for why we need this.
	reconcileInterval /= 2
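	// For example (illustrative numbers, not from the config): with a
	// HeartbeatInterval of 30s, the reconcile interval is 15s, so a timed-out
	// workspace is noticed at most ~15s after its timeout elapses.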

	return &TimeoutReconciler{
		Client:            c,
		Config:            cfg,
		reconcileInterval: reconcileInterval,
		recorder:          recorder,
		maintenance:       maintenance,
	}, nil
}

// TimeoutReconciler reconciles workspace timeouts. This is a separate reconciler, as it
// always requeues events for existing workspaces such that timeouts are checked on (at least)
// a specified interval. The reconcile loop should therefore be light-weight as it's repeatedly
// reconciling all workspaces in the cluster.
type TimeoutReconciler struct {
	client.Client

	Config            config.Configuration
	reconcileInterval time.Duration
	recorder          record.EventRecorder
	maintenance       maintenance.Maintenance
}

//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=workspaces,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=workspaces/status,verbs=get;update;patch

// Reconcile will check the given workspace for timing out. When done, a new event gets
// requeued automatically to ensure the workspace gets reconciled at least every reconcileInterval.
func (r *TimeoutReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error) {
	log := log.FromContext(ctx).WithValues("ws", req.NamespacedName)

	var workspace workspacev1.Workspace
	if err := r.Get(ctx, req.NamespacedName, &workspace); err != nil {
		if !apierrors.IsNotFound(err) {
			log.Error(err, "unable to fetch workspace")
		}
		// We'll ignore not-found errors, since they can't be fixed by an immediate
		// requeue (we'll need to wait for a new notification), and we can get them
		// on deleted requests.
		// On any other error, let the controller requeue an event with exponential
		// backoff.
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}
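
	// Attach the workspace's OWI (owner/workspace/instance) fields to the
	// logger and put that logger on the context, so downstream code logs with
	// the same identifiers. The workspace name itself is deliberately not
	// logged, as it contains org and repo information.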
	log = log.WithValues("owi", workspace.OWI())
	ctx = logr.NewContext(ctx, log)

	if workspace.IsConditionTrue(workspacev1.WorkspaceConditionTimeout) {
		// Workspace has already been marked as timed out.
		// Return and don't requeue another reconciliation.
		return ctrl.Result{}, nil
	}

	if r.maintenance.IsEnabled(ctx) {
		// Don't reconcile timeouts in maintenance mode, to prevent workspace deletion.
		// Requeue after some time to ensure we do still reconcile this workspace when
		// maintenance mode ends.
		return ctrl.Result{RequeueAfter: maintenanceRequeue}, nil
	}

	// The workspace hasn't timed out yet. After this point, we always
	// want to requeue a reconciliation after the configured interval.
	defer func() {
		result.RequeueAfter = r.reconcileInterval
	}()

	timedout := r.isWorkspaceTimedOut(&workspace)
	if timedout == "" {
		// Hasn't timed out.
		return ctrl.Result{}, nil
	}

	// Workspace timed out, set Timeout condition.
	log.V(2).Info("Workspace timed out", "reason", timedout)
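
	// Re-fetch the workspace on every retry so the status update is applied
	// against the latest resource version, resolving update conflicts.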
	if err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
		err := r.Get(ctx, types.NamespacedName{Name: workspace.Name, Namespace: workspace.Namespace}, &workspace)
		if err != nil {
			return err
		}

		workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionTimeout(timedout))
		return r.Status().Update(ctx, &workspace)
	}); err != nil {
		log.Error(err, "Failed to update workspace status with Timeout condition")
		return ctrl.Result{}, fmt.Errorf("failed to add timeout condition: %w", err)
	}

	r.recorder.Event(&workspace, corev1.EventTypeNormal, "TimedOut", timedout)
	return ctrl.Result{}, nil
}

type timeoutActivity string

const (
	activityInit               timeoutActivity = "initialization"
	activityStartup            timeoutActivity = "startup"
	activityCreatingContainers timeoutActivity = "creating containers"
	activityPullingImages      timeoutActivity = "pulling images"
	activityRunningHeadless    timeoutActivity = "running the headless workspace"
	activityNone               timeoutActivity = "period of inactivity"
	activityMaxLifetime        timeoutActivity = "maximum lifetime"
	activityClosed             timeoutActivity = "after being closed"
	activityInterrupted        timeoutActivity = "workspace interruption"
	activityStopping           timeoutActivity = "stopping"
	activityBackup             timeoutActivity = "backup"
)

// isWorkspaceTimedOut determines if a workspace is timed out based on the manager configuration and the state the pod is in.
// This function does NOT use the Timeout condition, but rather is used to set that condition in the first place.
func (r *TimeoutReconciler) isWorkspaceTimedOut(ws *workspacev1.Workspace) (reason string) {
	timeouts := r.Config.Timeouts
	phase := ws.Status.Phase

	decide := func(start time.Time, timeout util.Duration, activity timeoutActivity) string {
		td := time.Duration(timeout)
		inactivity := time.Since(start)
		if inactivity < td {
			return ""
		}

		return fmt.Sprintf("workspace timed out after %s (%s) took longer than %s", activity, formatDuration(inactivity), formatDuration(td))
	}
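
	// decide produces messages like:
	//   "workspace timed out after period of inactivity (01h30m) took longer than 01h00m"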

	start := ws.ObjectMeta.CreationTimestamp.Time
	lastActivity := activity.Last(ws)
	isClosed := ws.IsConditionTrue(workspacev1.WorkspaceConditionClosed)

	switch phase {
	case workspacev1.WorkspacePhasePending:
		return decide(start, timeouts.Initialization, activityInit)

	case workspacev1.WorkspacePhaseInitializing:
		return decide(start, timeouts.TotalStartup, activityStartup)

	case workspacev1.WorkspacePhaseCreating:
		activity := activityCreatingContainers
		// TODO:
		// if status.Conditions.PullingImages == api.WorkspaceConditionBool_TRUE {
		// 	activity = activityPullingImages
		// }
		return decide(start, timeouts.TotalStartup, activity)

	case workspacev1.WorkspacePhaseRunning:
		// First check is always for the max lifetime.
		maxLifetime := r.getMaxLifetime(ws)
		if msg := decide(start, maxLifetime, activityMaxLifetime); msg != "" {
			return msg
		}

		timeout := timeouts.RegularWorkspace
		if customTimeout := ws.Spec.Timeout.Time; customTimeout != nil {
			timeout = util.Duration(customTimeout.Duration)
		}
		activity := activityNone
		if ws.IsHeadless() {
			timeout = timeouts.HeadlessWorkspace
			lastActivity = &start
			activity = activityRunningHeadless
		} else if lastActivity == nil {
			// The workspace is up and running, but the user has never produced any activity.
			return decide(start, timeouts.TotalStartup, activityNone)
		} else if isClosed {
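			// Decide based on the time since the workspace was closed. A
			// per-workspace ClosedTimeout of 0 disables this check.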
			reason := func() string {
				afterClosed := timeouts.AfterClose
				if customClosedTimeout := ws.Spec.Timeout.ClosedTimeout; customClosedTimeout != nil {
					afterClosed = util.Duration(customClosedTimeout.Duration)
					if afterClosed == 0 {
						return ""
					}
				}
				return decide(*lastActivity, afterClosed, activityClosed)
			}()
			if reason != "" {
				return reason
			}
		}
		return decide(*lastActivity, timeout, activity)

	case workspacev1.WorkspacePhaseStopping:
		if isWorkspaceBeingDeleted(ws) && !ws.IsConditionTrue(workspacev1.WorkspaceConditionBackupComplete) {
			// Beware: we apply the ContentFinalization timeout only to workspaces which are currently being deleted.
			// We basically don't expect a workspace to be in content finalization before it's been deleted.
			return decide(ws.DeletionTimestamp.Time, timeouts.ContentFinalization, activityBackup)
		} else if !isWorkspaceBeingDeleted(ws) {
			// Workspaces that have not been deleted have never timed out.
			return ""
		} else {
			return decide(ws.DeletionTimestamp.Time, timeouts.Stopping, activityStopping)
		}

	default:
		// The only other phase we can be in is stopped, which is pointless to time out.
		return ""
	}
}
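
// getMaxLifetime returns the effective maximum lifetime of the workspace: the
// per-workspace override from the spec if set, otherwise the configured default.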
func (r *TimeoutReconciler) getMaxLifetime(ws *workspacev1.Workspace) util.Duration {
	if ws.Spec.Timeout.MaximumLifetime != nil {
		return util.Duration(ws.Spec.Timeout.MaximumLifetime.Duration)
	}

	return r.Config.Timeouts.MaxLifetime
}
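
// formatDuration renders d rounded to the nearest minute as zero-padded hours
// and minutes, e.g. 90*time.Minute becomes "01h30m".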
func formatDuration(d time.Duration) string {
	d = d.Round(time.Minute)
	h := d / time.Hour
	d -= h * time.Hour
	m := d / time.Minute
	return fmt.Sprintf("%02dh%02dm", h, m)
}

// SetupWithManager sets up the controller with the Manager.
func (r *TimeoutReconciler) SetupWithManager(mgr ctrl.Manager) error {
	maxConcurrentReconciles := r.Config.TimeoutMaxConcurrentReconciles
	if maxConcurrentReconciles <= 0 {
		maxConcurrentReconciles = 1
	}

	return ctrl.NewControllerManagedBy(mgr).
		Named("timeout").
		WithOptions(controller.Options{MaxConcurrentReconciles: maxConcurrentReconciles}).
		For(&workspacev1.Workspace{}).
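		// Filter on the managed-by label: if the label is present, only
		// reconcile workspaces managed by this manager; objects without the
		// label pass through.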
		WithEventFilter(predicate.NewPredicateFuncs(func(object client.Object) bool {
			for k, v := range object.GetLabels() {
				if k == k8s.WorkspaceManagedByLabel {
					switch v {
					case constants.ManagedBy:
						return true
					default:
						return false
					}
				}
			}
			return true
		})).
		Complete(r)
}