// Copyright (c) 2023 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License.AGPL.txt in the project root for license information.

package controllers

import (
	"context"
	"strings"
	"time"

	wsk8s "github.com/gitpod-io/gitpod/common-go/kubernetes"
	workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
	"github.com/go-logr/logr"
	lru "github.com/hashicorp/golang-lru"
	"github.com/prometheus/client_golang/prometheus"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

const (
	workspaceStartupSeconds       string = "workspace_startup_seconds"
	workspaceStartFailuresTotal   string = "workspace_starts_failure_total"
	workspaceFailuresTotal        string = "workspace_failure_total"
	workspaceStopsTotal           string = "workspace_stops_total"
	workspaceBackupsTotal         string = "workspace_backups_total"
	workspaceBackupFailuresTotal  string = "workspace_backups_failure_total"
	workspaceRestoresTotal        string = "workspace_restores_total"
	workspaceRestoresFailureTotal string = "workspace_restores_failure_total"
)

type StopReason string

const (
	StopReasonFailed       = "failed"
	StopReasonStartFailure = "start-failure"
	StopReasonAborted      = "aborted"
	StopReasonOutOfSpace   = "out-of-space"
	StopReasonTimeout      = "timeout"
	StopReasonTabClosed    = "tab-closed"
	StopReasonRegular      = "regular-stop"
)

type controllerMetrics struct {
	startupTimeHistVec           *prometheus.HistogramVec
	totalStartsFailureCounterVec *prometheus.CounterVec
	totalFailuresCounterVec      *prometheus.CounterVec
	totalStopsCounterVec         *prometheus.CounterVec

	totalBackupCounterVec         *prometheus.CounterVec
	totalBackupFailureCounterVec  *prometheus.CounterVec
	totalRestoreCounterVec        *prometheus.CounterVec
	totalRestoreFailureCounterVec *prometheus.CounterVec

	workspacePhases *phaseTotalVec
	timeoutSettings *timeoutSettingsVec

	// used to prevent recording metrics multiple times
	cache *lru.Cache
}

func newControllerMetrics(r *WorkspaceReconciler) (*controllerMetrics, error) {
	cache, err := lru.New(6000)
	if err != nil {
		return nil, err
	}

	return &controllerMetrics{
		startupTimeHistVec: prometheus.NewHistogramVec(prometheus.HistogramOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceStartupSeconds,
			Help:      "time it took for workspace pods to reach the running phase",
			Buckets:   prometheus.ExponentialBuckets(2, 2, 10),
		}, []string{"type", "class"}),
		totalStartsFailureCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceStartFailuresTotal,
			Help:      "total number of workspaces that failed to start",
		}, []string{"type", "class"}),
		totalFailuresCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceFailuresTotal,
			Help:      "total number of workspaces that had a failed condition",
		}, []string{"type", "class"}),
		totalStopsCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceStopsTotal,
			Help:      "total number of workspaces stopped",
		}, []string{"reason", "type", "class"}),
		totalBackupCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceBackupsTotal,
			Help:      "total number of workspace backups",
		}, []string{"type", "class"}),
		totalBackupFailureCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceBackupFailuresTotal,
			Help:      "total number of workspace backup failures",
		}, []string{"type", "class"}),
		totalRestoreCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceRestoresTotal,
			Help:      "total number of workspace restores",
		}, []string{"type", "class"}),
		totalRestoreFailureCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceRestoresFailureTotal,
			Help:      "total number of workspace restore failures",
		}, []string{"type", "class"}),

		workspacePhases: newPhaseTotalVec(r),
		timeoutSettings: newTimeoutSettingsVec(r),

		cache: cache,
	}, nil
}

func (m *controllerMetrics) recordWorkspaceStartupTime(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	hist, err := m.startupTimeHistVec.GetMetricWithLabelValues(tpe, class)
	if err != nil {
		log.Error(err, "could not record workspace startup time", "type", tpe, "class", class)
		// Without a metric for these labels there is nothing to observe on.
		return
	}

	hist.Observe(time.Since(ws.CreationTimestamp.Time).Seconds())
}

func (m *controllerMetrics) countWorkspaceStartFailures(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	counter, err := m.totalStartsFailureCounterVec.GetMetricWithLabelValues(tpe, class)
	if err != nil {
		log.Error(err, "could not count workspace startup failure", "type", tpe, "class", class)
		return
	}

	counter.Inc()
}

func (m *controllerMetrics) countWorkspaceFailure(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	counter, err := m.totalFailuresCounterVec.GetMetricWithLabelValues(tpe, class)
	if err != nil {
		log.Error(err, "could not count workspace failure", "type", tpe, "class", class)
		return
	}

	counter.Inc()
}

func (m *controllerMetrics) countWorkspaceStop(log *logr.Logger, ws *workspacev1.Workspace) {
	var reason string
	if c := wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)); c != nil {
		reason = StopReasonFailed
		if !wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionEverReady)) {
			// Don't record 'failed' if there was a start failure.
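			// (EverReady never turned true, i.e. the workspace never came up in the first place.)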
			reason = StopReasonStartFailure
		} else if strings.Contains(c.Message, "Pod ephemeral local storage usage exceeds the total limit of containers") {
			reason = StopReasonOutOfSpace
		}
	} else if wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionAborted)) {
		reason = StopReasonAborted
	} else if wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionTimeout)) {
		reason = StopReasonTimeout
	} else if wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionClosed)) {
		reason = StopReasonTabClosed
	} else {
		reason = StopReasonRegular
	}

	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	counter, err := m.totalStopsCounterVec.GetMetricWithLabelValues(reason, tpe, class)
	if err != nil {
		log.Error(err, "could not count workspace stop", "reason", reason, "type", tpe, "class", class)
		return
	}

	counter.Inc()
}

func (m *controllerMetrics) countTotalBackups(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	counter, err := m.totalBackupCounterVec.GetMetricWithLabelValues(tpe, class)
	if err != nil {
		log.Error(err, "could not count workspace backup", "type", tpe, "class", class)
		return
	}

	counter.Inc()
}

func (m *controllerMetrics) countTotalBackupFailures(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	counter, err := m.totalBackupFailureCounterVec.GetMetricWithLabelValues(tpe, class)
	if err != nil {
		log.Error(err, "could not count workspace backup failure", "type", tpe, "class", class)
		return
	}

	counter.Inc()
}

func (m *controllerMetrics) countTotalRestores(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	counter, err := m.totalRestoreCounterVec.GetMetricWithLabelValues(tpe, class)
	if err != nil {
		log.Error(err, "could not count workspace restore", "type", tpe, "class", class)
		return
	}

	counter.Inc()
}

func (m *controllerMetrics) countTotalRestoreFailures(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	counter, err := m.totalRestoreFailureCounterVec.GetMetricWithLabelValues(tpe, class)
	if err != nil {
		log.Error(err, "could not count workspace restore failure", "type", tpe, "class", class)
		return
	}

	counter.Inc()
}

func (m *controllerMetrics) containsWorkspace(ws *workspacev1.Workspace) bool {
	return m.cache.Contains(ws.Name)
}

func (m *controllerMetrics) rememberWorkspace(ws *workspacev1.Workspace, state *metricState) {
	var s metricState
	if state != nil {
		s = *state
	} else {
		s = newMetricState(ws)
	}
	m.cache.Add(ws.Name, s)
}

func (m *controllerMetrics) forgetWorkspace(ws *workspacev1.Workspace) {
	m.cache.Remove(ws.Name)
}

// metricState is used to track which metrics have been recorded for a workspace.
type metricState struct {
	phase                   workspacev1.WorkspacePhase
	recordedStartTime       bool
	recordedInitFailure     bool
	recordedStartFailure    bool
	recordedFailure         bool
	recordedContentReady    bool
	recordedBackupFailed    bool
	recordedBackupCompleted bool
}

func newMetricState(ws *workspacev1.Workspace) metricState {
	return metricState{
		phase: ws.Status.Phase,

		// Here we assume that we've recorded metrics for the following states already if their conditions already exist.
		// This is to prevent these from being re-recorded after the controller restarts and clears the metric state for
		// each workspace.
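		// (The LRU cache holding the metric state is in-memory only, so it is empty after a restart;
		// the conditions on the workspace resource are the only signal that survives.)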
		recordedStartTime:       ws.Status.Phase == workspacev1.WorkspacePhaseRunning,
		recordedInitFailure:     wsk8s.ConditionWithStatusAndReason(ws.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady), false, workspacev1.ReasonInitializationFailure),
		recordedStartFailure:    ws.Status.Phase == workspacev1.WorkspacePhaseStopped && !wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionEverReady)),
		recordedFailure:         wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)),
		recordedContentReady:    wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady)),
		recordedBackupFailed:    wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionBackupFailure)),
		recordedBackupCompleted: wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionBackupComplete)),
	}
}

// getWorkspace returns the last recorded metric state for that workspace.
func (m *controllerMetrics) getWorkspace(log *logr.Logger, ws *workspacev1.Workspace) (bool, metricState) {
	s, ok := m.cache.Get(ws.Name)
	if !ok {
		return false, metricState{}
	}

	return true, s.(metricState)
}

// Describe implements Collector. It sends the Desc of each contained metric vector to the provided channel.
func (m *controllerMetrics) Describe(ch chan<- *prometheus.Desc) {
	m.startupTimeHistVec.Describe(ch)
	m.totalStopsCounterVec.Describe(ch)
	m.totalStartsFailureCounterVec.Describe(ch)
	m.totalFailuresCounterVec.Describe(ch)
	m.totalBackupCounterVec.Describe(ch)
	m.totalBackupFailureCounterVec.Describe(ch)
	m.totalRestoreCounterVec.Describe(ch)
	m.totalRestoreFailureCounterVec.Describe(ch)
	m.workspacePhases.Describe(ch)
	m.timeoutSettings.Describe(ch)
}

// Collect implements Collector.
func (m *controllerMetrics) Collect(ch chan<- prometheus.Metric) {
	m.startupTimeHistVec.Collect(ch)
	m.totalStopsCounterVec.Collect(ch)
	m.totalStartsFailureCounterVec.Collect(ch)
	m.totalFailuresCounterVec.Collect(ch)
	m.totalBackupCounterVec.Collect(ch)
	m.totalBackupFailureCounterVec.Collect(ch)
	m.totalRestoreCounterVec.Collect(ch)
	m.totalRestoreFailureCounterVec.Collect(ch)
	m.workspacePhases.Collect(ch)
	m.timeoutSettings.Collect(ch)
}

// phaseTotalVec is a gauge vector counting the current number of workspaces per phase.
type phaseTotalVec struct {
	name       string
	desc       *prometheus.Desc
	reconciler *WorkspaceReconciler
}

func newPhaseTotalVec(r *WorkspaceReconciler) *phaseTotalVec {
	name := prometheus.BuildFQName(metricsNamespace, metricsWorkspaceSubsystem, "workspace_phase_total")
	return &phaseTotalVec{
		name:       name,
		desc:       prometheus.NewDesc(name, "Current number of workspaces per phase", []string{"phase", "type", "class"}, prometheus.Labels(map[string]string{})),
		reconciler: r,
	}
}

// Describe implements Collector. It will send exactly one Desc to the provided channel.
func (ptv *phaseTotalVec) Describe(ch chan<- *prometheus.Desc) {
	ch <- ptv.desc
}

// Collect implements Collector.
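// It lists all workspaces in the configured namespace and emits one gauge per
// observed (phase, type, class) combination.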
func (ptv *phaseTotalVec) Collect(ch chan<- prometheus.Metric) {
	ctx, cancel := context.WithTimeout(context.Background(), kubernetesOperationTimeout)
	defer cancel()

	var workspaces workspacev1.WorkspaceList
	err := ptv.reconciler.List(ctx, &workspaces, client.InNamespace(ptv.reconciler.Config.Namespace))
	if err != nil {
		return
	}

	counts := make(map[string]int)
	for _, ws := range workspaces.Items {
		counts[string(ws.Spec.Type)+"::"+string(ws.Status.Phase)+"::"+ws.Spec.Class]++
	}

	for key, count := range counts {
		segs := strings.Split(key, "::")
		tpe, phase, class := segs[0], segs[1], segs[2]

		metric, err := prometheus.NewConstMetric(ptv.desc, prometheus.GaugeValue, float64(count), phase, tpe, class)
		if err != nil {
			continue
		}

		ch <- metric
	}
}

// timeoutSettingsVec is a gauge vector counting the current number of workspaces per custom timeout setting.
type timeoutSettingsVec struct {
	name       string
	reconciler *WorkspaceReconciler
	desc       *prometheus.Desc
}

func newTimeoutSettingsVec(r *WorkspaceReconciler) *timeoutSettingsVec {
	name := prometheus.BuildFQName("wsman", "workspace", "timeout_settings_total")
	desc := prometheus.NewDesc(
		name,
		"Current number of workspaces per timeout setting",
		[]string{"timeout"},
		prometheus.Labels(map[string]string{}),
	)
	return &timeoutSettingsVec{
		name:       name,
		reconciler: r,
		desc:       desc,
	}
}

// Describe implements Collector. It will send exactly one Desc to the provided channel.
func (tsv *timeoutSettingsVec) Describe(ch chan<- *prometheus.Desc) {
	ch <- tsv.desc
}

// Collect implements Collector.
func (tsv *timeoutSettingsVec) Collect(ch chan<- prometheus.Metric) {
	ctx, cancel := context.WithTimeout(context.Background(), kubernetesOperationTimeout)
	defer cancel()

	var workspaces workspacev1.WorkspaceList
	err := tsv.reconciler.List(ctx, &workspaces, client.InNamespace(tsv.reconciler.Config.Namespace))
	if err != nil {
		return
	}

	timeouts := make(map[time.Duration]int)
	for _, ws := range workspaces.Items {
		if ws.Spec.Timeout.Time == nil {
			continue
		}
		timeouts[ws.Spec.Timeout.Time.Duration]++
	}

	for timeout, cnt := range timeouts {
		// metrics cannot be re-used, we have to create them every single time
		metric, err := prometheus.NewConstMetric(tsv.desc, prometheus.GaugeValue, float64(cnt), timeout.String())
		if err != nil {
			continue
		}

		ch <- metric
	}
}
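// Registration sketch (illustrative, not part of this file): controllerMetrics
// implements prometheus.Collector and forwards Describe/Collect to all contained
// vectors, so registering it once is sufficient. Assuming a *prometheus.Registry
// named "reg" and a *WorkspaceReconciler named "r", both created elsewhere:
//
//	metrics, err := newControllerMetrics(r)
//	if err != nil {
//		return err
//	}
//	reg.MustRegister(metrics)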