2023-01-30 23:22:38 +01:00

269 lines
9.3 KiB
Go

// Copyright (c) 2020 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License.AGPL.txt in the project root for license information.
package controllers
import (
"context"
"strings"
"time"
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/client"
"github.com/gitpod-io/gitpod/common-go/log"
"github.com/gitpod-io/gitpod/ws-manager-mk2/clock"
workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
)
// RegisterMetrics registers the Prometheus metrics of this manager with reg.
//
// It first hands the reconciler's clock a backwards-time reporter built by
// clock.PrometheusWallTimeMonotonicityReporter. That reporter is given a
// registerer which prefixes metric names with "gitpod_ws_manager_", matching
// the namespace/subsystem of the other metrics in this file. It then registers
// the reconciler's own collectors and returns the first registration error.
func (r *WorkspaceReconciler) RegisterMetrics(reg prometheus.Registerer) error {
	// WrapRegistererWithPrefix prepends the prefix verbatim, so the "_"
	// separators have to be spelled out here. Concatenating the two constants
	// directly would yield the malformed prefix "gitpodws_manager".
	prefix := metricsNamespace + "_" + metricsWorkspaceSubsystem + "_"
	r.clock.ReportBackwardsTime(
		clock.PrometheusWallTimeMonotonicityReporter(
			prometheus.WrapRegistererWithPrefix(prefix, reg)))

	return r.metrics.Register(reg)
}
// Name components used when building the fully-qualified names of the
// Prometheus metrics defined in this file.
const (
// metricsNamespace is the namespace (first) part of a metric name.
metricsNamespace = "gitpod"
// metricsWorkspaceSubsystem is the subsystem (second) part of a metric name.
metricsWorkspaceSubsystem = "ws_manager"
)
// metrics bundles the histogram and counter vectors exported for workspaces,
// together with the reconciler that the custom collectors created in Register
// read from.
type metrics struct {
// reconciler is passed to the phase and timeout collectors in Register.
reconciler *WorkspaceReconciler
// Histograms (durations in seconds).
// startupTimeHistVec: time for workspace pods to reach the running phase.
startupTimeHistVec *prometheus.HistogramVec
// initializeTimeHistVec: time to initialize a workspace.
initializeTimeHistVec *prometheus.HistogramVec
// finalizeTimeHistVec: time to finalize a workspace.
finalizeTimeHistVec *prometheus.HistogramVec
// Counters.
// totalStartsCounterVec: workspaces started.
totalStartsCounterVec *prometheus.CounterVec
// totalStartsFailureCounterVec: workspaces that failed to start.
totalStartsFailureCounterVec *prometheus.CounterVec
// totalStopsCounterVec: workspaces stopped (additionally labelled by reason).
totalStopsCounterVec *prometheus.CounterVec
// totalBackupCounterVec: workspace backups.
totalBackupCounterVec *prometheus.CounterVec
// totalBackupFailureCounterVec: workspace backup failures.
totalBackupFailureCounterVec *prometheus.CounterVec
// totalRestoreCounterVec: workspace restores.
totalRestoreCounterVec *prometheus.CounterVec
// totalRestoreFailureCounterVec: workspace restore failures.
totalRestoreFailureCounterVec *prometheus.CounterVec
// totalUnintentionalWorkspaceStopCounterVec: workspaces whose container
// stopped without being deleted prior.
totalUnintentionalWorkspaceStopCounterVec *prometheus.CounterVec
}
// newMetrics builds the metrics set for reconciler r. All vectors live in the
// gitpod/ws_manager namespace/subsystem; nothing is registered here, see
// (*metrics).Register for that.
func newMetrics(r *WorkspaceReconciler) *metrics {
	// durationHist creates a histogram vector labelled by workspace type and class.
	durationHist := func(name, help string) *prometheus.HistogramVec {
		return prometheus.NewHistogramVec(prometheus.HistogramOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      name,
			Help:      help,
			// same as components/ws-manager-bridge/src/prometheus-metrics-exporter.ts#L15
			Buckets: prometheus.ExponentialBuckets(2, 2, 10),
		}, []string{"type", "class"})
	}

	// counter creates a counter vector with the given variable labels.
	counter := func(name, help string, labels ...string) *prometheus.CounterVec {
		return prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      name,
			Help:      help,
		}, labels)
	}

	m := &metrics{reconciler: r}

	m.startupTimeHistVec = durationHist(
		"workspace_startup_seconds",
		"time it took for workspace pods to reach the running phase",
	)
	m.initializeTimeHistVec = durationHist(
		"workspace_initialize_seconds",
		"time it took to initialize workspace",
	)
	m.finalizeTimeHistVec = durationHist(
		"workspace_finalize_seconds",
		"time it took to finalize workspace",
	)

	m.totalStartsCounterVec = counter(
		"workspace_starts_total",
		"total number of workspaces started",
		"type", "class",
	)
	m.totalStartsFailureCounterVec = counter(
		"workspace_starts_failure_total",
		"total number of workspaces that failed to start",
		"type", "class",
	)
	m.totalStopsCounterVec = counter(
		"workspace_stops_total",
		"total number of workspaces stopped",
		"reason", "type", "class",
	)
	m.totalBackupCounterVec = counter(
		"workspace_backups_total",
		"total number of workspace backups",
		"type", "class",
	)
	m.totalBackupFailureCounterVec = counter(
		"workspace_backups_failure_total",
		"total number of workspace backup failures",
		"type", "class",
	)
	m.totalRestoreCounterVec = counter(
		"workspace_restores_total",
		"total number of workspace restores",
		"type", "class",
	)
	m.totalRestoreFailureCounterVec = counter(
		"workspace_restores_failure_total",
		"total number of workspace restore failures",
		"type", "class",
	)
	m.totalUnintentionalWorkspaceStopCounterVec = counter(
		"workspace_unintentional_stop_total",
		"total number of workspaces when container stopped without being deleted prior",
		"type", "class",
	)

	return m
}
// Register adds every collector ws-manager exports to reg: the histogram and
// counter vectors held by m plus freshly created per-phase and per-timeout
// gauge collectors. Registration stops at the first failure and that error is
// returned; nil means all collectors were registered.
func (m *metrics) Register(reg prometheus.Registerer) error {
	for _, collector := range []prometheus.Collector{
		m.startupTimeHistVec,
		m.initializeTimeHistVec,
		m.finalizeTimeHistVec,
		newPhaseTotalVec(m.reconciler),
		newTimeoutSettingsVec(m.reconciler),
		m.totalStartsCounterVec,
		m.totalStartsFailureCounterVec,
		m.totalStopsCounterVec,
		m.totalBackupCounterVec,
		m.totalBackupFailureCounterVec,
		m.totalRestoreCounterVec,
		m.totalRestoreFailureCounterVec,
		m.totalUnintentionalWorkspaceStopCounterVec,
	} {
		if err := reg.Register(collector); err != nil {
			return err
		}
	}
	return nil
}
// phaseTotalVec is a Prometheus collector that reports, as a gauge, the
// current number of workspaces per phase, type and class. The values are
// computed on every Collect call by listing workspaces through the reconciler.
type phaseTotalVec struct {
// name is the fully-qualified metric name; also used in log messages.
name string
// desc is the single descriptor shared by all emitted metrics.
desc *prometheus.Desc
// reconciler provides the workspace listing and the namespace to list in.
reconciler *WorkspaceReconciler
}
// newPhaseTotalVec creates the per-phase workspace gauge collector backed by
// reconciler r. The metric is named <namespace>_<subsystem>_workspace_phase_total
// and carries the variable labels "phase", "type" and "class".
func newPhaseTotalVec(r *WorkspaceReconciler) *phaseTotalVec {
	fqName := prometheus.BuildFQName(metricsNamespace, metricsWorkspaceSubsystem, "workspace_phase_total")
	variableLabels := []string{"phase", "type", "class"}

	vec := &phaseTotalVec{
		name:       fqName,
		reconciler: r,
	}
	vec.desc = prometheus.NewDesc(fqName, "Current number of workspaces per phase", variableLabels, prometheus.Labels{})
	return vec
}
// Describe implements prometheus.Collector. It sends exactly one Desc, the
// descriptor built in newPhaseTotalVec, to the provided channel.
func (ptv *phaseTotalVec) Describe(ch chan<- *prometheus.Desc) {
ch <- ptv.desc
}
// Collect implements prometheus.Collector. It lists all workspaces in the
// configured namespace, counts them per (type, phase, class) combination and
// sends one gauge metric per combination to ch.
//
// If listing fails, the error is logged at debug level and nothing is sent.
// If a single metric cannot be created, a warning is logged and the remaining
// combinations are still emitted.
func (ptv *phaseTotalVec) Collect(ch chan<- prometheus.Metric) {
	ctx, cancel := context.WithTimeout(context.Background(), kubernetesOperationTimeout)
	defer cancel()

	var workspaces workspacev1.WorkspaceList
	err := ptv.reconciler.List(ctx, &workspaces, client.InNamespace(ptv.reconciler.Config.Namespace))
	if err != nil {
		log.WithError(err).Debugf("cannot list workspaces for %s gauge", ptv.name)
		return
	}

	// Counts are keyed by "<type>::<phase>::<class>".
	const sep = "::"
	counts := make(map[string]int)
	for i := range workspaces.Items {
		ws := &workspaces.Items[i]
		counts[string(ws.Spec.Type)+sep+string(ws.Status.Phase)+sep+ws.Spec.Class]++
	}

	for key, count := range counts {
		// Split into at most three parts so a class that itself contains the
		// separator is kept intact instead of being truncated. Every key was
		// built with two separators, so all three segments exist.
		segs := strings.SplitN(key, sep, 3)
		tpe, phase, class := segs[0], segs[1], segs[2]

		// Label values must follow the order declared on the desc: phase, type, class.
		metric, err := prometheus.NewConstMetric(ptv.desc, prometheus.GaugeValue, float64(count), phase, tpe, class)
		if err != nil {
			log.WithError(err).Warnf("cannot create workspace metric - %s will be inaccurate", ptv.name)
			continue
		}
		ch <- metric
	}
}
// timeoutSettingsVec is a Prometheus collector that reports, as a gauge, the
// current number of workspaces per timeout setting. Workspaces without an
// explicit timeout are not counted (see Collect).
type timeoutSettingsVec struct {
// name is the fully-qualified metric name; also used in log messages.
name string
// reconciler provides the workspace listing and the namespace to list in.
reconciler *WorkspaceReconciler
// desc is the single descriptor shared by all emitted metrics.
desc *prometheus.Desc
}
// newTimeoutSettingsVec creates the per-timeout workspace gauge collector
// backed by reconciler r. The metric carries a single variable label,
// "timeout".
//
// NOTE(review): the name is built from the literals "wsman"/"workspace" rather
// than metricsNamespace/metricsWorkspaceSubsystem. That looks like a legacy
// name; confirm before changing it, since a rename alters the exported metric.
func newTimeoutSettingsVec(r *WorkspaceReconciler) *timeoutSettingsVec {
	fqName := prometheus.BuildFQName("wsman", "workspace", "timeout_settings_total")

	return &timeoutSettingsVec{
		name:       fqName,
		reconciler: r,
		desc: prometheus.NewDesc(
			fqName,
			"Current number of workspaces per timeout setting",
			[]string{"timeout"},
			prometheus.Labels{},
		),
	}
}
// Describe implements prometheus.Collector. It sends exactly one Desc, the
// descriptor built in newTimeoutSettingsVec, to the provided channel.
func (tsv *timeoutSettingsVec) Describe(ch chan<- *prometheus.Desc) {
	ch <- tsv.desc
}
// Collect implements prometheus.Collector. It lists all workspaces in the
// configured namespace, groups those that have an explicit timeout by its
// duration, and sends one gauge metric per distinct duration to ch. The
// "timeout" label holds the duration's String() form.
//
// If listing fails, the error is logged at debug level and nothing is sent.
func (tsv *timeoutSettingsVec) Collect(ch chan<- prometheus.Metric) {
	ctx, cancel := context.WithTimeout(context.Background(), kubernetesOperationTimeout)
	defer cancel()

	var list workspacev1.WorkspaceList
	if err := tsv.reconciler.List(ctx, &list, client.InNamespace(tsv.reconciler.Config.Namespace)); err != nil {
		log.WithError(err).Debugf("cannot list workspaces for %s gauge", tsv.name)
		return
	}

	perTimeout := make(map[time.Duration]int)
	for i := range list.Items {
		// Workspaces without an explicit timeout are left out of the gauge.
		if t := list.Items[i].Spec.Timeout.Time; t != nil {
			perTimeout[t.Duration]++
		}
	}

	for timeout, count := range perTimeout {
		// metrics cannot be re-used, we have to create them every single time
		metric, err := prometheus.NewConstMetric(tsv.desc, prometheus.GaugeValue, float64(count), timeout.String())
		if err != nil {
			log.WithError(err).Warnf("cannot create workspace metric - %s will be inaccurate", tsv.name)
			continue
		}
		ch <- metric
	}
}