mirror of
https://github.com/gitpod-io/gitpod.git
synced 2025-12-08 17:36:30 +00:00
269 lines
9.3 KiB
Go
269 lines
9.3 KiB
Go
// Copyright (c) 2020 Gitpod GmbH. All rights reserved.
|
|
// Licensed under the GNU Affero General Public License (AGPL).
|
|
// See License.AGPL.txt in the project root for license information.
|
|
|
|
package controllers
|
|
|
|
import (
|
|
"context"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"sigs.k8s.io/controller-runtime/pkg/client"
|
|
|
|
"github.com/gitpod-io/gitpod/common-go/log"
|
|
"github.com/gitpod-io/gitpod/ws-manager-mk2/clock"
|
|
workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
|
|
)
|
|
|
|
// RegisterMetrics registers the Prometheus metrics of this manager
|
|
func (r *WorkspaceReconciler) RegisterMetrics(reg prometheus.Registerer) error {
|
|
r.clock.ReportBackwardsTime(
|
|
clock.PrometheusWallTimeMonotonicityReporter(
|
|
prometheus.WrapRegistererWithPrefix(metricsNamespace+metricsWorkspaceSubsystem, reg)))
|
|
|
|
return r.metrics.Register(reg)
|
|
}
|
|
|
|
// Name components shared by the metrics declared in this file.
const (
	// metricsNamespace is the Prometheus namespace (first name segment).
	metricsNamespace = "gitpod"
	// metricsWorkspaceSubsystem is the Prometheus subsystem (second name segment).
	metricsWorkspaceSubsystem = "ws_manager"
)
|
|
|
|
type metrics struct {
|
|
reconciler *WorkspaceReconciler
|
|
|
|
// Histogram
|
|
startupTimeHistVec *prometheus.HistogramVec
|
|
initializeTimeHistVec *prometheus.HistogramVec
|
|
finalizeTimeHistVec *prometheus.HistogramVec
|
|
|
|
// Counter
|
|
totalStartsCounterVec *prometheus.CounterVec
|
|
totalStartsFailureCounterVec *prometheus.CounterVec
|
|
totalStopsCounterVec *prometheus.CounterVec
|
|
totalBackupCounterVec *prometheus.CounterVec
|
|
totalBackupFailureCounterVec *prometheus.CounterVec
|
|
totalRestoreCounterVec *prometheus.CounterVec
|
|
totalRestoreFailureCounterVec *prometheus.CounterVec
|
|
totalUnintentionalWorkspaceStopCounterVec *prometheus.CounterVec
|
|
}
|
|
|
|
func newMetrics(r *WorkspaceReconciler) *metrics {
|
|
return &metrics{
|
|
reconciler: r,
|
|
startupTimeHistVec: prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
|
Namespace: metricsNamespace,
|
|
Subsystem: metricsWorkspaceSubsystem,
|
|
Name: "workspace_startup_seconds",
|
|
Help: "time it took for workspace pods to reach the running phase",
|
|
// same as components/ws-manager-bridge/src/prometheus-metrics-exporter.ts#L15
|
|
Buckets: prometheus.ExponentialBuckets(2, 2, 10),
|
|
}, []string{"type", "class"}),
|
|
initializeTimeHistVec: prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
|
Namespace: metricsNamespace,
|
|
Subsystem: metricsWorkspaceSubsystem,
|
|
Name: "workspace_initialize_seconds",
|
|
Help: "time it took to initialize workspace",
|
|
Buckets: prometheus.ExponentialBuckets(2, 2, 10),
|
|
}, []string{"type", "class"}),
|
|
finalizeTimeHistVec: prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
|
Namespace: metricsNamespace,
|
|
Subsystem: metricsWorkspaceSubsystem,
|
|
Name: "workspace_finalize_seconds",
|
|
Help: "time it took to finalize workspace",
|
|
Buckets: prometheus.ExponentialBuckets(2, 2, 10),
|
|
}, []string{"type", "class"}),
|
|
totalStartsCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: metricsNamespace,
|
|
Subsystem: metricsWorkspaceSubsystem,
|
|
Name: "workspace_starts_total",
|
|
Help: "total number of workspaces started",
|
|
}, []string{"type", "class"}),
|
|
totalStartsFailureCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: metricsNamespace,
|
|
Subsystem: metricsWorkspaceSubsystem,
|
|
Name: "workspace_starts_failure_total",
|
|
Help: "total number of workspaces that failed to start",
|
|
}, []string{"type", "class"}),
|
|
totalStopsCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: metricsNamespace,
|
|
Subsystem: metricsWorkspaceSubsystem,
|
|
Name: "workspace_stops_total",
|
|
Help: "total number of workspaces stopped",
|
|
}, []string{"reason", "type", "class"}),
|
|
totalBackupCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: metricsNamespace,
|
|
Subsystem: metricsWorkspaceSubsystem,
|
|
Name: "workspace_backups_total",
|
|
Help: "total number of workspace backups",
|
|
}, []string{"type", "class"}),
|
|
totalBackupFailureCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: metricsNamespace,
|
|
Subsystem: metricsWorkspaceSubsystem,
|
|
Name: "workspace_backups_failure_total",
|
|
Help: "total number of workspace backup failures",
|
|
}, []string{"type", "class"}),
|
|
totalRestoreCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: metricsNamespace,
|
|
Subsystem: metricsWorkspaceSubsystem,
|
|
Name: "workspace_restores_total",
|
|
Help: "total number of workspace restores",
|
|
}, []string{"type", "class"}),
|
|
totalRestoreFailureCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: metricsNamespace,
|
|
Subsystem: metricsWorkspaceSubsystem,
|
|
Name: "workspace_restores_failure_total",
|
|
Help: "total number of workspace restore failures",
|
|
}, []string{"type", "class"}),
|
|
totalUnintentionalWorkspaceStopCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: metricsNamespace,
|
|
Subsystem: metricsWorkspaceSubsystem,
|
|
Name: "workspace_unintentional_stop_total",
|
|
Help: "total number of workspaces when container stopped without being deleted prior",
|
|
}, []string{"type", "class"}),
|
|
}
|
|
}
|
|
|
|
// Register registers all metrics ws-manager can export
|
|
func (m *metrics) Register(reg prometheus.Registerer) error {
|
|
collectors := []prometheus.Collector{
|
|
m.startupTimeHistVec,
|
|
m.initializeTimeHistVec,
|
|
m.finalizeTimeHistVec,
|
|
newPhaseTotalVec(m.reconciler),
|
|
newTimeoutSettingsVec(m.reconciler),
|
|
m.totalStartsCounterVec,
|
|
m.totalStartsFailureCounterVec,
|
|
m.totalStopsCounterVec,
|
|
m.totalBackupCounterVec,
|
|
m.totalBackupFailureCounterVec,
|
|
m.totalRestoreCounterVec,
|
|
m.totalRestoreFailureCounterVec,
|
|
m.totalUnintentionalWorkspaceStopCounterVec,
|
|
}
|
|
|
|
for _, c := range collectors {
|
|
err := reg.Register(c)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// phaseTotalVec returns a gauge vector counting the workspaces per phase
|
|
type phaseTotalVec struct {
|
|
name string
|
|
desc *prometheus.Desc
|
|
reconciler *WorkspaceReconciler
|
|
}
|
|
|
|
func newPhaseTotalVec(r *WorkspaceReconciler) *phaseTotalVec {
|
|
name := prometheus.BuildFQName(metricsNamespace, metricsWorkspaceSubsystem, "workspace_phase_total")
|
|
return &phaseTotalVec{
|
|
name: name,
|
|
desc: prometheus.NewDesc(name, "Current number of workspaces per phase", []string{"phase", "type", "class"}, prometheus.Labels(map[string]string{})),
|
|
reconciler: r,
|
|
}
|
|
}
|
|
|
|
// Describe implements Collector. It will send exactly one Desc to the provided channel.
|
|
func (ptv *phaseTotalVec) Describe(ch chan<- *prometheus.Desc) {
|
|
ch <- ptv.desc
|
|
}
|
|
|
|
// Collect implements Collector.
|
|
func (ptv *phaseTotalVec) Collect(ch chan<- prometheus.Metric) {
|
|
ctx, cancel := context.WithTimeout(context.Background(), kubernetesOperationTimeout)
|
|
defer cancel()
|
|
|
|
var workspaces workspacev1.WorkspaceList
|
|
err := ptv.reconciler.List(ctx, &workspaces, client.InNamespace(ptv.reconciler.Config.Namespace))
|
|
if err != nil {
|
|
log.WithError(err).Debugf("cannot list workspaces for %s gauge", ptv.name)
|
|
return
|
|
}
|
|
|
|
counts := make(map[string]int)
|
|
for _, ws := range workspaces.Items {
|
|
counts[string(ws.Spec.Type)+"::"+string(ws.Status.Phase)+"::"+ws.Spec.Class]++
|
|
}
|
|
|
|
for key, count := range counts {
|
|
segs := strings.Split(key, "::")
|
|
tpe, phase, class := segs[0], segs[1], segs[2]
|
|
|
|
metric, err := prometheus.NewConstMetric(ptv.desc, prometheus.GaugeValue, float64(count), phase, tpe, class)
|
|
if err != nil {
|
|
log.WithError(err).Warnf("cannot create workspace metric - %s will be inaccurate", ptv.name)
|
|
continue
|
|
}
|
|
|
|
ch <- metric
|
|
}
|
|
}
|
|
|
|
// timeoutSettingsVec provides a gauge of the currently active/inactive workspaces.
|
|
// Adding both up returns the total number of workspaces.
|
|
type timeoutSettingsVec struct {
|
|
name string
|
|
reconciler *WorkspaceReconciler
|
|
desc *prometheus.Desc
|
|
}
|
|
|
|
func newTimeoutSettingsVec(r *WorkspaceReconciler) *timeoutSettingsVec {
|
|
name := prometheus.BuildFQName("wsman", "workspace", "timeout_settings_total")
|
|
desc := prometheus.NewDesc(
|
|
name,
|
|
"Current number of workspaces per timeout setting",
|
|
[]string{"timeout"},
|
|
prometheus.Labels(map[string]string{}),
|
|
)
|
|
return &timeoutSettingsVec{
|
|
name: name,
|
|
reconciler: r,
|
|
desc: desc,
|
|
}
|
|
}
|
|
|
|
// Describe implements Collector. It will send exactly one Desc to the provided channel.
|
|
func (vec *timeoutSettingsVec) Describe(ch chan<- *prometheus.Desc) {
|
|
ch <- vec.desc
|
|
}
|
|
|
|
// Collect implements Collector.
|
|
func (tsv *timeoutSettingsVec) Collect(ch chan<- prometheus.Metric) {
|
|
ctx, cancel := context.WithTimeout(context.Background(), kubernetesOperationTimeout)
|
|
defer cancel()
|
|
|
|
var workspaces workspacev1.WorkspaceList
|
|
err := tsv.reconciler.List(ctx, &workspaces, client.InNamespace(tsv.reconciler.Config.Namespace))
|
|
if err != nil {
|
|
log.WithError(err).Debugf("cannot list workspaces for %s gauge", tsv.name)
|
|
return
|
|
}
|
|
|
|
timeouts := make(map[time.Duration]int)
|
|
for _, ws := range workspaces.Items {
|
|
if ws.Spec.Timeout.Time == nil {
|
|
continue
|
|
}
|
|
|
|
timeouts[ws.Spec.Timeout.Time.Duration]++
|
|
}
|
|
|
|
for phase, cnt := range timeouts {
|
|
// metrics cannot be re-used, we have to create them every single time
|
|
metric, err := prometheus.NewConstMetric(tsv.desc, prometheus.GaugeValue, float64(cnt), phase.String())
|
|
if err != nil {
|
|
log.WithError(err).Warnf("cannot create workspace metric - %s will be inaccurate", tsv.name)
|
|
continue
|
|
}
|
|
|
|
ch <- metric
|
|
}
|
|
}
|