# Mirror of https://github.com/gitpod-io/gitpod.git (synced 2025-12-08 17:36:30 +00:00).
# File: 113 lines, 4.5 KiB, YAML.
# Copyright (c) 2022 Gitpod GmbH. All rights reserved.
# Licensed under the GNU Affero General Public License (AGPL).
# See License.AGPL.txt in the project root for license information.

# PrometheusRule defining the "kubernetes" alert group: DaemonSet scheduling,
# Job completion/failure, PersistentVolume health, component version skew,
# node readiness and kubelet target availability. Every alert carries a
# `severity` label and `team: platform`.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/name: kubernetes
    app.kubernetes.io/part-of: kube-prometheus
    prometheus: k8s
    role: alert-rules
  name: kubernetes-monitoring-rules
  namespace: monitoring-satellite
spec:
  groups:
    - name: kubernetes
      rules:
        # Desired minus currently scheduled DaemonSet pods stays above 0 for 10m.
        - alert: KubeDaemonSetNotScheduled
          annotations:
            description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
            summary: DaemonSet pods are not scheduled.
          expr: |
            kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
              -
            kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
          for: 10m
          labels:
            severity: warning
            team: platform

        # A Job that still reports active pods has a start time more than 43200
        # seconds (12h) in the past. No `for:` hold period is set on this rule.
        - alert: KubeJobNotCompleted
          annotations:
            description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "43200" | humanizeDuration }} to complete.
            summary: Job did not complete in time
          expr: |
            time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics"}
              and
            kube_job_status_active{job="kube-state-metrics"} > 0) > 43200
          labels:
            severity: warning
            team: platform

        # kube_job_failed stays above 0 for 15m.
        - alert: KubeJobFailed
          annotations:
            description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.
            summary: Job failed to complete.
          expr: |
            kube_job_failed{job="kube-state-metrics"} > 0
          for: 15m
          labels:
            severity: critical
            team: platform

        # Available/capacity ratio is below 0.03 (under 3% free) on a volume
        # with used bytes > 0, held for 1m. PVCs whose access mode is
        # ReadOnlyMany, or that carry the label excluded_from_alerts="true",
        # are excluded via the two `unless` clauses.
        - alert: KubePersistentVolumeFillingUp
          annotations:
            description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
            summary: PersistentVolume is filling up.
          expr: |
            (
              kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
                /
              kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
            ) < 0.03
            and
            kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"} > 0
            unless on(namespace, persistentvolumeclaim)
            kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
            unless on(namespace, persistentvolumeclaim)
            kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
          for: 1m
          labels:
            severity: critical
            team: platform

        # A PersistentVolume reports phase Failed or Pending for 5m.
        - alert: KubePersistentVolumeErrors
          annotations:
            description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
            summary: PersistentVolume is having issues with provisioning.
          expr: |
            kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
          for: 5m
          labels:
            severity: critical
            team: platform

        # More than one distinct git_version per cluster for 15m, after
        # label_replace rewrites git_version to the regex's first capture group
        # and with jobs kube-dns/coredns excluded.
        # NOTE(review): the '.' between the two digit groups in the regex is
        # unescaped, so it matches any character - confirm this is intended.
        - alert: KubeVersionMismatch
          annotations:
            description: There are {{ $value }} different semantic versions of Kubernetes components running.
            summary: Different semantic versions of Kubernetes components running.
          expr: |
            count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
          for: 15m
          labels:
            severity: warning
            team: platform

        # The node's Ready condition series with status="true" equals 0 for 15m.
        - alert: KubeNodeNotReady
          annotations:
            description: '{{ $labels.node }} has been unready for more than 15 minutes.'
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/KubeNodeNotReady.md
            summary: Node is not ready.
          expr: |
            kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
          for: 15m
          labels:
            severity: critical
            team: platform

        # No kubelet target with up == 1 exists for 15m.
        - alert: KubeletDown
          annotations:
            description: Kubelet has disappeared from Prometheus target discovery.
            summary: Target disappeared from Prometheus target discovery.
          expr: |
            absent(up{job="kubelet", metrics_path="/metrics"} == 1)
          for: 15m
          labels:
            severity: critical
            team: platform