2022-12-08 13:05:19 -03:00

113 lines
4.5 KiB
YAML

# Copyright (c) 2022 Gitpod GmbH. All rights reserved.
# Licensed under the GNU Affero General Public License (AGPL).
# See License.AGPL.txt in the project root for license information.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/name: kubernetes
app.kubernetes.io/part-of: kube-prometheus
prometheus: k8s
role: alert-rules
name: kubernetes-monitoring-rules
namespace: monitoring-satellite
spec:
groups:
- name: kubernetes
rules:
- alert: KubeDaemonSetNotScheduled
annotations:
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
summary: DaemonSet pods are not scheduled.
expr: |
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
-
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
for: 10m
labels:
severity: warning
team: platform
- alert: KubeJobNotCompleted
annotations:
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "43200" | humanizeDuration }} to complete.
summary: Job did not complete in time
expr: |
time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics"}
and
kube_job_status_active{job="kube-state-metrics"} > 0) > 43200
labels:
severity: warning
team: platform
- alert: KubeJobFailed
annotations:
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.
summary: Job failed to complete.
expr: |
kube_job_failed{job="kube-state-metrics"} > 0
for: 15m
labels:
severity: warning
team: platform
- alert: KubePersistentVolumeFillingUp
annotations:
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
summary: PersistentVolume is filling up.
expr: |
(
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
) < 0.03
and
kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"} > 0
unless on(namespace, persistentvolumeclaim)
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
unless on(namespace, persistentvolumeclaim)
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
for: 1m
labels:
severity: critical
team: platform
- alert: KubePersistentVolumeErrors
annotations:
description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
summary: PersistentVolume is having issues with provisioning.
expr: |
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
for: 5m
labels:
severity: critical
team: platform
- alert: KubeVersionMismatch
annotations:
description: There are {{ $value }} different semantic versions of Kubernetes components running.
summary: Different semantic versions of Kubernetes components running.
expr: |
count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
labels:
severity: warning
team: platform
- alert: KubeNodeNotReady
annotations:
description: '{{ $labels.node }} has been unready for more than 15 minutes.'
runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/KubeNodeNotReady.md
summary: Node is not ready.
expr: |
kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
for: 15m
labels:
severity: critical
team: platform
- alert: KubeletDown
annotations:
description: Kubelet has disappeared from Prometheus target discovery.
summary: Target disappeared from Prometheus target discovery.
expr: |
absent(up{job="kubelet", metrics_path="/metrics"} == 1)
for: 15m
labels:
severity: critical
team: platform