# Copyright (c) 2022 Gitpod GmbH. All rights reserved. # Licensed under the GNU Affero General Public License (AGPL). # See License-AGPL.txt in the project root for license information. apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: labels: app.kubernetes.io/name: kubernetes app.kubernetes.io/part-of: kube-prometheus prometheus: k8s role: alert-rules name: kubernetes-monitoring-rules namespace: monitoring-satellite spec: groups: - name: kubernetes rules: - alert: KubeDaemonSetNotScheduled annotations: description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.' summary: DaemonSet pods are not scheduled. expr: | kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 for: 10m labels: severity: warning team: platform - alert: KubeJobNotCompleted annotations: description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "43200" | humanizeDuration }} to complete. summary: Job did not complete in time expr: | time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics"} and kube_job_status_active{job="kube-state-metrics"} > 0) > 43200 labels: severity: warning team: platform - alert: KubeJobFailed annotations: description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert. summary: Job failed to complete. expr: | kube_job_failed{job="kube-state-metrics"} > 0 for: 15m labels: severity: warning team: platform - alert: KubePersistentVolumeFillingUp annotations: description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free. summary: PersistentVolume is filling up. expr: | ( kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} ) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"} > 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 for: 1m labels: severity: critical team: platform - alert: KubePersistentVolumeErrors annotations: description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. summary: PersistentVolume is having issues with provisioning. expr: | kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 for: 5m labels: severity: critical team: platform - alert: KubeVersionMismatch annotations: description: There are {{ $value }} different semantic versions of Kubernetes components running. summary: Different semantic versions of Kubernetes components running. expr: | count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1 for: 15m labels: severity: warning team: platform - alert: KubeNodeNotReady annotations: description: '{{ $labels.node }} has been unready for more than 15 minutes.' runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/KubeNodeNotReady.md summary: Node is not ready. expr: | kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 for: 15m labels: severity: critical team: platform - alert: KubeletDown annotations: description: Kubelet has disappeared from Prometheus target discovery. summary: Target disappeared from Prometheus target discovery. expr: | absent(up{job="kubelet", metrics_path="/metrics"} == 1) for: 15m labels: severity: critical team: platform