---
# Prometheus alerting rules.
# More example rules can be taken from:
# https://awesome-prometheus-alerts.grep.to/rules.html
groups:
  - name: standard
    rules:
      # Placeholder rule: `up == -999` is not expected to ever match. It exists
      # only to define the shared `&anno` annotations anchor that every other
      # rule merges in with `<<: *anno`.
      - alert: _plchldr
        expr: up == -999
        for: 999m
        labels:
          severity: info
        annotations: &anno
          # Looks up ALERTS_FOR_STATE for this job/instance, adds 36000 seconds
          # (NOTE(review): presumably a UTC+10 local-time offset - confirm),
          # formats it with humanizeTimestamp and keeps the first 19 characters.
          alert_started_vl_time: "{{ with $b := printf `ALERTS_FOR_STATE{job=\"%s\",instance=\"%s\"} + 36000` $labels.job $labels.instance | query }}{{if $b}}{{ with $a := $b | first | value | humanizeTimestamp }}{{- slice $a 0 19 -}}{{end}}{{end}}{{end}}"

      # jack_bot.service is not in the "active" state: warning after 1 minute.
      - alert: jackbot failed
        expr: node_systemd_unit_state{name="jack_bot.service", state="active"} != 1
        for: 1m
        labels:
          severity: warning
        annotations:
          <<: *anno
          summary: "PIPISA IS DOWN!"
          description: "Pipisa on {{ $labels.instance }} does not working!"

      # Same condition, escalated to critical after 5 minutes.
      - alert: jackbot failed
        expr: node_systemd_unit_state{name="jack_bot.service", state="active"} != 1
        for: 5m
        labels:
          severity: critical
        annotations:
          <<: *anno
          summary: "PIPISA IS DOWN!"
          description: "Pipisa on {{ $labels.instance }} does not working!"

      # Retired: there are no miners left, the rule is kept for reference only.
      # - alert: MAINER JACK KURWA!!
      #   expr: node_load15 > 2
      #   for: 20m
      #   labels:
      #     severity: critical
      #   annotations:
      #     <<: *anno
      #     summary: "It THAT shit again!"
      #     description: "Kill fucking mainer processes!"

      # Host booted less than an hour ago (i.e. it was recently restarted).
      - alert: Uptime
        expr: floor((time() - node_boot_time_seconds)) < 3600
        for: 5m
        labels:
          severity: warning
        annotations:
          <<: *anno
          summary: "Uptime less than 1 hour"
          description: "Uptime on {{ $labels.instance }} is less than 1 hour"

      # 5-minute load average exceeds the value of instance:node_cpus:count.
      - alert: LoadAverage
        expr: node_load5 > instance:node_cpus:count
        for: 5m
        labels:
          severity: warning
        annotations:
          <<: *anno
          summary: "High LoadAverage5"
          description: |
            {{ $labels.host }} [{{ printf `instance:node_cpus:count{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} CPU]
            LA: {{ printf `node_load1{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load5{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load15{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }}

      # 15-minute load average exceeds the value of instance:node_cpus:count.
      - alert: LoadAverage
        expr: node_load15 > instance:node_cpus:count
        for: 5m
        labels:
          severity: critical
        annotations:
          <<: *anno
          summary: "High LoadAverage15"
          description: |
            {{ $labels.host }} [{{ printf `instance:node_cpus:count{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} CPU]
            LA: {{ printf `node_load1{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load5{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load15{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }}

      # Less than 10% of memory is available.
      - alert: RAM
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 10m
        labels:
          severity: warning
        annotations:
          <<: *anno
          summary: "Low available memory"
          description: "Free RAM: {{ printf `%.2f` $value }}% Свободно {{ printf `node_memory_MemAvailable_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }} из {{ printf `node_memory_MemTotal_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }}"

      # Less than 5% of memory is available.
      - alert: RAM
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 5
        for: 10m
        labels:
          severity: critical
        annotations:
          <<: *anno
          summary: "Low available memory"
          description: "Free RAM: {{ printf `%.2f` $value }}% Свободно {{ printf `node_memory_MemAvailable_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }} из {{ printf `node_memory_MemTotal_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }}"

      # Less than 10% of inodes are free. Label regexes are fully anchored, so
      # every excluded mountpoint must be written as a complete path.
      - alert: iNodes
        expr: |
          (
            node_filesystem_files_free{
              fstype!~"rootfs|fuse.lxcfs|squashfs",
              mountpoint!~"/boot|/boot/efi|/backup|/swap"
            }
            / node_filesystem_files
          ) * 100 < 10
        for: 10m
        labels:
          severity: warning
        annotations:
          <<: *anno
          summary: "[WARN] Low available inodes"
          description: "Available i-nodes: {{ printf `%.2f` $value }}%\n"

      # Less than 5% of inodes are free.
      - alert: iNodes
        expr: |
          (
            node_filesystem_files_free{
              fstype!~"rootfs|fuse.lxcfs|squashfs",
              mountpoint!~"/boot|/boot/efi|/backup|/swap"
            }
            / node_filesystem_files
          ) * 100 < 5
        for: 10m
        labels:
          severity: critical
        annotations:
          <<: *anno
          summary: "[CRIT] Host out of inodes"
          description: "Available i-nodes: {{ printf `%.2f` $value }}%\n"

      # Less than 10% of the filesystem is available.
      - alert: DiskUsage
        expr: |
          (
            node_filesystem_avail_bytes{
              mountpoint!~"/boot|/boot/efi|/backup|/swap",
              fstype!~"rootfs|fuse.lxcfs|squashfs"
            }
            / node_filesystem_size_bytes
          ) * 100 < 10
        for: 5m
        labels:
          severity: info
        annotations:
          <<: *anno
          summary: "Disk usage is more than 90%"
          description: |
            {{ $labels.device }} ({{ $labels.mountpoint }}): {{ printf `node_filesystem_avail_bytes{mountpoint='%s', device='%s', instance='%s'}` .Labels.mountpoint .Labels.device .Labels.instance | query | first | value | humanize1024 }} / {{ printf `node_filesystem_size_bytes{mountpoint='%s', device='%s', instance='%s'}` .Labels.mountpoint .Labels.device .Labels.instance | query | first | value | humanize1024 }}
            Свободного места: {{ printf `%.2f` $value }}%

      # Less than 10% available AND the linear trend of the last hour predicts
      # the filesystem running out of space within 4 hours.
      - alert: DiskUsagePredict
        expr: |
          (
            node_filesystem_avail_bytes{
              mountpoint!~"/boot|/boot/efi|/backup|/swap",
              fstype!~"rootfs|fuse.lxcfs|squashfs"
            }
            / node_filesystem_size_bytes
          ) * 100 < 10
          and
          predict_linear(node_filesystem_avail_bytes{fstype!~"rootfs|fuse.lxcfs|squashfs"}[1h], 4 * 3600) < 0
        for: 5m
        labels:
          severity: critical
        annotations:
          <<: *anno
          summary: "Disk usage is more than 90% and will fill soon"
          description: "{{ $labels.mountpoint }} usage is more than 90% and will fill soon on {{ $labels.instance }}"

  - name: Prometheus
    rules:
      # Alertmanager reported at least one failed notification in the last minute.
      - alert: PrometheusAlertmanagerNotificationFailing
        expr: rate(alertmanager_notifications_failed_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          <<: *anno
          summary: "Prometheus AlertManager notification failing (instance {{ $labels.instance }})"
          description: "Alertmanager is failing sending notifications on {{ $labels.host }}"

      # The most recent Prometheus configuration reload did not succeed.
      - alert: PrometheusConfigurationReloadFailure
        expr: prometheus_config_last_reload_successful != 1
        for: 0m
        labels:
          severity: warning
        annotations:
          <<: *anno
          summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})"
          description: "Prometheus configuration reload error on {{ $labels.host }}"

      # Consul service-discovery RPC failures were recorded in the last 15 minutes.
      - alert: PrometheusConsulServiceDiscoveryError
        expr: increase(prometheus_sd_consul_rpc_failures_total[15m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          <<: *anno
          summary: "Prometheus consul_sd many failures (instance {{ $labels.instance }})"
          description: "Prometheus consul_sd many failures on {{ $labels.host }}"