188 lines
7.8 KiB
YAML
Raw Permalink Normal View History

2025-02-06 02:20:33 +10:00
---
# More example rules can be borrowed from here:
# https://awesome-prometheus-alerts.grep.to/rules.html
groups:
- name: standard
rules:
# Placeholder rule: it carries the shared `&anno` annotations anchor that the
# other rules in this file pull in with `<<: *anno`.
- alert: _plchldr
  # Presumably never true (`up` compared against -999), so the rule itself
  # should stay inactive — confirm.
  expr: 'up == -999'
  for: 999m
  labels:
    severity: info
  annotations: &anno
    # Queries ALERTS_FOR_STATE for the alert's job/instance, adds 36000 s
    # (10 h — presumably a UTC+10 local-time shift, confirm), renders the
    # first result with humanizeTimestamp and keeps its first 19 characters.
    # NOTE(review): the selector filters only on job and instance, so `first`
    # may pick the state of a different alert on the same instance — verify.
    alert_started_vl_time: '{{ with $b := printf `ALERTS_FOR_STATE{job="%s",instance="%s"} + 36000` $labels.job $labels.instance | query }}{{if $b}}{{ with $a := $b | first | value | humanizeTimestamp }}{{- slice $a 0 19 -}}{{end}}{{end}}{{end}}'
# Warning tier: the `state="active"` series of the jack_bot.service systemd
# unit has been different from 1 for one minute.
- alert: jackbot failed
  expr: 'node_systemd_unit_state{ name="jack_bot.service", state="active" } != 1'
  for: 1m
  labels:
    severity: warning
  annotations:
    # Merge in the shared annotations from the `&anno` anchor.
    <<: *anno
    summary: 'PIPISA IS DOWN!'
    description: 'Pipisa on {{ $labels.instance }} does not working!'
# Critical tier: the `state="active"` series of the jack_bot.service systemd
# unit has been different from 1 for five minutes.
- alert: jackbot failed
  expr: node_systemd_unit_state{ name="jack_bot.service", state="active" } != 1
  for: 5m
  labels:
    # Was misspelled "cricical"; every other critical rule in this file uses
    # "critical", so anything matching on that label value missed this alert.
    severity: critical
  annotations:
    # Merge in the shared annotations from the `&anno` anchor.
    <<: *anno
    summary: "PIPISA IS DOWN!"
    description: "Pipisa on {{ $labels.instance }} does not working!"
### Retired: this rule has served its purpose, the miners are gone.
# - alert: MAINER JACK KURWA!!
# expr: node_load15 > 2
# for: 20m
# labels:
# severity: critical
# annotations:
# <<: *anno
# summary: "It THAT shit again!"
# description: "Kill fucking mainer processes!"
# Fires when the time elapsed since boot (time() - node_boot_time_seconds,
# floored) has been below 3600 seconds for five minutes.
- alert: Uptime
  expr: 'floor((time() - node_boot_time_seconds)) < 3600'
  for: 5m
  labels:
    severity: warning
  annotations:
    # Merge in the shared annotations from the `&anno` anchor.
    <<: *anno
    summary: 'Uptime less than 1 hour'
    description: 'Uptime on {{ $labels.instance }} is less than 1 hour'
# Warning tier: the 5-minute load average has exceeded the value of
# instance:node_cpus:count for five minutes.
- alert: LoadAverage
  expr: '(node_load5{}) > ( instance:node_cpus:count{} )'
  for: 5m
  labels:
    severity: warning
  annotations:
    # Merge in the shared annotations from the `&anno` anchor.
    <<: *anno
    summary: 'High LoadAverage5'
    # Renders "<host> [<cpu count> CPU] LA: <load1> <load5> <load15>" by
    # re-querying each series for the alert's host/instance labels.
    description: |
      {{ $labels.host }} [{{ printf `instance:node_cpus:count{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} CPU] LA: {{ printf `node_load1{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load5{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load15{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }}
# Critical tier: the 15-minute load average has exceeded the value of
# instance:node_cpus:count for five minutes.
- alert: LoadAverage
  expr: '(node_load15{}) > ( instance:node_cpus:count{} )'
  for: 5m
  labels:
    severity: critical
  annotations:
    # Merge in the shared annotations from the `&anno` anchor.
    <<: *anno
    summary: 'High LoadAverage15'
    # Renders "<host> [<cpu count> CPU] LA: <load1> <load5> <load15>" by
    # re-querying each series for the alert's host/instance labels.
    description: |
      {{ $labels.host }} [{{ printf `instance:node_cpus:count{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} CPU] LA: {{ printf `node_load1{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load5{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load15{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }}
# Warning tier: MemAvailable as a percentage of MemTotal has been below 10
# for ten minutes.
- alert: RAM
  expr: 'node_memory_MemAvailable_bytes{ } / node_memory_MemTotal_bytes * 100 < 10'
  for: 10m
  labels:
    severity: warning
  annotations:
    # Merge in the shared annotations from the `&anno` anchor.
    <<: *anno
    summary: 'Low available memory'
    # Shows the percentage from the expression plus available/total bytes
    # (humanize1024) re-queried for the alert's instance. The Russian words
    # in the message mean "Free ... of ...".
    description: "Free RAM: {{ printf `%.2f` $value }}% Свободно {{ printf `node_memory_MemAvailable_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }} из {{ printf `node_memory_MemTotal_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }}"
# Critical tier: MemAvailable as a percentage of MemTotal has been below 5
# for ten minutes.
- alert: RAM
  expr: 'node_memory_MemAvailable_bytes{ } / node_memory_MemTotal_bytes * 100 < 5'
  for: 10m
  labels:
    severity: critical
  annotations:
    # Merge in the shared annotations from the `&anno` anchor.
    <<: *anno
    summary: 'Low available memory'
    # Shows the percentage from the expression plus available/total bytes
    # (humanize1024) re-queried for the alert's instance. The Russian words
    # in the message mean "Free ... of ...".
    description: "Free RAM: {{ printf `%.2f` $value }}% Свободно {{ printf `node_memory_MemAvailable_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }} из {{ printf `node_memory_MemTotal_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }}"
# Warning tier: free inodes as a percentage of total inodes have been below
# 10 for ten minutes, on filesystems not excluded by the matchers.
- alert: iNodes
  # Prometheus regex matchers are fully anchored, so the former alternative
  # `boot/efi` (no leading slash) could never match the `/boot/efi` mountpoint
  # and that partition was not excluded. It is now written as `/boot/efi`.
  expr: (node_filesystem_files_free{fstype!~"rootfs|fuse.lxcfs|squashfs",mountpoint!~"/boot|/boot/efi|/backup|/swap"} / node_filesystem_files) * 100 < 10
  for: 10m
  labels:
    severity: warning
  annotations:
    # Merge in the shared annotations from the `&anno` anchor.
    <<: *anno
    summary: "[WARN] Low available inodes"
    description: "Available i-nodes: {{ printf `%.2f` $value }}%\n"
# Critical tier: free inodes as a percentage of total inodes have been below
# 5 for ten minutes, on filesystems not excluded by the matchers.
- alert: iNodes
  # Prometheus regex matchers are fully anchored, so the former alternative
  # `boot/efi` (no leading slash) could never match the `/boot/efi` mountpoint
  # and that partition was not excluded. It is now written as `/boot/efi`.
  expr: (node_filesystem_files_free{fstype!~"rootfs|fuse.lxcfs|squashfs",mountpoint!~"/boot|/boot/efi|/backup|/swap"} / node_filesystem_files) * 100 < 5
  for: 10m
  labels:
    severity: critical
  annotations:
    # Merge in the shared annotations from the `&anno` anchor.
    <<: *anno
    summary: "[CRIT] Host out of inodes"
    description: "Available i-nodes: {{ printf `%.2f` $value }}%\n"
# Informational: available bytes as a percentage of filesystem size have been
# below 10 for five minutes, on filesystems not excluded by the matchers.
- alert: DiskUsage
  # Prometheus regex matchers are fully anchored, so the former alternative
  # `boot/efi` (no leading slash) could never match the `/boot/efi` mountpoint
  # and that partition was not excluded. It is now written as `/boot/efi`.
  expr: ( node_filesystem_avail_bytes{mountpoint!~"/boot|/boot/efi|/backup|/swap", fstype!~"rootfs|fuse.lxcfs|squashfs"}/ node_filesystem_size_bytes ) * 100 < 10
  for: 5m
  labels:
    severity: info
  annotations:
    # Merge in the shared annotations from the `&anno` anchor.
    <<: *anno
    summary: "Disk usage is more than 90%"
    # First line: "<device> (<mountpoint>): <available> / <size>", both
    # re-queried for the alert's labels and formatted with humanize1024.
    # Second line (Russian for "Free space"): the percentage from the expression.
    description: |
      {{ $labels.device }} ({{ $labels.mountpoint }}): {{ printf `node_filesystem_avail_bytes{mountpoint='%s', device='%s', instance='%s'}` .Labels.mountpoint .Labels.device .Labels.instance | query | first | value | humanize1024 }} / {{ printf `node_filesystem_size_bytes{mountpoint='%s', device='%s', instance='%s'}` .Labels.mountpoint .Labels.device .Labels.instance | query | first | value | humanize1024 }}
      Свободного места: {{ printf `%.2f` $value }}%
# Critical: less than 10% of the filesystem is available AND a linear
# extrapolation of the last hour of node_filesystem_avail_bytes, projected
# 4 * 3600 seconds ahead, is below zero. Both must hold for five minutes.
- alert: DiskUsagePredict
  # Prometheus regex matchers are fully anchored, so the former alternative
  # `boot/efi` (no leading slash) could never match the `/boot/efi` mountpoint
  # and that partition was not excluded. It is now written as `/boot/efi`.
  # NOTE(review): unlike the DiskUsage and iNodes rules, `/swap` is not in the
  # mountpoint exclusion list here — confirm whether that is intentional.
  expr: |
    (node_filesystem_avail_bytes{mountpoint!~"/boot|/boot/efi|/backup", fstype!~"rootfs|fuse.lxcfs|squashfs"}/ node_filesystem_size_bytes) * 100 < 10
    and
    predict_linear(node_filesystem_avail_bytes{fstype!~"rootfs|fuse.lxcfs|squashfs"}[1h], 4 * 3600) < 0
  for: 5m
  labels:
    severity: critical
  annotations:
    # Merge in the shared annotations from the `&anno` anchor.
    <<: *anno
    summary: "Disk usage is more than 90% and will fill soon"
    description: "{{ $labels.mountpoint }} usage is more than 90% and will fill soon on {{ $labels.instance }}"
- name: Prometheus
rules:
# Fires immediately (`for: 0m`) when the one-minute rate of
# alertmanager_notifications_failed_total is above zero.
- alert: PrometheusAlertmanagerNotificationFailing
  expr: rate(alertmanager_notifications_failed_total[1m]) > 0
  for: 0m
  labels:
    # Was misspelled "cricical"; every other critical rule in this file uses
    # "critical", so anything matching on that label value missed this alert.
    severity: critical
  annotations:
    # Merge in the shared annotations from the `&anno` anchor.
    <<: *anno
    summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
    description: "Alertmanager is failing sending notifications on {{ $labels.host }}"
# Fires immediately (`for: 0m`) when prometheus_config_last_reload_successful
# is anything other than 1.
- alert: PrometheusConfigurationReloadFailure
  expr: 'prometheus_config_last_reload_successful != 1'
  for: 0m
  labels:
    severity: warning
  annotations:
    # Merge in the shared annotations from the `&anno` anchor.
    <<: *anno
    summary: 'Prometheus configuration reload failure (instance {{ $labels.instance }})'
    description: 'Prometheus configuration reload error on {{ $labels.host }}'
# Fires immediately (`for: 0m`) when prometheus_sd_consul_rpc_failures_total
# has increased over the last 15 minutes.
- alert: PrometheusConsulServiceDiscoveryError
  expr: 'increase(prometheus_sd_consul_rpc_failures_total[15m]) > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    # Merge in the shared annotations from the `&anno` anchor.
    <<: *anno
    summary: 'Prometheus consul_sd many failures (instance {{ $labels.instance }})'
    description: 'Prometheus consul_sd many failures on {{ $labels.host }}'