# ansible-repo/files/prometheus/alerts.yaml

---
# More example rules can be picked up from here:
# https://awesome-prometheus-alerts.grep.to/rules.html


groups:
 

  - name: standard

    rules:
    # Placeholder rule: it carries the shared `&anno` annotations anchor that
    # the other rules in this file merge in with `<<: *anno`.
    # NOTE(review): `up == -999` combined with `for: 999m` looks intentionally
    # unsatisfiable so that this rule itself never fires — confirm.
    - alert: _plchldr
      expr:  up == -999
      for:   999m
      labels:
        severity: info
      annotations: &anno
        # Queries ALERTS_FOR_STATE for the alert's job/instance with 36000
        # added, formats the first result with humanizeTimestamp and keeps the
        # first 19 characters of the formatted string.
        # NOTE(review): 36000 s = 10 h, presumably a UTC+10 display offset — confirm.
        alert_started_vl_time: "{{ with $b := printf `ALERTS_FOR_STATE{job=\"%s\",instance=\"%s\"} + 36000` $labels.job $labels.instance | query }}{{if $b}}{{ with $a := $b | first | value | humanizeTimestamp }}{{- slice $a 0 19 -}}{{end}}{{end}}{{end}}" 


    # Warning tier: the "active" state series of the jack_bot.service systemd
    # unit has been different from 1 for one minute.
    - alert: jackbot failed
      expr: 'node_systemd_unit_state{ name="jack_bot.service", state="active" } != 1'
      for: 1m
      labels:
        severity: warning
      annotations:
        # Pull in the shared annotations defined on the placeholder rule.
        <<: *anno
        summary: 'PIPISA IS DOWN!'
        description: 'Pipisa on {{ $labels.instance }} does not working!'

    # Critical tier of the jack_bot.service check: same expression as the
    # warning-tier rule of the same name, but it has to hold for 5 minutes.
    - alert: jackbot failed
      expr: node_systemd_unit_state{ name="jack_bot.service", state="active" } != 1
      for: 5m
      labels:
        # Was misspelled "cricical"; the other top-tier rules in this file use
        # "critical", so anything matching on that value would have missed
        # this alert.
        severity: critical
      annotations:
        <<: *anno
        summary: "PIPISA IS DOWN!"
        description: "Pipisa on {{ $labels.instance }} does not working!"


    ### Retired: it has served its purpose, there are no miners left.
    # - alert: MAINER JACK KURWA!!
    #   expr: node_load15 > 2
    #   for: 20m
    #   labels:
    #     severity: cricical
    #   annotations:
    #     <<: *anno
    #     summary: "It THAT shit again!"
    #     description: "Kill fucking mainer processes!"


    # Fires when the time elapsed since node_boot_time_seconds has stayed
    # below 3600 seconds for 5 minutes.
    - alert: Uptime
      expr: 'floor((time() - node_boot_time_seconds)) < 3600'
      for: 5m
      labels:
        severity: warning
      annotations:
        # Pull in the shared annotations defined on the placeholder rule.
        <<: *anno
        summary: 'Uptime less than 1 hour'
        description: 'Uptime on {{ $labels.instance }} is less than 1 hour'

    # Warning tier: the 5-minute load average has exceeded the value of
    # instance:node_cpus:count for 5 minutes.
    - alert: LoadAverage
      expr: '(node_load5{}) > ( instance:node_cpus:count{} )'
      for: 5m
      labels:
        severity: warning
      annotations:
        # Pull in the shared annotations defined on the placeholder rule.
        <<: *anno
        summary: 'High LoadAverage5'
        # Renders the host, its instance:node_cpus:count value and the current
        # node_load1 / node_load5 / node_load15 values, each looked up by the
        # alert's host and instance labels.
        description: |
          {{ $labels.host }} [{{ printf `instance:node_cpus:count{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} CPU] LA: {{ printf `node_load1{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load5{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load15{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }}

    # Critical tier: the 15-minute load average has exceeded the value of
    # instance:node_cpus:count for 5 minutes.
    - alert: LoadAverage
      expr: '(node_load15{}) > ( instance:node_cpus:count{} )'
      for: 5m
      labels:
        severity: critical
      annotations:
        # Pull in the shared annotations defined on the placeholder rule.
        <<: *anno
        summary: 'High LoadAverage15'
        # Renders the host, its instance:node_cpus:count value and the current
        # node_load1 / node_load5 / node_load15 values, each looked up by the
        # alert's host and instance labels.
        description: |
          {{ $labels.host }} [{{ printf `instance:node_cpus:count{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} CPU] LA: {{ printf `node_load1{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load5{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load15{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }}

    # Warning tier: MemAvailable as a percentage of MemTotal has been below
    # 10 for 10 minutes.
    - alert: RAM
      expr: 'node_memory_MemAvailable_bytes{ } / node_memory_MemTotal_bytes * 100 < 10'
      for: 10m
      labels:
        severity: warning
      annotations:
        # Pull in the shared annotations defined on the placeholder rule.
        <<: *anno
        summary: 'Low available memory'
        # Shows the alert value as a percentage plus the available and total
        # memory of the instance, formatted with humanize1024.
        description: |-
          Free RAM: {{ printf `%.2f` $value }}%  Свободно {{ printf `node_memory_MemAvailable_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }} из {{ printf `node_memory_MemTotal_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }}

    # Critical tier: MemAvailable as a percentage of MemTotal has been below
    # 5 for 10 minutes.
    - alert: RAM
      expr: 'node_memory_MemAvailable_bytes{ } / node_memory_MemTotal_bytes * 100 < 5'
      for: 10m
      labels:
        severity: critical
      annotations:
        # Pull in the shared annotations defined on the placeholder rule.
        <<: *anno
        summary: 'Low available memory'
        # Shows the alert value as a percentage plus the available and total
        # memory of the instance, formatted with humanize1024.
        description: |-
          Free RAM: {{ printf `%.2f` $value }}%  Свободно {{ printf `node_memory_MemAvailable_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }} из {{ printf `node_memory_MemTotal_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }}

    # Warning tier: fewer than 10% of a filesystem's inodes have been free
    # for 10 minutes. rootfs / fuse.lxcfs / squashfs filesystems and the
    # /boot, /boot/efi, /backup and /swap mountpoints are excluded.
    - alert: iNodes
      # Regex label matchers are fully anchored, so the EFI mountpoint must be
      # spelled with its leading slash ("/boot/efi"); the previous "boot/efi"
      # alternative could never match and the partition was not excluded.
      expr: (node_filesystem_files_free{fstype!~"rootfs|fuse.lxcfs|squashfs",mountpoint!~"/boot|/boot/efi|/backup|/swap"} / node_filesystem_files) * 100 < 10
      for: 10m
      labels:
        severity: warning
      annotations:
        <<: *anno
        summary: "[WARN] Low available inodes"
        description: "Available i-nodes: {{ printf `%.2f` $value }}%\n"

    # Critical tier: fewer than 5% of a filesystem's inodes have been free
    # for 10 minutes. rootfs / fuse.lxcfs / squashfs filesystems and the
    # /boot, /boot/efi, /backup and /swap mountpoints are excluded.
    - alert: iNodes
      # Regex label matchers are fully anchored, so the EFI mountpoint must be
      # spelled with its leading slash ("/boot/efi"); the previous "boot/efi"
      # alternative could never match and the partition was not excluded.
      expr: (node_filesystem_files_free{fstype!~"rootfs|fuse.lxcfs|squashfs",mountpoint!~"/boot|/boot/efi|/backup|/swap"} / node_filesystem_files) * 100 < 5
      for: 10m
      labels:
        severity: critical
      annotations:
        <<: *anno
        summary: "[CRIT] Host out of inodes"
        description: "Available i-nodes: {{ printf `%.2f` $value }}%\n"


    # Info-level alert: less than 10% of a filesystem's bytes have been
    # available for 5 minutes. rootfs / fuse.lxcfs / squashfs filesystems and
    # the /boot, /boot/efi, /backup and /swap mountpoints are excluded.
    - alert: DiskUsage
      # Regex label matchers are fully anchored, so the EFI mountpoint must be
      # spelled with its leading slash ("/boot/efi"); the previous "boot/efi"
      # alternative could never match and the partition was not excluded.
      expr: ( node_filesystem_avail_bytes{mountpoint!~"/boot|/boot/efi|/backup|/swap", fstype!~"rootfs|fuse.lxcfs|squashfs"}/ node_filesystem_size_bytes ) * 100 < 10
      for: 5m
      labels:
        severity: info
      annotations:
        <<: *anno
        summary: "Disk usage is more than 90%"
        # First line: device, mountpoint and available / total size (both
        # formatted with humanize1024); second line: the alert value as a
        # percentage.
        description: |
          {{ $labels.device }} ({{ $labels.mountpoint }}): {{ printf `node_filesystem_avail_bytes{mountpoint='%s', device='%s', instance='%s'}` .Labels.mountpoint .Labels.device .Labels.instance | query | first | value | humanize1024 }} / {{ printf `node_filesystem_size_bytes{mountpoint='%s', device='%s', instance='%s'}` .Labels.mountpoint .Labels.device .Labels.instance | query | first | value | humanize1024 }}
          Свободного места: {{ printf `%.2f` $value }}%
 
    # Critical escalation of DiskUsage: less than 10% of the filesystem is
    # available AND a linear prediction over the last hour says available
    # bytes drop below zero within 4 hours (4 * 3600 s), both for 5 minutes.
    #
    # Two fixes to the mountpoint exclusion list:
    #   * "/boot/efi" now has its leading slash. Regex label matchers are
    #     fully anchored, so the previous "boot/efi" alternative never matched.
    #   * "/swap" is excluded, matching the DiskUsage rule this one escalates;
    #     otherwise a /swap mount could go critical without ever raising the
    #     lower-tier alert.
    - alert: DiskUsagePredict
      expr: |
        (node_filesystem_avail_bytes{mountpoint!~"/boot|/boot/efi|/backup|/swap", fstype!~"rootfs|fuse.lxcfs|squashfs"}/ node_filesystem_size_bytes) * 100 < 10
        and
        predict_linear(node_filesystem_avail_bytes{fstype!~"rootfs|fuse.lxcfs|squashfs"}[1h], 4 * 3600) < 0
      for: 5m
      labels:
        severity: critical
      annotations:
        <<: *anno
        summary: "Disk usage is more than 90% and will fill soon"
        description: "{{ $labels.mountpoint }} usage is more than 90% and will fill soon on {{ $labels.instance }}"
               
  - name: Prometheus
    rules:
    # Fires immediately (for: 0m) when the per-second rate of
    # alertmanager_notifications_failed_total over the last minute is above 0.
    - alert: PrometheusAlertmanagerNotificationFailing
      expr: rate(alertmanager_notifications_failed_total[1m]) > 0
      for: 0m
      labels:
        # Was misspelled "cricical"; the other top-tier rules in this file use
        # "critical", so anything matching on that value would have missed
        # this alert.
        severity: critical
      annotations:
        <<: *anno
        summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
        description: "Alertmanager is failing sending notifications on {{ $labels.host }}"

    # Fires immediately (for: 0m) when prometheus_config_last_reload_successful
    # is anything other than 1.
    - alert: PrometheusConfigurationReloadFailure
      expr: 'prometheus_config_last_reload_successful != 1'
      for: 0m
      labels:
        severity: warning
      annotations:
        # Pull in the shared annotations defined on the placeholder rule.
        <<: *anno
        summary: 'Prometheus configuration reload failure (instance {{ $labels.instance }})'
        description: 'Prometheus configuration reload error on {{ $labels.host }}'

    # Fires immediately (for: 0m) when prometheus_sd_consul_rpc_failures_total
    # has increased at all over the last 15 minutes.
    - alert: PrometheusConsulServiceDiscoveryError
      expr: 'increase(prometheus_sd_consul_rpc_failures_total[15m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        # Pull in the shared annotations defined on the placeholder rule.
        <<: *anno
        summary: 'Prometheus consul_sd many failures (instance {{ $labels.instance }})'
        description: 'Prometheus consul_sd many failures on {{ $labels.host }}'
resolv.conf solved! 2025-02-06 02:20:33 +10:00			`---`
			`# можно набирать примеров отсюда`
			`# https://awesome-prometheus-alerts.grep.to/rules.html`


			`groups:`


			`- name: standard`

			`rules:`
			`- alert: _plchldr`
			`expr: up == -999`
			`for: 999m`
			`labels:`
			`severity: info`
			`annotations: &anno`
			alert_started_vl_time: "{{ with $b := printf `ALERTS_FOR_STATE{job=\"%s\",instance=\"%s\"} + 36000` $labels.job $labels.instance \| query }}{{if $b}}{{ with $a := $b \| first \| value \| humanizeTimestamp }}{{- slice $a 0 19 -}}{{end}}{{end}}{{end}}"


			`- alert: jackbot failed`
			`expr: node_systemd_unit_state{ name="jack_bot.service", state="active" } != 1`
			`for: 1m`
			`labels:`
			`severity: warning`
			`annotations:`
			`<<: *anno`
			`summary: "PIPISA IS DOWN!"`
			`description: "Pipisa on {{ $labels.instance }} does not working!"`

			`- alert: jackbot failed`
			`expr: node_systemd_unit_state{ name="jack_bot.service", state="active" } != 1`
			`for: 5m`
			`labels:`
			`severity: cricical`
			`annotations:`
			`<<: *anno`
			`summary: "PIPISA IS DOWN!"`
			`description: "Pipisa on {{ $labels.instance }} does not working!"`


			`### отслужило своё, майнеров больше нет.`
			`# - alert: MAINER JACK KURWA!!`
			`# expr: node_load15 > 2`
			`# for: 20m`
			`# labels:`
			`# severity: cricical`
			`# annotations:`
			`# <<: *anno`
			`# summary: "It THAT shit again!"`
			`# description: "Kill fucking mainer processes!"`



			`- alert: Uptime`
			`expr: floor((time() - node_boot_time_seconds)) < 3600`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`<<: *anno`
			`summary: "Uptime less than 1 hour"`
			`description: "Uptime on {{ $labels.instance }} is less than 1 hour"`

			`- alert: LoadAverage`
			`expr: (node_load5{}) > ( instance:node_cpus:count{} )`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`<<: *anno`
			`summary: "High LoadAverage5"`
			`description: \|`
			{{ $labels.host }} [{{ printf `instance:node_cpus:count{host='%s', instance='%s'}` .Labels.host .Labels.instance \| query \| first \| value }} CPU] LA: {{ printf `node_load1{host='%s', instance='%s'}` .Labels.host .Labels.instance \| query \| first \| value }} {{ printf `node_load5{host='%s', instance='%s'}` .Labels.host .Labels.instance \| query \| first \| value }} {{ printf `node_load15{host='%s', instance='%s'}` .Labels.host .Labels.instance \| query \| first \| value }}

			`- alert: LoadAverage`
			`expr: (node_load15{}) > ( instance:node_cpus:count{} )`
			`for: 5m`
			`labels:`
			`severity: critical`
			`annotations:`
			`<<: *anno`
			`summary: "High LoadAverage15"`
			`description: \|`
			{{ $labels.host }} [{{ printf `instance:node_cpus:count{host='%s', instance='%s'}` .Labels.host .Labels.instance \| query \| first \| value }} CPU] LA: {{ printf `node_load1{host='%s', instance='%s'}` .Labels.host .Labels.instance \| query \| first \| value }} {{ printf `node_load5{host='%s', instance='%s'}` .Labels.host .Labels.instance \| query \| first \| value }} {{ printf `node_load15{host='%s', instance='%s'}` .Labels.host .Labels.instance \| query \| first \| value }}

			`- alert: RAM`
			`expr: node_memory_MemAvailable_bytes{ } / node_memory_MemTotal_bytes * 100 < 10`
			`for: 10m`
			`labels:`
			`severity: warning`
			`annotations:`
			`<<: *anno`
			`summary: "Low available memory"`
			description: "Free RAM: {{ printf `%.2f` $value }}% Свободно {{ printf `node_memory_MemAvailable_bytes{instance='%s'}` .Labels.instance \| query \| first \| value \| humanize1024 }} из {{ printf `node_memory_MemTotal_bytes{instance='%s'}` .Labels.instance \| query \| first \| value \| humanize1024 }}"

			`- alert: RAM`
			`expr: node_memory_MemAvailable_bytes{ } / node_memory_MemTotal_bytes * 100 < 5`
			`for: 10m`
			`labels:`
			`severity: critical`
			`annotations:`
			`<<: *anno`
			`summary: "Low available memory"`
			description: "Free RAM: {{ printf `%.2f` $value }}% Свободно {{ printf `node_memory_MemAvailable_bytes{instance='%s'}` .Labels.instance \| query \| first \| value \| humanize1024 }} из {{ printf `node_memory_MemTotal_bytes{instance='%s'}` .Labels.instance \| query \| first \| value \| humanize1024 }}"

			`- alert: iNodes`
			`expr: (node_filesystem_files_free{fstype!~"rootfs\|fuse.lxcfs\|squashfs",mountpoint!~"/boot\|boot/efi\|/backup\|/swap"} / node_filesystem_files) * 100 < 10`
			`for: 10m`
			`labels:`
			`severity: warning`
			`annotations:`
			`<<: *anno`
			`summary: "[WARN] Low available inodes"`
			description: "Available i-nodes: {{ printf `%.2f` $value }}%\n"

			`- alert: iNodes`
			`expr: (node_filesystem_files_free{fstype!~"rootfs\|fuse.lxcfs\|squashfs",mountpoint!~"/boot\|boot/efi\|/backup\|/swap"} / node_filesystem_files) * 100 < 5`
			`for: 10m`
			`labels:`
			`severity: critical`
			`annotations:`
			`<<: *anno`
			`summary: "[CRIT] Host out of inodes"`
			description: "Available i-nodes: {{ printf `%.2f` $value }}%\n"


			`- alert: DiskUsage`
			`expr: ( node_filesystem_avail_bytes{mountpoint!~"/boot\|boot/efi\|/backup\|/swap", fstype!~"rootfs\|fuse.lxcfs\|squashfs"}/ node_filesystem_size_bytes ) * 100 < 10`
			`for: 5m`
			`labels:`
			`severity: info`
			`annotations:`
			`<<: *anno`
			`summary: "Disk usage is more than 90%"`
			`description: \|`
			{{ $labels.device }} ({{ $labels.mountpoint }}): {{ printf `node_filesystem_avail_bytes{mountpoint='%s', device='%s', instance='%s'}` .Labels.mountpoint .Labels.device .Labels.instance \| query \| first \| value \| humanize1024 }} / {{ printf `node_filesystem_size_bytes{mountpoint='%s', device='%s', instance='%s'}` .Labels.mountpoint .Labels.device .Labels.instance \| query \| first \| value \| humanize1024 }}
			Свободного места: {{ printf `%.2f` $value }}%

			`- alert: DiskUsagePredict`
			`expr: \|`
			`(node_filesystem_avail_bytes{mountpoint!~"/boot\|boot/efi\|/backup", fstype!~"rootfs\|fuse.lxcfs\|squashfs"}/ node_filesystem_size_bytes) * 100 < 10`
			`and`
			`predict_linear(node_filesystem_avail_bytes{fstype!~"rootfs\|fuse.lxcfs\|squashfs"}[1h], 4 * 3600) < 0`
			`for: 5m`
			`labels:`
			`severity: critical`
			`annotations:`
			`<<: *anno`
			`summary: "Disk usage is more than 90% and will fill soon"`
			`description: "{{ $labels.mountpoint }} usage is more than 90% and will fill soon on {{ $labels.instance }}"`

			`- name: Prometheus`
			`rules:`
			`- alert: PrometheusAlertmanagerNotificationFailing`
			`expr: rate(alertmanager_notifications_failed_total[1m]) > 0`
			`for: 0m`
			`labels:`
			`severity: cricical`
			`annotations:`
			`<<: *anno`
			`summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})`
			`description: "Alertmanager is failing sending notifications on {{ $labels.host }}"`

			`- alert: PrometheusConfigurationReloadFailure`
			`expr: prometheus_config_last_reload_successful != 1`
			`for: 0m`
			`labels:`
			`severity: warning`
			`annotations:`
			`<<: *anno`
			`summary: Prometheus configuration reload failure (instance {{ $labels.instance }})`
			`description: "Prometheus configuration reload error on {{ $labels.host }}"`

			`- alert: PrometheusConsulServiceDiscoveryError`
			`expr: increase(prometheus_sd_consul_rpc_failures_total[15m]) > 0`
			`for: 0m`
			`labels:`
			`severity: critical`
			`annotations:`
			`<<: *anno`
			`summary: Prometheus consul_sd many failures (instance {{ $labels.instance }})`
			`description: "Prometheus consul_sd many failures on {{ $labels.host }}"`