---
# More example alerting rules can be taken from here:
# https://awesome-prometheus-alerts.grep.to/rules.html


groups:
  # Rule group "standard".
  - name: standard
    rules:
    # Placeholder rule that carries the shared `&anno` annotations anchor;
    # the other rules in this file merge it in with `<<: *anno`.
    # NOTE(review): `up == -999` presumably never matches, so this rule is
    # not expected to fire itself - confirm.
    - alert: _plchldr
      expr: up == -999
      for: 999m
      labels:
        severity: info
      annotations: &anno
        # Queries ALERTS_FOR_STATE for the alert's job/instance, adds 36000
        # to the value, formats the first result with humanizeTimestamp and
        # keeps its first 19 characters. Renders nothing when the query
        # result is empty.
        # NOTE(review): 36000 s = 10 h, presumably a UTC+10 local-time
        # shift - confirm. The selector has no alertname matcher, so with
        # several alerts on one instance the first series returned is used.
        alert_started_vl_time: '{{ with $b := printf `ALERTS_FOR_STATE{job="%s",instance="%s"} + 36000` $labels.job $labels.instance | query }}{{if $b}}{{ with $a := $b | first | value | humanizeTimestamp }}{{- slice $a 0 19 -}}{{end}}{{end}}{{end}}'


    # Warning tier: the `active` state series of jack_bot.service has been
    # different from 1 for one minute.
    - alert: jackbot failed
      expr: 'node_systemd_unit_state{ name="jack_bot.service", state="active" } != 1'
      for: 1m
      labels:
        severity: warning
      annotations:
        # Shared annotations (alert start time) from the `anno` anchor.
        <<: *anno
        summary: 'PIPISA IS DOWN!'
        description: 'Pipisa on {{ $labels.instance }} does not working!'

    # Critical tier of the jack_bot.service alert: same expression as the
    # warning rule, but the condition must hold for five minutes.
    - alert: jackbot failed
      expr: node_systemd_unit_state{ name="jack_bot.service", state="active" } != 1
      for: 5m
      labels:
        # Was misspelled "cricical", which would not match anything that
        # selects on severity="critical".
        severity: critical
      annotations:
        # Shared annotations (alert start time) from the `anno` anchor.
        <<: *anno
        summary: "PIPISA IS DOWN!"
        description: "Pipisa on {{ $labels.instance }} does not working!"


    ### Retired: this rule has served its purpose, there are no miners anymore.
    # - alert: MAINER JACK KURWA!!
    #   expr: node_load15 > 2
    #   for: 20m
    #   labels:
    #     severity: critical
    #   annotations:
    #     <<: *anno
    #     summary: "It THAT shit again!"
    #     description: "Kill fucking mainer processes!"



    # Fires when the time elapsed since node_boot_time_seconds has stayed
    # below 3600 seconds for five minutes.
    - alert: Uptime
      expr: 'floor((time() - node_boot_time_seconds)) < 3600'
      for: 5m
      labels:
        severity: warning
      annotations:
        # Shared annotations (alert start time) from the `anno` anchor.
        <<: *anno
        summary: 'Uptime less than 1 hour'
        description: 'Uptime on {{ $labels.instance }} is less than 1 hour'

    # Warning tier: node_load5 has been above instance:node_cpus:count for
    # five minutes.
    # NOTE(review): instance:node_cpus:count is presumably a recording rule
    # defined elsewhere - confirm it exists.
    - alert: LoadAverage
      expr: '(node_load5{}) > ( instance:node_cpus:count{} )'
      for: 5m
      labels:
        severity: warning
      annotations:
        # Shared annotations (alert start time) from the `anno` anchor.
        <<: *anno
        summary: 'High LoadAverage5'
        # Renders the host label, the CPU count and the current
        # node_load1 / node_load5 / node_load15 values via template queries.
        description: |
          {{ $labels.host }} [{{ printf `instance:node_cpus:count{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} CPU] LA: {{ printf `node_load1{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load5{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load15{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }}

    # Critical tier: node_load15 has been above instance:node_cpus:count for
    # five minutes.
    # NOTE(review): instance:node_cpus:count is presumably a recording rule
    # defined elsewhere - confirm it exists.
    - alert: LoadAverage
      expr: '(node_load15{}) > ( instance:node_cpus:count{} )'
      for: 5m
      labels:
        severity: critical
      annotations:
        # Shared annotations (alert start time) from the `anno` anchor.
        <<: *anno
        summary: 'High LoadAverage15'
        # Renders the host label, the CPU count and the current
        # node_load1 / node_load5 / node_load15 values via template queries.
        description: |
          {{ $labels.host }} [{{ printf `instance:node_cpus:count{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} CPU] LA: {{ printf `node_load1{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load5{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load15{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }}

    # Warning tier: MemAvailable as a percentage of MemTotal has been below
    # 10 for ten minutes.
    - alert: RAM
      expr: 'node_memory_MemAvailable_bytes{ } / node_memory_MemTotal_bytes * 100 < 10'
      for: 10m
      labels:
        severity: warning
      annotations:
        # Shared annotations (alert start time) from the `anno` anchor.
        <<: *anno
        summary: 'Low available memory'
        # Shows the alert value as a percentage plus the available and total
        # byte counts (humanize1024) queried for the alerting instance.
        description: |-
          Free RAM: {{ printf `%.2f` $value }}%  Свободно {{ printf `node_memory_MemAvailable_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }} из {{ printf `node_memory_MemTotal_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }}

    # Critical tier: MemAvailable as a percentage of MemTotal has been below
    # 5 for ten minutes.
    - alert: RAM
      expr: 'node_memory_MemAvailable_bytes{ } / node_memory_MemTotal_bytes * 100 < 5'
      for: 10m
      labels:
        severity: critical
      annotations:
        # Shared annotations (alert start time) from the `anno` anchor.
        <<: *anno
        summary: 'Low available memory'
        # Shows the alert value as a percentage plus the available and total
        # byte counts (humanize1024) queried for the alerting instance.
        description: |-
          Free RAM: {{ printf `%.2f` $value }}%  Свободно {{ printf `node_memory_MemAvailable_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }} из {{ printf `node_memory_MemTotal_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }}

    # Warning tier: free inodes as a percentage of total inodes have been
    # below 10 for ten minutes. Filesystems of type rootfs / fuse.lxcfs /
    # squashfs and the mountpoints /boot, /boot/efi, /backup and /swap are
    # excluded.
    # Prometheus regex matchers are fully anchored, so the EFI mountpoint
    # must be spelled "/boot/efi"; the former "boot/efi" never matched it.
    - alert: iNodes
      expr: (node_filesystem_files_free{fstype!~"rootfs|fuse.lxcfs|squashfs",mountpoint!~"/boot|/boot/efi|/backup|/swap"} / node_filesystem_files) * 100 < 10
      for: 10m
      labels:
        severity: warning
      annotations:
        # Shared annotations (alert start time) from the `anno` anchor.
        <<: *anno
        summary: "[WARN] Low available inodes"
        description: "Available i-nodes: {{ printf `%.2f` $value }}%\n"

    # Critical tier: free inodes as a percentage of total inodes have been
    # below 5 for ten minutes. Filesystems of type rootfs / fuse.lxcfs /
    # squashfs and the mountpoints /boot, /boot/efi, /backup and /swap are
    # excluded.
    # Prometheus regex matchers are fully anchored, so the EFI mountpoint
    # must be spelled "/boot/efi"; the former "boot/efi" never matched it.
    - alert: iNodes
      expr: (node_filesystem_files_free{fstype!~"rootfs|fuse.lxcfs|squashfs",mountpoint!~"/boot|/boot/efi|/backup|/swap"} / node_filesystem_files) * 100 < 5
      for: 10m
      labels:
        severity: critical
      annotations:
        # Shared annotations (alert start time) from the `anno` anchor.
        <<: *anno
        summary: "[CRIT] Host out of inodes"
        description: "Available i-nodes: {{ printf `%.2f` $value }}%\n"


    # Fires (severity info) when available bytes as a percentage of the
    # filesystem size have been below 10 for five minutes. Filesystems of
    # type rootfs / fuse.lxcfs / squashfs and the mountpoints /boot,
    # /boot/efi, /backup and /swap are excluded.
    # Prometheus regex matchers are fully anchored, so the EFI mountpoint
    # must be spelled "/boot/efi"; the former "boot/efi" never matched it.
    - alert: DiskUsage
      expr: ( node_filesystem_avail_bytes{mountpoint!~"/boot|/boot/efi|/backup|/swap", fstype!~"rootfs|fuse.lxcfs|squashfs"}/ node_filesystem_size_bytes ) * 100 < 10
      for: 5m
      labels:
        severity: info
      annotations:
        # Shared annotations (alert start time) from the `anno` anchor.
        <<: *anno
        summary: "Disk usage is more than 90%"
        # First line: device, mountpoint, available / total bytes
        # (humanize1024). Second line: the alert value as a percentage.
        description: |
          {{ $labels.device }} ({{ $labels.mountpoint }}): {{ printf `node_filesystem_avail_bytes{mountpoint='%s', device='%s', instance='%s'}` .Labels.mountpoint .Labels.device .Labels.instance | query | first | value | humanize1024 }} / {{ printf `node_filesystem_size_bytes{mountpoint='%s', device='%s', instance='%s'}` .Labels.mountpoint .Labels.device .Labels.instance | query | first | value | humanize1024 }}
          Свободного места: {{ printf `%.2f` $value }}%
 
    # Critical alert: available space is below 10% of the filesystem size
    # AND a linear prediction over the last hour puts available bytes below
    # zero within 4 * 3600 seconds; both must hold for five minutes.
    # Prometheus regex matchers are fully anchored, so the EFI mountpoint
    # must be spelled "/boot/efi"; the former "boot/efi" never matched it.
    # NOTE(review): unlike DiskUsage, /swap is not excluded here - confirm
    # whether that is intentional.
    - alert: DiskUsagePredict
      expr: |
        (node_filesystem_avail_bytes{mountpoint!~"/boot|/boot/efi|/backup", fstype!~"rootfs|fuse.lxcfs|squashfs"}/ node_filesystem_size_bytes) * 100 < 10
        and
        predict_linear(node_filesystem_avail_bytes{fstype!~"rootfs|fuse.lxcfs|squashfs"}[1h], 4 * 3600) < 0
      for: 5m
      labels:
        severity: critical
      annotations:
        # Shared annotations (alert start time) from the `anno` anchor.
        <<: *anno
        summary: "Disk usage is more than 90% and will fill soon"
        description: "{{ $labels.mountpoint }} usage is more than 90% and will fill soon on {{ $labels.instance }}"
               
  # Rule group "Prometheus".
  - name: Prometheus
    rules:
    # Fires immediately (for: 0m) when the per-second rate of
    # alertmanager_notifications_failed_total over the last minute is
    # above zero.
    - alert: PrometheusAlertmanagerNotificationFailing
      expr: rate(alertmanager_notifications_failed_total[1m]) > 0
      for: 0m
      labels:
        # Was misspelled "cricical", which would not match anything that
        # selects on severity="critical".
        severity: critical
      annotations:
        # Shared annotations (alert start time) from the `anno` anchor
        # defined earlier in this file.
        <<: *anno
        summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
        description: "Alertmanager is failing sending notifications on {{ $labels.host }}"

    # Fires immediately (for: 0m) when
    # prometheus_config_last_reload_successful is not 1.
    - alert: PrometheusConfigurationReloadFailure
      expr: 'prometheus_config_last_reload_successful != 1'
      for: 0m
      labels:
        severity: warning
      annotations:
        # Shared annotations (alert start time) from the `anno` anchor
        # defined earlier in this file.
        <<: *anno
        summary: 'Prometheus configuration reload failure (instance {{ $labels.instance }})'
        description: 'Prometheus configuration reload error on {{ $labels.host }}'

    # Fires immediately (for: 0m) when
    # prometheus_sd_consul_rpc_failures_total has increased over the last
    # 15 minutes.
    - alert: PrometheusConsulServiceDiscoveryError
      expr: 'increase(prometheus_sd_consul_rpc_failures_total[15m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        # Shared annotations (alert start time) from the `anno` anchor
        # defined earlier in this file.
        <<: *anno
        summary: 'Prometheus consul_sd many failures (instance {{ $labels.instance }})'
        description: 'Prometheus consul_sd many failures on {{ $labels.host }}'