resolv.conf solved!

This commit is contained in:
hogweed1 2025-02-06 02:20:33 +10:00
parent fe33aee2fe
commit 4ba0cadf07
31 changed files with 1296 additions and 530 deletions

View File

@ -18,7 +18,8 @@ bin_ansible_callbacks = True
host_key_checking = false
#vault_password_file = /etc/ansible/.vaulto
vault_password_file = /tmp/.vaulto
#vault_password_file = /tmp/.vaulto
vault_password_file = /usr/share/.vaulto
# callback_plugins = /etc/ansible/plugins/callback
# callback_whitelist = telegram
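
Side note: vault_password_file now points at /usr/share/.vaulto, so that file has to exist on the control node before any vaulted variables can be decrypted. A minimal bootstrap sketch, assuming the password is handed in via a VAULT_PASS environment variable (this play is not part of the commit):

---
- name: bootstrap the vault password file (hypothetical helper)
  hosts: localhost
  gather_facts: no
  become: yes
  tasks:
    - name: Put the vault password where ansible.cfg expects it
      ansible.builtin.copy:
        dest: /usr/share/.vaulto
        content: "{{ lookup('env', 'VAULT_PASS') }}"
        mode: '0600'   # keep the secret readable by the controller user only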

View File

@ -0,0 +1,5 @@
---
ansible_ssh_user: root
ansible_ssh_pass: admin
ansible_sudo_pass: admin
ansible_ssh_private_key_file: '/home/hogweed1/id25519.key'

View File

@ -2,4 +2,4 @@
ansible_ssh_user: hogweed1
ansible_ssh_pass: coloredhorses
ansible_sudo_pass: coloredhorses
ansible_ssh_private_key_file: '/home/hogweed1/id25519.key'
#ansible_ssh_private_key_file: '/home/hogweed1/id25519.key'

View File

@ -0,0 +1,5 @@
---
ansible_ssh_user: hogweed1
ansible_ssh_pass: coloredhorses
ansible_sudo_pass: coloredhorses
#ansible_ssh_private_key_file: '/home/hogweed1/id25519.key'
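
These connection vars keep ansible_ssh_pass and ansible_sudo_pass in plain text. With the vault password file wired up in ansible.cfg above, the same values could be stored encrypted instead; a hedged sketch of what that looks like, ciphertext elided (it would come from `ansible-vault encrypt_string --name ansible_ssh_pass`), and ansible_sudo_pass would get the same treatment:

ansible_ssh_pass: !vault |
  $ANSIBLE_VAULT;1.1;AES256
  <ciphertext produced by ansible-vault goes here>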

View File

@ -1,11 +1,24 @@
---
# all: # keys must be unique, i.e. only one 'hosts' per group
# hosts:
all: # keys must be unique, i.e. only one 'hosts' per group
hosts:
#k3s-rancher.guaranteedstruggle.host:
# #nexus.guaranteedstruggle.host:
# #printing-slut.guaranteedstruggle.host:
# harbor.guaranteedstruggle.host:
#192.168.0.26
#192.168.0.32:
lxc: # keys must be unique, i.e. only one 'hosts' per group
hosts:
### but it's a VM, wtf
harbor.guaranteedstruggle.host:
#harbor.guaranteedstruggle.host:
#etcd.guaranteedstruggle.host:
#prometheus.guaranteedstruggle.host:
# 192.168.0.240
#192.168.0.251
#192.168.0.40
#192.168.0.88
#192.168.0.52
#192.168.0.113
#recording-slut.guaranteedstruggle.host:

View File

@ -2,6 +2,7 @@
physical_machines:
hosts:
cyberbully.guaranteedstruggle.host:
#
gpu-slut.guaranteedstruggle.host:
children:
proxmoxes:
@ -20,11 +21,15 @@ semyons: # keys must be unique, i.e. only one 'hosts' per group
semyon-0x04.guaranteedstruggle.host:
semyon-0x05.guaranteedstruggle.host:
vms:
hosts:
#recording-slut.guaranteedstruggle.host:
#192.168.0.26
children:
printer:
kubernetes:
docker:
docker:
hosts:
swarm-node1.guaranteedstruggle.host:
@ -52,4 +57,15 @@ printer:
printing-slut.guaranteedstruggle.host:
#### TODO
# lxc:
lxc:
hosts:
### but it's a VM, wtf
#harbor.guaranteedstruggle.host:
#etcd.guaranteedstruggle.host:
prometheus.guaranteedstruggle.host:
recording-slut.guaranteedstruggle.host:
pg.just-for-me.internal:
grafana.just-for-me.internal:
price-loader.just-for-me.internal:

View File

@ -0,0 +1,21 @@
[Unit]
Description=Prometheus alertmanager
Wants=network-online.target
After=network-online.target
[Service]
User=prometheus
Group=prometheus
EnvironmentFile=-/etc/sysconfig/alertmanager
ExecStart=/usr/sbin/alertmanager \
--config.file=/etc/alertmanager/alertmanager.yaml \
--storage.path=/base/alertmanager \
--web.config.file=/etc/prometheus/web-config.yaml
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=always
[Install]
WantedBy=multi-user.target
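
This unit assumes /etc/alertmanager (with its templates/ subdirectory) and /base/alertmanager already exist and are owned by the prometheus user; the commit does not show the play that creates them. A hedged sketch in the style of the prometheus play further down (paths, task names, and the handler are assumptions):

    - name: Create alertmanager directories
      ansible.builtin.file:
        path: "{{ item }}"
        state: directory
        owner: prometheus
        group: prometheus
      loop:
        - /etc/alertmanager
        - /etc/alertmanager/templates
        - /base/alertmanager
    - name: Install the unit file
      ansible.builtin.copy:
        src: ../../files/alertmanager/alertmanager.service
        dest: /etc/systemd/system/alertmanager.service
      notify:
        - restart alertmanager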

View File

@ -0,0 +1,50 @@
global:
resolve_timeout: 5m
route:
group_by: [ 'alertname', 'job' ]
group_wait: 30s
group_interval: 5m
repeat_interval: 1h
receiver: what-went-wrong
# routes:
receivers:
# /dev/null receiver
- name: 'blackhole'
# the main config
- name: 'what-went-wrong'
telegram_configs:
- send_resolved: true
bot_token: '6472915685:AAHPvgrQoqG7DxtfbnHWPe3Lfild-CGJ1j8'
chat_id: -4023350326
message: '{{ template "teletempl" . }}'
api_url: https://api.telegram.org
parse_mode: HTML
# - name: 'vdk2ch'
# telegram_configs:
# - send_resolved: true
# bot_token: '5724991559:AAEuLvpLsgP6LHRGMSyFtQLlR5qPQUO4b_w'
# chat_id: -1001355646177
# message: '{{ template "teletempl" . }}'
# api_url: https://api.telegram.org
# parse_mode: HTML
# A list of inhibition rules.
#inhibit_rules:
templates:
- '/etc/alertmanager/templates/my.tmpl'
# A list of time intervals for muting/activating routes.
# time_intervals:
# - name: business_hours
# time_intervals:
# - weekdays: ['monday:friday']
# times:
# # Starts at 10:00 Asia/Vladivostok
# - start_time: '00:00'
# # Ends at 19:00 Asia/Vladivostok
# end_time: '09:00'
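
The 'blackhole' receiver is declared but nothing routes to it yet (the routes: key above is still commented out). A hedged sketch of how a noisy alert could be dropped through it; the matcher below is a made-up example, not part of the commit:

route:
  # ...existing group_by / receiver settings stay as they are...
  routes:
    - receiver: 'blackhole'
      matchers:
        - alertname="_plchldr"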

View File

@ -0,0 +1,33 @@
{{ define "teletempl" }}
<b>{{ .CommonLabels.alertname }} : </b>
{{- if eq .Status "firing" -}}
<b>{{ .Status | toUpper}} 🔥</b>
{{- end -}}
{{- if eq .Status "resolved" -}}
<b>{{ .Status | toUpper}} ✅</b>
{{- end -}}
{{ $alerts_count := len .Alerts }}
{{ if eq $alerts_count 1 -}} {{/* Single alert block */}}
{{ .CommonAnnotations.summary }}
Host: {{ .CommonLabels.host }}
Instance: {{ .CommonLabels.instance }}
Job: <b>{{ .CommonLabels.job }}</b>
Details:
{{ .CommonAnnotations.description }}
Alert started: [ {{ .CommonAnnotations.alert_started_vl_time }} ]
{{- else -}} {{/* Grouped alert block */}}
{{ .CommonAnnotations.summary }}
Job: <b>{{ .CommonLabels.job }}</b>
Instances:
{{- range .Alerts }}
{{ .Labels.instance }} [ {{ .Annotations.alert_started_vl_time }} ]
{{- end }}
{{ end }}
{{ end }}

View File

@ -0,0 +1,188 @@
---
# more examples can be picked up from here
# https://awesome-prometheus-alerts.grep.to/rules.html
groups:
- name: standard
rules:
- alert: _plchldr
expr: up == -999
for: 999m
labels:
severity: info
annotations: &anno
alert_started_vl_time: "{{ with $b := printf `ALERTS_FOR_STATE{job=\"%s\",instance=\"%s\"} + 36000` $labels.job $labels.instance | query }}{{if $b}}{{ with $a := $b | first | value | humanizeTimestamp }}{{- slice $a 0 19 -}}{{end}}{{end}}{{end}}"
- alert: jackbot failed
expr: node_systemd_unit_state{ name="jack_bot.service", state="active" } != 1
for: 1m
labels:
severity: warning
annotations:
<<: *anno
summary: "PIPISA IS DOWN!"
description: "Pipisa on {{ $labels.instance }} does not working!"
- alert: jackbot failed
expr: node_systemd_unit_state{ name="jack_bot.service", state="active" } != 1
for: 5m
labels:
severity: critical
annotations:
<<: *anno
summary: "PIPISA IS DOWN!"
description: "Pipisa on {{ $labels.instance }} does not working!"
### served its purpose, the miners are gone.
# - alert: MAINER JACK KURWA!!
# expr: node_load15 > 2
# for: 20m
# labels:
# severity: critical
# annotations:
# <<: *anno
# summary: "It THAT shit again!"
# description: "Kill fucking mainer processes!"
- alert: Uptime
expr: floor((time() - node_boot_time_seconds)) < 3600
for: 5m
labels:
severity: warning
annotations:
<<: *anno
summary: "Uptime less than 1 hour"
description: "Uptime on {{ $labels.instance }} is less than 1 hour"
- alert: LoadAverage
expr: (node_load5{}) > ( instance:node_cpus:count{} )
for: 5m
labels:
severity: warning
annotations:
<<: *anno
summary: "High LoadAverage5"
description: |
{{ $labels.host }} [{{ printf `instance:node_cpus:count{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} CPU] LA: {{ printf `node_load1{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load5{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load15{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }}
- alert: LoadAverage
expr: (node_load15{}) > ( instance:node_cpus:count{} )
for: 5m
labels:
severity: critical
annotations:
<<: *anno
summary: "High LoadAverage15"
description: |
{{ $labels.host }} [{{ printf `instance:node_cpus:count{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} CPU] LA: {{ printf `node_load1{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load5{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load15{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }}
- alert: RAM
expr: node_memory_MemAvailable_bytes{ } / node_memory_MemTotal_bytes * 100 < 10
for: 10m
labels:
severity: warning
annotations:
<<: *anno
summary: "Low available memory"
description: "Free RAM: {{ printf `%.2f` $value }}% Свободно {{ printf `node_memory_MemAvailable_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }} из {{ printf `node_memory_MemTotal_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }}"
- alert: RAM
expr: node_memory_MemAvailable_bytes{ } / node_memory_MemTotal_bytes * 100 < 5
for: 10m
labels:
severity: critical
annotations:
<<: *anno
summary: "Low available memory"
description: "Free RAM: {{ printf `%.2f` $value }}% Свободно {{ printf `node_memory_MemAvailable_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }} из {{ printf `node_memory_MemTotal_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }}"
- alert: iNodes
expr: (node_filesystem_files_free{fstype!~"rootfs|fuse.lxcfs|squashfs",mountpoint!~"/boot|boot/efi|/backup|/swap"} / node_filesystem_files) * 100 < 10
for: 10m
labels:
severity: warning
annotations:
<<: *anno
summary: "[WARN] Low available inodes"
description: "Available i-nodes: {{ printf `%.2f` $value }}%\n"
- alert: iNodes
expr: (node_filesystem_files_free{fstype!~"rootfs|fuse.lxcfs|squashfs",mountpoint!~"/boot|boot/efi|/backup|/swap"} / node_filesystem_files) * 100 < 5
for: 10m
labels:
severity: critical
annotations:
<<: *anno
summary: "[CRIT] Host out of inodes"
description: "Available i-nodes: {{ printf `%.2f` $value }}%\n"
- alert: DiskUsage
expr: ( node_filesystem_avail_bytes{mountpoint!~"/boot|boot/efi|/backup|/swap", fstype!~"rootfs|fuse.lxcfs|squashfs"}/ node_filesystem_size_bytes ) * 100 < 10
for: 5m
labels:
severity: info
annotations:
<<: *anno
summary: "Disk usage is more than 90%"
description: |
{{ $labels.device }} ({{ $labels.mountpoint }}): {{ printf `node_filesystem_avail_bytes{mountpoint='%s', device='%s', instance='%s'}` .Labels.mountpoint .Labels.device .Labels.instance | query | first | value | humanize1024 }} / {{ printf `node_filesystem_size_bytes{mountpoint='%s', device='%s', instance='%s'}` .Labels.mountpoint .Labels.device .Labels.instance | query | first | value | humanize1024 }}
Free space: {{ printf `%.2f` $value }}%
- alert: DiskUsagePredict
expr: |
(node_filesystem_avail_bytes{mountpoint!~"/boot|boot/efi|/backup", fstype!~"rootfs|fuse.lxcfs|squashfs"}/ node_filesystem_size_bytes) * 100 < 10
and
predict_linear(node_filesystem_avail_bytes{fstype!~"rootfs|fuse.lxcfs|squashfs"}[1h], 4 * 3600) < 0
for: 5m
labels:
severity: critical
annotations:
<<: *anno
summary: "Disk usage is more than 90% and will fill soon"
description: "{{ $labels.mountpoint }} usage is more than 90% and will fill soon on {{ $labels.instance }}"
- name: Prometheus
rules:
- alert: PrometheusAlertmanagerNotificationFailing
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
annotations:
<<: *anno
summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
description: "Alertmanager is failing sending notifications on {{ $labels.host }}"
- alert: PrometheusConfigurationReloadFailure
expr: prometheus_config_last_reload_successful != 1
for: 0m
labels:
severity: warning
annotations:
<<: *anno
summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
description: "Prometheus configuration reload error on {{ $labels.host }}"
- alert: PrometheusConsulServiceDiscoveryError
expr: increase(prometheus_sd_consul_rpc_failures_total[15m]) > 0
for: 0m
labels:
severity: critical
annotations:
<<: *anno
summary: Prometheus consul_sd many failures (instance {{ $labels.instance }})
description: "Prometheus consul_sd many failures on {{ $labels.host }}"

View File

@ -0,0 +1,30 @@
[Unit]
Description=Prometheus
Wants=network-online.target
After=network-online.target
[Service]
User=prometheus
Group=prometheus
Type=simple
Restart=always
OOMScoreAdjust=-1000
LimitNOFILE=16384
ExecStart=/usr/sbin/prometheus \
--config.file /etc/prometheus/prometheus.yaml \
--web.config.file=/etc/prometheus/web-config.yaml \
--storage.tsdb.path /prometheus-data/ \
--storage.tsdb.retention.time 180d \
--storage.tsdb.max-block-duration=2h \
--storage.tsdb.min-block-duration=2h \
--web.enable-remote-write-receiver \
--web.console.templates=/etc/prometheus/consoles \
--web.console.libraries=/etc/prometheus/console_libraries \
--web.enable-admin-api \
--query.max-samples=50000000
ExecReload=/usr/bin/kill -s HUP $MAINPID
ExecStop=/usr/bin/kill -s QUIT $MAINPID
[Install]
WantedBy=multi-user.target

View File

@ -0,0 +1,179 @@
# my global config
global:
scrape_interval: 10s # Set the scrape interval to every 10 seconds. The default is every 1 minute.
evaluation_interval: 60s # Evaluate rules every 60 seconds. The default is every 1 minute.
#external_labels:
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
alertmanagers:
- scheme: https
static_configs:
- targets: ['alertmanager.guaranteedstruggle.host']
# Writing data to remote long-term storage (VictoriaMetrics)
# remote_write:
# - url:
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- '/etc/prometheus/rules.yaml' # recording rules (instance:node_cpus:count) deployed by the prometheus playbook
- '/etc/prometheus/alerts.yaml'
- '/etc/prometheus/service_alerts/*.yaml'
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
#### TODO move exporter scraping into templates based on machine roles
- job_name: 'node-exporters'
scheme: http
static_configs:
- targets:
- 'semyon-0x01:9100'
- 'semyon-0x02:9100'
- 'semyon-0x03:9100'
- 'semyon-0x04:9100'
- 'semyon-0x05:9100'
- 'king-albert:9100'
- 'gpu-slut:9100'
# relabel_configs:
# - target_label: instance
# replacement: 'cyberbully:9100'
# - target_label: host
# replacement: cyberbully
- job_name: 'node-exporters-vms'
scheme: http
static_configs:
- targets:
- 'printing-slut:9100'
- 'swarm-node1:9100'
- 'swarm-node2:9100'
- 'swarm-node3:9100'
- 'harbor:9100'
- 'rke2-master1:9100'
- 'rke2-master2:9100'
- 'rke2-master3:9100'
- 'rke2-worker1:9100'
- 'rke2-worker2:9100'
- 'rke2-worker3:9100'
- 'rke2-worker4:9100'
- 'rke2-worker5:9100'
- 'k3s-rancher:9100'
- 'k3s-awx:9100'
# # - job_name: 'node-exporters-lxc'
# # scheme: http
# # static_configs:
# # - targets:
- job_name: 'impi-exporters'
scheme: http
static_configs:
- targets:
#- 'cyberbully:9290'
- 'king-albert:9290'
# - 'semyon-0x01:9290'
# - 'semyon-0x02:9290'
# - 'semyon-0x03:9290'
# - 'semyon-0x04:9290'
# - 'semyon-0x05:9290'
# - 'gpu-slut:9290'
# pipisa exporter
# # - job_name: 'vdk2ch-pipisa-exporter'
# # scheme: http
# # static_configs:
# # - targets:
# # - '192.168.0.55:9992'
# # relabel_configs:
# # - target_label: instance
# # replacement: 'cyberbully:9992'
# # - target_label: host
# # replacement: cyberbully
# pipisa exporter
# - job_name: 'vllm-exporter'
# scheme: http
# static_configs:
# - targets:
# - '192.168.0.4:8000'
# relabel_configs:
# - target_label: instance
# replacement: 'new-computer-home:8000'
# - target_label: host
# replacement: new-computer-home
#
# # - job_name: 'nginx-vts-metrics'
# # scheme: http
# # metrics_path: /status/format/prometheus
# # static_configs:
# # - targets:
# # - '192.168.0.55:9042'
# # relabel_configs:
# # - target_label: instance
# # replacement: 'cyberbully:9042'
# - target_label: host
# replacement: cyberbully
# windows exporter on top of the laptop, over the home wifi
# # - job_name: 'i-programmed-my-home-computer'
# # scheme: http
# # static_configs:
# # - targets:
# # - '192.168.0.2:9182'
# # - '192.168.0.3:9182'
# # relabel_configs:
# # - source_labels: [__address__]
# # regex: "(192.168.0.2.+)"
# # target_label: instance
# # replacement: 'Desktop-O50pt4s:9182'
# # - source_labels: [__address__]
# # regex: "(192.168.0.2.+)"
# # target_label: host
# # replacement: Desktop-O50pt4s
# # - source_labels: [__address__]
# # regex: "(192.168.0.3.+)"
# # target_label: instance
# # replacement: 'Desktop-edov3u5:9182'
# # - source_labels: [__address__]
# # regex: "(192.168.0.3.+)"
# # target_label: host
# # replacement: Desktop-edov3u5
#
# # - job_name: 'nvidia-gpu-metrics'
# # scheme: http
# # static_configs:
# # - targets:
# # - '192.168.0.2:9835'
# # relabel_configs:
# # - target_label: instance
# # replacement: 'Desktop-O50pt4s:9835'
# # # personal twitter
# # - job_name: 'pleroma'
# # metrics_path: /api/pleroma/app_metrics
# # scheme: https
# # static_configs:
# # - targets: ['social.vdk2ch.ru']
# the hypervisor
- job_name: 'proxmox'
metrics_path: /pve
static_configs:
- targets:
- 'king-albert.guaranteedstruggle.host:9221'
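
The alert descriptions reference {{ $labels.host }}, but the active node-exporter jobs only produce an instance label; host shows up only in the commented-out relabel blocks. A hedged relabel_configs sketch that derives host from the scrape address (regex and placement are assumptions, not part of this commit):

    relabel_configs:
      - source_labels: [__address__]
        regex: '([^:]+)(?::\d+)?'
        target_label: host
        replacement: '$1'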

View File

@ -0,0 +1,14 @@
groups:
- name: node-exporter-rules
rules:
# CPU count
- record: instance:node_cpus:count
expr: count(node_cpu_seconds_total{mode="idle"}) without (cpu,mode)
# taken from here
# https://stackoverflow.com/questions/52480567/count-alerts-fired-by-prometheus
- name: alerts
rules:
- record: ALERTS_FOR_STATE:firing
expr: ALERTS_FOR_STATE and ignoring(alertstate) ALERTS{alertstate="firing"}
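
The instance:node_cpus:count rule feeds the LoadAverage alerts, so it is worth a unit test with promtool. A hedged sketch of a test file (the file name tests.yaml and the sample series are assumptions), run with `promtool test rules tests.yaml`:

rule_files:
  - rules.yaml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      - series: 'node_cpu_seconds_total{mode="idle",cpu="0",instance="semyon-0x01:9100"}'
        values: '0+1x10'
      - series: 'node_cpu_seconds_total{mode="idle",cpu="1",instance="semyon-0x01:9100"}'
        values: '0+1x10'
    promql_expr_test:
      - expr: instance:node_cpus:count
        eval_time: 5m
        exp_samples:
          - labels: 'instance:node_cpus:count{instance="semyon-0x01:9100"}'
            value: 2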

View File

@ -0,0 +1,3 @@
# tls_server_config:
# cert_file: /etc/prometheus/ssl/ .crt
# key_file: /etc/prometheus/ssl/ .key
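
web-config.yaml is referenced by both the prometheus and alertmanager units but currently ships with everything commented out. If TLS or basic auth is wanted later, the exporter-toolkit format looks roughly like this; the cert paths and the bcrypt hash are placeholders:

tls_server_config:
  cert_file: /etc/prometheus/ssl/prometheus.crt
  key_file: /etc/prometheus/ssl/prometheus.key
basic_auth_users:
  # bcrypt hash, e.g. generated with: htpasswd -nB hogweed1
  hogweed1: <bcrypt hash goes here>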

View File

@ -0,0 +1,6 @@
#### TODO both as roles - packages and users
---
- import_playbook: packages.yml
- import_playbook: resolvconf.yml
- import_playbook: users.yml
- import_playbook: exporters.yml

View File

@ -1,6 +1,6 @@
---
- name: node exporter!
hosts: all
hosts: all:!lxc
gather_facts: yes
become: yes
roles:
@ -9,7 +9,7 @@
- prometheus.prometheus.node_exporter
#node_exporter_local_cache_path: "/tmp/node_exporter_cache"
- name: for hardware monitoring
hosts: physical_machines
hosts: king-albert.guaranteedstruggle.host
gather_facts: yes
become: yes
roles:

View File

@ -14,6 +14,10 @@
- net-tools
- vim
- sudo
- tree
- jq
- rsync
#state: latest
state: present

View File

@ -4,7 +4,7 @@
# remote_user: root
gather_facts: no
become: yes
become: no # yes
tasks:
- name: pingu!
ansible.builtin.ping:

playbooks/resolvconf.yml
View File

@ -0,0 +1,65 @@
---
- name: make resolv.conf work fine
hosts: all
become: yes
tasks:
- name: Install required packages
ansible.builtin.package:
name:
- systemd-resolved
state: present
- name: Write systemd-resolved config
register: systemd_resolved_conf
copy:
dest: "/etc/systemd/resolved.conf"
content: |
# This file is part of systemd.
#
# systemd is free software; you can redistribute it and/or modify it under the
# terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 2.1 of the License, or (at your option)
# any later version.
#
# Entries in this file show the compile time defaults. Local configuration
# should be created by either modifying this file, or by creating "drop-ins" in
# the resolved.conf.d/ subdirectory. The latter is generally recommended.
# Defaults can be restored by simply deleting this file and all drop-ins.
#
# Use 'systemd-analyze cat-config systemd/resolved.conf' to display the full config.
# See resolved.conf(5) for details.
[Resolve]
# Some examples of DNS servers which may be used for DNS= and FallbackDNS=:
# Cloudflare: 1.1.1.1#cloudflare-dns.com 1.0.0.1#cloudflare-dns.com 2606:4700:4700::1111#cloudflare-dns.com 2606:4700:4700::1001#cloudflare-dns.com
# Google: 8.8.8.8#dns.google 8.8.4.4#dns.google 2001:4860:4860::8888#dns.google 2001:4860:4860::8844#dns.google
# Quad9: 9.9.9.9#dns.quad9.net 149.112.112.112#dns.quad9.net 2620:fe::fe#dns.quad9.net 2620:fe::9#dns.quad9.net
DNS=192.168.0.88
FallbackDNS=192.168.0.1
Domains=guaranteedstruggle.host,just-for-me.internal
#DNSSEC=no
#DNSOverTLS=no
#MulticastDNS=yes
#LLMNR=yes
#Cache=yes
#CacheFromLocalhost=no
DNSStubListener=yes
#DNSStubListenerExtra=
#ReadEtcHosts=yes
#ResolveUnicastSingleLabel=no
- name: Stop dhclient from rewriting resolv.conf
copy:
dest: "/etc/dhcp/dhclient-enter-hooks.d/nodnsupdate"
content: |
#!/bin/sh
make_resolv_conf(){
:
}
mode: +x
- name: restart service
service:
name: systemd-resolved
state: restarted
when: systemd_resolved_conf.changed
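
One thing the play leaves implicit is /etc/resolv.conf itself: the DNS= and Domains= settings above only reach ordinary glibc lookups once resolv.conf points at the systemd-resolved stub. A hedged extra task, not in the commit, that would enforce the symlink:

    - name: Point /etc/resolv.conf at the systemd-resolved stub
      ansible.builtin.file:
        src: /run/systemd/resolve/stub-resolv.conf
        dest: /etc/resolv.conf
        state: link
        force: yes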

View File

@ -0,0 +1,133 @@
---
- name: prom
hosts:
- prometheus.guaranteedstruggle.host
vars:
prom_version: '2.55.1'
gather_facts: yes
become: yes
tasks:
- name: Ensure group "prometheus" exists
ansible.builtin.group:
name: prometheus
state: present
- name: Add user "prometheus"
ansible.builtin.user:
name: prometheus
groups: prometheus
shell: /sbin/nologin
create_home: no
append: yes
comment: "prometheus nologin User"
state: present
- name: Creates directory
ansible.builtin.file:
path: /etc/prometheus
state: directory
group: prometheus
owner: prometheus
- name: Creates directory
ansible.builtin.file:
path: /usr/share/prometheus
state: directory
group: prometheus
owner: prometheus
- name: Creates directory
ansible.builtin.file:
path: /prometheus-data
state: directory
group: prometheus
owner: prometheus
- name: Download and unpack the prometheus release tarball
ansible.builtin.unarchive:
src: https://github.com/prometheus/prometheus/releases/download/v{{prom_version}}/prometheus-{{prom_version}}.linux-amd64.tar.gz
dest: /usr/share/prometheus
creates: /usr/share/prometheus/prometheus-{{prom_version}}.linux-amd64
remote_src: yes
- name: Create a symbolic link
ansible.builtin.file:
src: /usr/share/prometheus/prometheus-{{prom_version}}.linux-amd64/prometheus
dest: /usr/sbin/prometheus
owner: prometheus
group: prometheus
state: link
- name: Create a symbolic link
ansible.builtin.file:
src: /usr/share/prometheus/prometheus-{{prom_version}}.linux-amd64/promtool
dest: /usr/sbin/promtool
owner: prometheus
group: prometheus
state: link
- name: Copy prometheus.yaml
register: prometheus_config_file
copy:
src: ../../files/prometheus/prometheus.yaml
dest: /etc/prometheus/prometheus.yaml
notify:
- reload prometheus
- name: Copy web-config
register: web_config_file
copy:
src: ../../files/prometheus/web-config.yaml
dest: /etc/prometheus/web-config.yaml
notify:
- reload prometheus
- name: Copy rules.yaml
register: rules_file
copy:
src: ../../files/prometheus/rules.yaml
dest: /etc/prometheus/rules.yaml
notify:
- reload prometheus
- name: Copy alerts.yaml
register: alerts_file
copy:
src: ../../files/prometheus/alerts.yaml
dest: /etc/prometheus/alerts.yaml
notify:
- reload prometheus
- name: Copy prometheus.service
register: prometheus_service_file
copy:
src: ../../files/prometheus/prometheus.service
dest: /etc/systemd/system/prometheus.service
- name: ensure service
ansible.builtin.systemd_service:
name: prometheus
state: started
enabled: true
# - name: reload service
# ansible.builtin.systemd_service:
# name: prometheus
# state: reloaded
# when:
# - rules_file.changed
# - alerts_file.changed
# - prometheus_service_file.changed
# - web_config_file.changed
- name: Just force systemd to reread configs
ansible.builtin.systemd_service:
daemon_reload: true
when: prometheus_service_file.changed
handlers:
- name: reload prometheus
ansible.builtin.systemd_service:
name: prometheus
state: reloaded
#### TODO how to roll back when the promtool check fails?
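
On that promtool TODO: the copy module's validate parameter can refuse to install a file that fails the check, which sidesteps the rollback question for the rules/alerts files. A hedged sketch (promtool check rules works on a standalone rules file; prometheus.yaml itself would need promtool check config instead):

    - name: Copy alerts.yaml
      register: alerts_file
      copy:
        src: ../../files/prometheus/alerts.yaml
        dest: /etc/prometheus/alerts.yaml
        validate: /usr/sbin/promtool check rules %s
      notify:
        - reload prometheus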

View File

@ -6,4 +6,4 @@ collections:
version: 4.1.0
- name: prometheus.prometheus
version: 0.18.0
version: 0.22.0