resolv.conf solved!
This commit is contained in:
parent
fe33aee2fe
commit
4ba0cadf07
@ -18,7 +18,8 @@ bin_ansible_callbacks = True
|
|||||||
host_key_checking = false
|
host_key_checking = false
|
||||||
|
|
||||||
#vault_password_file = /etc/ansible/.vaulto
|
#vault_password_file = /etc/ansible/.vaulto
|
||||||
vault_password_file = /tmp/.vaulto
|
#vault_password_file = /tmp/.vaulto
|
||||||
|
vault_password_file = /usr/share/.vaulto
|
||||||
|
|
||||||
# callback_plugins = /etc/ansible/plugins/callback
|
# callback_plugins = /etc/ansible/plugins/callback
|
||||||
# callback_whitelist = telegram
|
# callback_whitelist = telegram
|
||||||
|
5
environments/just-created/group_vars/lxc/ssh-creds.yml
Normal file
5
environments/just-created/group_vars/lxc/ssh-creds.yml
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
---
|
||||||
|
ansible_ssh_user: root
|
||||||
|
ansible_ssh_pass: admin  # NOTE(review): plaintext credential in group_vars; consider encrypting this file with Ansible Vault
|
||||||
|
ansible_sudo_pass: admin
|
||||||
|
ansible_ssh_private_key_file: '/home/hogweed1/id25519.key'
|
@ -2,4 +2,4 @@
|
|||||||
ansible_ssh_user: hogweed1
|
ansible_ssh_user: hogweed1
|
||||||
ansible_ssh_pass: coloredhorses
|
ansible_ssh_pass: coloredhorses
|
||||||
ansible_sudo_pass: coloredhorses
|
ansible_sudo_pass: coloredhorses
|
||||||
ansible_ssh_private_key_file: '/home/hogweed1/id25519.key'
|
#ansible_ssh_private_key_file: '/home/hogweed1/id25519.key'
|
@ -0,0 +1,5 @@
|
|||||||
|
---
|
||||||
|
ansible_ssh_user: hogweed1
|
||||||
|
ansible_ssh_pass: coloredhorses
|
||||||
|
ansible_sudo_pass: coloredhorses
|
||||||
|
#ansible_ssh_private_key_file: '/home/hogweed1/id25519.key'
|
@ -1,11 +1,24 @@
|
|||||||
---
|
---
|
||||||
# all: # keys must be unique, i.e. only one 'hosts' per group
|
all: # keys must be unique, i.e. only one 'hosts' per group
|
||||||
# hosts:
|
hosts:
|
||||||
|
#k3s-rancher.guaranteedstruggle.host:
|
||||||
# #nexus.guaranteedstruggle.host:
|
# #nexus.guaranteedstruggle.host:
|
||||||
# #printing-slut.guaranteedstruggle.host:
|
# #printing-slut.guaranteedstruggle.host:
|
||||||
# harbor.guaranteedstruggle.host:
|
# harbor.guaranteedstruggle.host:
|
||||||
|
|
||||||
|
#192.168.0.26
|
||||||
|
#192.168.0.32:
|
||||||
lxc: # keys must be unique, i.e. only one 'hosts' per group
|
lxc: # keys must be unique, i.e. only one 'hosts' per group
|
||||||
hosts:
|
hosts:
|
||||||
### but its a vm wtf
|
### but its a vm wtf
|
||||||
harbor.guaranteedstruggle.host:
|
#harbor.guaranteedstruggle.host:
|
||||||
|
|
||||||
|
#etcd.guaranteedstruggle.host:
|
||||||
|
#prometheus.guaranteedstruggle.host:
|
||||||
|
# 192.168.0.240
|
||||||
|
#192.168.0.251
|
||||||
|
#192.168.0.40
|
||||||
|
#192.168.0.88
|
||||||
|
#192.168.0.52
|
||||||
|
#192.168.0.113
|
||||||
|
#recording-slut.guaranteedstruggle.host:
|
@ -2,6 +2,7 @@
|
|||||||
physical_machines:
|
physical_machines:
|
||||||
hosts:
|
hosts:
|
||||||
cyberbully.guaranteedstruggle.host:
|
cyberbully.guaranteedstruggle.host:
|
||||||
|
#
|
||||||
gpu-slut.guaranteedstruggle.host:
|
gpu-slut.guaranteedstruggle.host:
|
||||||
children:
|
children:
|
||||||
proxmoxes:
|
proxmoxes:
|
||||||
@ -20,11 +21,15 @@ semyons: # keys must be unique, i.e. only one 'hosts' per group
|
|||||||
semyon-0x04.guaranteedstruggle.host:
|
semyon-0x04.guaranteedstruggle.host:
|
||||||
semyon-0x05.guaranteedstruggle.host:
|
semyon-0x05.guaranteedstruggle.host:
|
||||||
vms:
|
vms:
|
||||||
|
hosts:
|
||||||
|
#recording-slut.guaranteedstruggle.host:
|
||||||
|
#192.168.0.26
|
||||||
children:
|
children:
|
||||||
printer:
|
printer:
|
||||||
kubernetes:
|
kubernetes:
|
||||||
docker:
|
docker:
|
||||||
|
|
||||||
|
|
||||||
docker:
|
docker:
|
||||||
hosts:
|
hosts:
|
||||||
swarm-node1.guaranteedstruggle.host:
|
swarm-node1.guaranteedstruggle.host:
|
||||||
@ -52,4 +57,15 @@ printer:
|
|||||||
printing-slut.guaranteedstruggle.host:
|
printing-slut.guaranteedstruggle.host:
|
||||||
|
|
||||||
#### TODO
|
#### TODO
|
||||||
# lxc:
|
lxc:
|
||||||
|
hosts:
|
||||||
|
### but its a vm wtf
|
||||||
|
#harbor.guaranteedstruggle.host:
|
||||||
|
#etcd.guaranteedstruggle.host:
|
||||||
|
prometheus.guaranteedstruggle.host:
|
||||||
|
recording-slut.guaranteedstruggle.host:
|
||||||
|
|
||||||
|
pg.just-for-me.internal:
|
||||||
|
grafana.just-for-me.internal:
|
||||||
|
price-loader.just-for-me.internal:
|
||||||
|
|
||||||
|
21
files/alertmanager/alertmanager.service
Normal file
21
files/alertmanager/alertmanager.service
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=Prometheus alertmanager
|
||||||
|
Wants=network-online.target
|
||||||
|
After=network-online.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
User=prometheus
|
||||||
|
Group=prometheus
|
||||||
|
EnvironmentFile=-/etc/sysconfig/alertmanager
|
||||||
|
ExecStart=/usr/sbin/alertmanager \
|
||||||
|
--config.file=/etc/alertmanager/alertmanager.yaml \
|
||||||
|
--storage.path=/base/alertmanager \
|
||||||
|
--web.config.file=/etc/prometheus/web-config.yaml
|
||||||
|
|
||||||
|
ExecReload=/bin/kill -HUP $MAINPID
|
||||||
|
KillMode=process
|
||||||
|
Restart=always
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
|
50
files/alertmanager/alertmanager.yaml
Normal file
50
files/alertmanager/alertmanager.yaml
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
global:
|
||||||
|
resolve_timeout: 5m
|
||||||
|
|
||||||
|
route:
|
||||||
|
group_by: [ 'alertname', 'job' ]
|
||||||
|
group_wait: 30s
|
||||||
|
group_interval: 5m
|
||||||
|
repeat_interval: 1h
|
||||||
|
receiver: what-went-wrong
|
||||||
|
|
||||||
|
# routes:
|
||||||
|
|
||||||
|
receivers:
|
||||||
|
# /dev/null receiver
|
||||||
|
- name: 'blackhole'
|
||||||
|
|
||||||
|
# конфа
|
||||||
|
- name: 'what-went-wrong'
|
||||||
|
telegram_configs:  # NOTE(review): the bot_token below is committed in plaintext; consider rotating it and supplying it from a protected file
|
||||||
|
- send_resolved: true
|
||||||
|
bot_token: '6472915685:AAHPvgrQoqG7DxtfbnHWPe3Lfild-CGJ1j8'
|
||||||
|
chat_id: -4023350326
|
||||||
|
message: '{{ template "teletempl" . }}'
|
||||||
|
api_url: https://api.telegram.org
|
||||||
|
parse_mode: HTML
|
||||||
|
# - name: 'vdk2ch'
|
||||||
|
# telegram_configs:
|
||||||
|
# - send_resolved: true
|
||||||
|
# bot_token: '5724991559:AAEuLvpLsgP6LHRGMSyFtQLlR5qPQUO4b_w'
|
||||||
|
# chat_id: -1001355646177
|
||||||
|
# message: '{{ template "teletempl" . }}'
|
||||||
|
# api_url: https://api.telegram.org
|
||||||
|
# parse_mode: HTML
|
||||||
|
|
||||||
|
# A list of inhibition rules.
|
||||||
|
#inhibit_rules:
|
||||||
|
|
||||||
|
templates:
|
||||||
|
- '/etc/alertmanager/templates/my.tmpl'
|
||||||
|
|
||||||
|
# A list of time intervals for muting/activating routes.
|
||||||
|
# time_intervals:
|
||||||
|
# - name: business_hours
|
||||||
|
# time_intervals:
|
||||||
|
# - weekdays: ['monday:friday']
|
||||||
|
# times:
|
||||||
|
# # Начало в 10:00 Asia/Vladivostok
|
||||||
|
# - start_time: '00:00'
|
||||||
|
# # Заканчивается в 19:00 Asia/Vladivostok
|
||||||
|
# end_time: '09:00'
|
33
files/alertmanager/simple_telegram.tmpl
Normal file
33
files/alertmanager/simple_telegram.tmpl
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
{{ define "teletempl" }}
|
||||||
|
<b>{{ .CommonLabels.alertname }} : </b>
|
||||||
|
{{- if eq .Status "firing" -}}
|
||||||
|
<b>{{ .Status | toUpper}} 🔥</b>
|
||||||
|
{{- end -}}
|
||||||
|
{{- if eq .Status "resolved" -}}
|
||||||
|
<b>{{ .Status | toUpper}} ✅</b>
|
||||||
|
{{- end -}}
|
||||||
|
{{ $alerts_count := len .Alerts }}
|
||||||
|
{{ if eq $alerts_count 1 -}} {{/* Single alert block */}}
|
||||||
|
{{ .CommonAnnotations.summary }}
|
||||||
|
|
||||||
|
Host: {{ .CommonLabels.host }}
|
||||||
|
Instance: {{ .CommonLabels.instance }}
|
||||||
|
Job: <b>{{ .CommonLabels.job }}</b>
|
||||||
|
|
||||||
|
Details:
|
||||||
|
{{ .CommonAnnotations.description }}
|
||||||
|
|
||||||
|
Alert started: [ {{ .CommonAnnotations.alert_started_vl_time }} ]
|
||||||
|
|
||||||
|
{{- else -}} {{/* Grouped alert block */}}
|
||||||
|
{{ .CommonAnnotations.summary }}
|
||||||
|
|
||||||
|
Job: <b>{{ .CommonLabels.job }}</b>
|
||||||
|
|
||||||
|
|
||||||
|
Instances:
|
||||||
|
{{- range .Alerts }}
|
||||||
|
{{ .Labels.instance }} [ {{ .Annotations.alert_started_vl_time }} ]
|
||||||
|
{{- end }}
|
||||||
|
{{ end }}
|
||||||
|
{{ end }}
|
188
files/prometheus/alerts.yaml
Normal file
188
files/prometheus/alerts.yaml
Normal file
@ -0,0 +1,188 @@
|
|||||||
|
---
|
||||||
|
# можно набирать примеров отсюда
|
||||||
|
# https://awesome-prometheus-alerts.grep.to/rules.html
|
||||||
|
|
||||||
|
|
||||||
|
groups:
|
||||||
|
|
||||||
|
|
||||||
|
- name: standard
|
||||||
|
|
||||||
|
rules:
|
||||||
|
- alert: _plchldr
|
||||||
|
expr: up == -999
|
||||||
|
for: 999m
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
annotations: &anno
|
||||||
|
alert_started_vl_time: "{{ with $b := printf `ALERTS_FOR_STATE{job=\"%s\",instance=\"%s\"} + 36000` $labels.job $labels.instance | query }}{{if $b}}{{ with $a := $b | first | value | humanizeTimestamp }}{{- slice $a 0 19 -}}{{end}}{{end}}{{end}}"
|
||||||
|
|
||||||
|
|
||||||
|
- alert: jackbot failed
|
||||||
|
expr: node_systemd_unit_state{ name="jack_bot.service", state="active" } != 1
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
<<: *anno
|
||||||
|
summary: "PIPISA IS DOWN!"
|
||||||
|
description: "Pipisa on {{ $labels.instance }} does not working!"
|
||||||
|
|
||||||
|
- alert: jackbot failed
|
||||||
|
expr: node_systemd_unit_state{ name="jack_bot.service", state="active" } != 1
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical  # spelled like the other rules so severity-based matchers catch this alert
|
||||||
|
annotations:
|
||||||
|
<<: *anno
|
||||||
|
summary: "PIPISA IS DOWN!"
|
||||||
|
description: "Pipisa on {{ $labels.instance }} does not working!"
|
||||||
|
|
||||||
|
|
||||||
|
### отслужило своё, майнеров больше нет.
|
||||||
|
# - alert: MAINER JACK KURWA!!
|
||||||
|
# expr: node_load15 > 2
|
||||||
|
# for: 20m
|
||||||
|
# labels:
|
||||||
|
#     severity: critical
|
||||||
|
# annotations:
|
||||||
|
# <<: *anno
|
||||||
|
# summary: "It THAT shit again!"
|
||||||
|
# description: "Kill fucking mainer processes!"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
- alert: Uptime
|
||||||
|
expr: floor((time() - node_boot_time_seconds)) < 3600
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
<<: *anno
|
||||||
|
summary: "Uptime less than 1 hour"
|
||||||
|
description: "Uptime on {{ $labels.instance }} is less than 1 hour"
|
||||||
|
|
||||||
|
- alert: LoadAverage
|
||||||
|
expr: (node_load5{}) > ( instance:node_cpus:count{} )
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
<<: *anno
|
||||||
|
summary: "High LoadAverage5"
|
||||||
|
description: |
|
||||||
|
{{ $labels.host }} [{{ printf `instance:node_cpus:count{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} CPU] LA: {{ printf `node_load1{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load5{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load15{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }}
|
||||||
|
|
||||||
|
- alert: LoadAverage
|
||||||
|
expr: (node_load15{}) > ( instance:node_cpus:count{} )
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
<<: *anno
|
||||||
|
summary: "High LoadAverage15"
|
||||||
|
description: |
|
||||||
|
{{ $labels.host }} [{{ printf `instance:node_cpus:count{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} CPU] LA: {{ printf `node_load1{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load5{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }} {{ printf `node_load15{host='%s', instance='%s'}` .Labels.host .Labels.instance | query | first | value }}
|
||||||
|
|
||||||
|
- alert: RAM
|
||||||
|
expr: node_memory_MemAvailable_bytes{ } / node_memory_MemTotal_bytes * 100 < 10
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
<<: *anno
|
||||||
|
summary: "Low available memory"
|
||||||
|
description: "Free RAM: {{ printf `%.2f` $value }}% Свободно {{ printf `node_memory_MemAvailable_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }} из {{ printf `node_memory_MemTotal_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }}"
|
||||||
|
|
||||||
|
- alert: RAM
|
||||||
|
expr: node_memory_MemAvailable_bytes{ } / node_memory_MemTotal_bytes * 100 < 5
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
<<: *anno
|
||||||
|
summary: "Low available memory"
|
||||||
|
description: "Free RAM: {{ printf `%.2f` $value }}% Свободно {{ printf `node_memory_MemAvailable_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }} из {{ printf `node_memory_MemTotal_bytes{instance='%s'}` .Labels.instance | query | first | value | humanize1024 }}"
|
||||||
|
|
||||||
|
- alert: iNodes
|
||||||
|
# PromQL regex matchers are fully anchored, so /boot/efi needs its leading slash to be excluded.
expr: (node_filesystem_files_free{fstype!~"rootfs|fuse.lxcfs|squashfs",mountpoint!~"/boot|/boot/efi|/backup|/swap"} / node_filesystem_files) * 100 < 10
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
<<: *anno
|
||||||
|
summary: "[WARN] Low available inodes"
|
||||||
|
description: "Available i-nodes: {{ printf `%.2f` $value }}%\n"
|
||||||
|
|
||||||
|
- alert: iNodes
|
||||||
|
# PromQL regex matchers are fully anchored, so /boot/efi needs its leading slash to be excluded.
expr: (node_filesystem_files_free{fstype!~"rootfs|fuse.lxcfs|squashfs",mountpoint!~"/boot|/boot/efi|/backup|/swap"} / node_filesystem_files) * 100 < 5
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
<<: *anno
|
||||||
|
summary: "[CRIT] Host out of inodes"
|
||||||
|
description: "Available i-nodes: {{ printf `%.2f` $value }}%\n"
|
||||||
|
|
||||||
|
|
||||||
|
- alert: DiskUsage
|
||||||
|
# PromQL regex matchers are fully anchored, so /boot/efi needs its leading slash to be excluded.
expr: ( node_filesystem_avail_bytes{mountpoint!~"/boot|/boot/efi|/backup|/swap", fstype!~"rootfs|fuse.lxcfs|squashfs"}/ node_filesystem_size_bytes ) * 100 < 10
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
annotations:
|
||||||
|
<<: *anno
|
||||||
|
summary: "Disk usage is more than 90%"
|
||||||
|
description: |
|
||||||
|
{{ $labels.device }} ({{ $labels.mountpoint }}): {{ printf `node_filesystem_avail_bytes{mountpoint='%s', device='%s', instance='%s'}` .Labels.mountpoint .Labels.device .Labels.instance | query | first | value | humanize1024 }} / {{ printf `node_filesystem_size_bytes{mountpoint='%s', device='%s', instance='%s'}` .Labels.mountpoint .Labels.device .Labels.instance | query | first | value | humanize1024 }}
|
||||||
|
Свободного места: {{ printf `%.2f` $value }}%
|
||||||
|
|
||||||
|
- alert: DiskUsagePredict
|
||||||
|
expr: |
|
||||||
|
(node_filesystem_avail_bytes{mountpoint!~"/boot|/boot/efi|/backup", fstype!~"rootfs|fuse.lxcfs|squashfs"}/ node_filesystem_size_bytes) * 100 < 10
|
||||||
|
and
|
||||||
|
predict_linear(node_filesystem_avail_bytes{fstype!~"rootfs|fuse.lxcfs|squashfs"}[1h], 4 * 3600) < 0
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
<<: *anno
|
||||||
|
summary: "Disk usage is more than 90% and will fill soon"
|
||||||
|
description: "{{ $labels.mountpoint }} usage is more than 90% and will fill soon on {{ $labels.instance }}"
|
||||||
|
|
||||||
|
- name: Prometheus
|
||||||
|
rules:
|
||||||
|
- alert: PrometheusAlertmanagerNotificationFailing
|
||||||
|
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical  # spelled like the other rules so severity-based matchers catch this alert
|
||||||
|
annotations:
|
||||||
|
<<: *anno
|
||||||
|
summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
|
||||||
|
description: "Alertmanager is failing sending notifications on {{ $labels.host }}"
|
||||||
|
|
||||||
|
- alert: PrometheusConfigurationReloadFailure
|
||||||
|
expr: prometheus_config_last_reload_successful != 1
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
<<: *anno
|
||||||
|
summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
|
||||||
|
description: "Prometheus configuration reload error on {{ $labels.host }}"
|
||||||
|
|
||||||
|
- alert: PrometheusConsulServiceDiscoveryError
|
||||||
|
expr: increase(prometheus_sd_consul_rpc_failures_total[15m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
<<: *anno
|
||||||
|
summary: Prometheus consul_sd many failures (instance {{ $labels.instance }})
|
||||||
|
description: "Prometheus consul_sd many failures on {{ $labels.host }}"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
30
files/prometheus/prometheus.service
Normal file
30
files/prometheus/prometheus.service
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=Prometheus
|
||||||
|
Wants=network-online.target
|
||||||
|
After=network-online.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
User=prometheus
|
||||||
|
Group=prometheus
|
||||||
|
Type=simple
|
||||||
|
Restart=always
|
||||||
|
OOMScoreAdjust=-1000
|
||||||
|
LimitNOFILE=16384
|
||||||
|
ExecStart=/usr/sbin/prometheus \
|
||||||
|
--config.file /etc/prometheus/prometheus.yaml \
|
||||||
|
--web.config.file=/etc/prometheus/web-config.yaml \
|
||||||
|
--storage.tsdb.path /prometheus-data/ \
|
||||||
|
--storage.tsdb.retention.time 180d \
|
||||||
|
--storage.tsdb.max-block-duration=2h \
|
||||||
|
--storage.tsdb.min-block-duration=2h \
|
||||||
|
--web.enable-remote-write-receiver \
|
||||||
|
--web.console.templates=/etc/prometheus/consoles \
|
||||||
|
--web.console.libraries=/etc/prometheus/console_libraries \
|
||||||
|
--web.enable-admin-api \
|
||||||
|
--query.max-samples=50000000
|
||||||
|
|
||||||
|
ExecReload=/usr/bin/kill -s HUP $MAINPID
|
||||||
|
ExecStop=/usr/bin/kill -s QUIT $MAINPID
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
179
files/prometheus/prometheus.yaml
Normal file
179
files/prometheus/prometheus.yaml
Normal file
@ -0,0 +1,179 @@
|
|||||||
|
# my global config
|
||||||
|
global:
|
||||||
|
scrape_interval: 10s # Scrape targets every 10 seconds. Default is every 1 minute.
|
||||||
|
evaluation_interval: 60s # Evaluate rules every 60 seconds. The default is every 1 minute.
|
||||||
|
#external_labels:
|
||||||
|
|
||||||
|
# scrape_timeout is set to the global default (10s).
|
||||||
|
|
||||||
|
# Alertmanager configuration
|
||||||
|
alerting:
|
||||||
|
alertmanagers:
|
||||||
|
- scheme: https
|
||||||
|
static_configs:
|
||||||
|
- targets: ['alertmanager.guaranteedstruggle.host']
|
||||||
|
|
||||||
|
# Writing data to remote long-term storage (VictoriaMetrics)
|
||||||
|
# remote_write:
|
||||||
|
# - url:
|
||||||
|
|
||||||
|
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
|
||||||
|
rule_files:
|
||||||
|
# Recording rules (e.g. instance:node_cpus:count) that the LoadAverage alerts depend on.
- '/etc/prometheus/rules.yaml'
- '/etc/prometheus/alerts.yaml'
|
||||||
|
- '/etc/prometheus/service_alerts/*.yaml'
|
||||||
|
|
||||||
|
# A scrape configuration containing exactly one endpoint to scrape:
|
||||||
|
# Here it's Prometheus itself.
|
||||||
|
scrape_configs:
|
||||||
|
|
||||||
|
|
||||||
|
#### TODO вынести в шаблоны сбор экспортеров на основе ролей машин
|
||||||
|
|
||||||
|
- job_name: 'node-exporters'
|
||||||
|
scheme: http
|
||||||
|
static_configs:
|
||||||
|
- targets:
|
||||||
|
- 'semyon-0x01:9100'
|
||||||
|
- 'semyon-0x02:9100'
|
||||||
|
- 'semyon-0x03:9100'
|
||||||
|
- 'semyon-0x04:9100'
|
||||||
|
- 'semyon-0x05:9100'
|
||||||
|
|
||||||
|
- 'king-albert:9100'
|
||||||
|
- 'gpu-slut:9100'
|
||||||
|
# relabel_configs:
|
||||||
|
# - target_label: instance
|
||||||
|
# replacement: 'cyberbully:9100'
|
||||||
|
# - target_label: host
|
||||||
|
# replacement: cyberbully
|
||||||
|
|
||||||
|
- job_name: 'node-exporters-vms'
|
||||||
|
scheme: http
|
||||||
|
static_configs:
|
||||||
|
- targets:
|
||||||
|
- 'printing-slut:9100'
|
||||||
|
|
||||||
|
- 'swarm-node1:9100'
|
||||||
|
- 'swarm-node2:9100'
|
||||||
|
- 'swarm-node3:9100'
|
||||||
|
|
||||||
|
- 'harbor:9100'
|
||||||
|
|
||||||
|
- 'rke2-master1:9100'
|
||||||
|
- 'rke2-master2:9100'
|
||||||
|
- 'rke2-master3:9100'
|
||||||
|
- 'rke2-worker1:9100'
|
||||||
|
- 'rke2-worker2:9100'
|
||||||
|
- 'rke2-worker3:9100'
|
||||||
|
- 'rke2-worker4:9100'
|
||||||
|
- 'rke2-worker5:9100'
|
||||||
|
|
||||||
|
- 'k3s-rancher:9100'
|
||||||
|
- 'k3s-awx:9100'
|
||||||
|
|
||||||
|
# # - job_name: 'node-exporters-lxc'
|
||||||
|
# # scheme: http
|
||||||
|
# # static_configs:
|
||||||
|
# # - targets:
|
||||||
|
|
||||||
|
|
||||||
|
- job_name: 'impi-exporters'
|
||||||
|
scheme: http
|
||||||
|
static_configs:
|
||||||
|
- targets:
|
||||||
|
#- 'cyberbully:9290'
|
||||||
|
- 'king-albert:9290'
|
||||||
|
# - 'semyon-0x01:9290'
|
||||||
|
# - 'semyon-0x02:9290'
|
||||||
|
# - 'semyon-0x03:9290'
|
||||||
|
# - 'semyon-0x04:9290'
|
||||||
|
# - 'semyon-0x05:9290'
|
||||||
|
# - 'gpu-slut:9290'
|
||||||
|
|
||||||
|
# пиписа-экспортер
|
||||||
|
# # - job_name: 'vdk2ch-pipisa-exporter'
|
||||||
|
# # scheme: http
|
||||||
|
# # static_configs:
|
||||||
|
# # - targets:
|
||||||
|
# # - '192.168.0.55:9992'
|
||||||
|
# # relabel_configs:
|
||||||
|
# # - target_label: instance
|
||||||
|
# # replacement: 'cyberbully:9992'
|
||||||
|
# # - target_label: host
|
||||||
|
# # replacement: cyberbully
|
||||||
|
|
||||||
|
# vLLM exporter
|
||||||
|
# - job_name: 'vllm-exporter'
|
||||||
|
# scheme: http
|
||||||
|
# static_configs:
|
||||||
|
# - targets:
|
||||||
|
# - '192.168.0.4:8000'
|
||||||
|
# relabel_configs:
|
||||||
|
# - target_label: instance
|
||||||
|
# replacement: 'new-computer-home:8000'
|
||||||
|
# - target_label: host
|
||||||
|
# replacement: new-computer-home
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
# # - job_name: 'nginx-vts-metrics'
|
||||||
|
# # scheme: http
|
||||||
|
# # metrics_path: /status/format/prometheus
|
||||||
|
# # static_configs:
|
||||||
|
# # - targets:
|
||||||
|
# # - '192.168.0.55:9042'
|
||||||
|
# # relabel_configs:
|
||||||
|
# # - target_label: instance
|
||||||
|
# # replacement: 'cyberbully:9042'
|
||||||
|
# - target_label: host
|
||||||
|
# replacement: cyberbully
|
||||||
|
|
||||||
|
# шиндоус-экспортер поверх ноута через домашний вайфай
|
||||||
|
# # - job_name: 'i-programmed-my-home-computer'
|
||||||
|
# # scheme: http
|
||||||
|
# # static_configs:
|
||||||
|
# # - targets:
|
||||||
|
# # - '192.168.0.2:9182'
|
||||||
|
# # - '192.168.0.3:9182'
|
||||||
|
# # relabel_configs:
|
||||||
|
# # - source_labels: [__address__]
|
||||||
|
# # regex: "(192.168.0.2.+)"
|
||||||
|
# # target_label: instance
|
||||||
|
# # replacement: 'Desktop-O50pt4s:9182'
|
||||||
|
# # - source_labels: [__address__]
|
||||||
|
# # regex: "(192.168.0.2.+)"
|
||||||
|
# # target_label: host
|
||||||
|
# # replacement: Desktop-O50pt4s
|
||||||
|
# # - source_labels: [__address__]
|
||||||
|
# # regex: "(192.168.0.3.+)"
|
||||||
|
# # target_label: instance
|
||||||
|
# # replacement: 'Desktop-edov3u5:9182'
|
||||||
|
# # - source_labels: [__address__]
|
||||||
|
# # regex: "(192.168.0.3.+)"
|
||||||
|
# # target_label: host
|
||||||
|
# # replacement: Desktop-edov3u5
|
||||||
|
|
||||||
|
#
|
||||||
|
# # - job_name: 'nvidia-gpu-metrics'
|
||||||
|
# # scheme: http
|
||||||
|
# # static_configs:
|
||||||
|
# # - targets:
|
||||||
|
# # - '192.168.0.2:9835'
|
||||||
|
# # relabel_configs:
|
||||||
|
# # - target_label: instance
|
||||||
|
# # replacement: 'Desktop-O50pt4s:9835'
|
||||||
|
|
||||||
|
|
||||||
|
# # # личный твиттус
|
||||||
|
# # - job_name: 'pleroma'
|
||||||
|
# # metrics_path: /api/pleroma/app_metrics
|
||||||
|
# # scheme: https
|
||||||
|
# # static_configs:
|
||||||
|
# # - targets: ['social.vdk2ch.ru']
|
||||||
|
|
||||||
|
# хайпервиза
|
||||||
|
- job_name: 'proxmox'
|
||||||
|
metrics_path: /pve
|
||||||
|
static_configs:
|
||||||
|
- targets:
|
||||||
|
- 'king-albert.guaranteedstruggle.host:9221'
|
14
files/prometheus/rules.yaml
Normal file
14
files/prometheus/rules.yaml
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
groups:
|
||||||
|
- name: node-exporter-rules
|
||||||
|
rules:
|
||||||
|
|
||||||
|
# CPU count
|
||||||
|
- record: instance:node_cpus:count
|
||||||
|
expr: count(node_cpu_seconds_total{mode="idle"}) without (cpu,mode)
|
||||||
|
|
||||||
|
#взято отсюда
|
||||||
|
# https://stackoverflow.com/questions/52480567/count-alerts-fired-by-prometheus
|
||||||
|
- name: alerts
|
||||||
|
rules:
|
||||||
|
- record: ALERTS_FOR_STATE:firing
|
||||||
|
expr: ALERTS_FOR_STATE and ignoring(alertstate) ALERTS{alertstate="firing"}
|
3
files/prometheus/web-config.yaml
Normal file
3
files/prometheus/web-config.yaml
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
# tls_server_config:
|
||||||
|
# cert_file: /etc/prometheus/ssl/ .crt
|
||||||
|
# key_file: /etc/prometheus/ssl/ .key
|
6
playbooks/_common-setup.yml
Normal file
6
playbooks/_common-setup.yml
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
#### TODO обе роли - пакаджесы и юзеры
|
||||||
|
---
|
||||||
|
- import_playbook: packages.yml
|
||||||
|
- import_playbook: resolvconf.yml
|
||||||
|
- import_playbook: users.yml
|
||||||
|
- import_playbook: exporters.yml
|
@ -1,6 +1,6 @@
|
|||||||
---
|
---
|
||||||
- name: node exporter!
|
- name: node exporter!
|
||||||
hosts: all
|
hosts: all:!lxc
|
||||||
gather_facts: yes
|
gather_facts: yes
|
||||||
become: yes
|
become: yes
|
||||||
roles:
|
roles:
|
||||||
@ -9,7 +9,7 @@
|
|||||||
- prometheus.prometheus.node_exporter
|
- prometheus.prometheus.node_exporter
|
||||||
#node_exporter_local_cache_path: "/tmp/node_exporter_cache"
|
#node_exporter_local_cache_path: "/tmp/node_exporter_cache"
|
||||||
- name: for hardware monitoring
|
- name: for hardware monitoring
|
||||||
hosts: physical_machines
|
hosts: king-albert.guaranteedstruggle.host
|
||||||
gather_facts: yes
|
gather_facts: yes
|
||||||
become: yes
|
become: yes
|
||||||
roles:
|
roles:
|
||||||
|
@ -14,6 +14,10 @@
|
|||||||
- net-tools
|
- net-tools
|
||||||
- vim
|
- vim
|
||||||
- sudo
|
- sudo
|
||||||
|
- tree
|
||||||
|
- jq
|
||||||
|
- rsync
|
||||||
|
|
||||||
#state: latest
|
#state: latest
|
||||||
state: present
|
state: present
|
||||||
|
|
||||||
|
@ -4,7 +4,7 @@
|
|||||||
# remote_user: root
|
# remote_user: root
|
||||||
|
|
||||||
gather_facts: no
|
gather_facts: no
|
||||||
become: yes
|
become: no # yes
|
||||||
tasks:
|
tasks:
|
||||||
- name: pingu!
|
- name: pingu!
|
||||||
ansible.builtin.ping:
|
ansible.builtin.ping:
|
||||||
|
65
playbooks/resolvconf.yml
Normal file
65
playbooks/resolvconf.yml
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
---
|
||||||
|
- name: make resolv.conf work fine
|
||||||
|
hosts: all
|
||||||
|
become: yes
|
||||||
|
tasks:
|
||||||
|
- name: Install the packages versions
|
||||||
|
ansible.builtin.package:
|
||||||
|
name:
|
||||||
|
- systemd-resolved
|
||||||
|
state: present
|
||||||
|
- name: Make small file
|
||||||
|
register: systemd_resolved_conf
|
||||||
|
copy:
|
||||||
|
dest: "/etc/systemd/resolved.conf"
|
||||||
|
content: |
|
||||||
|
# This file is part of systemd.
|
||||||
|
#
|
||||||
|
# systemd is free software; you can redistribute it and/or modify it under the
|
||||||
|
# terms of the GNU Lesser General Public License as published by the Free
|
||||||
|
# Software Foundation; either version 2.1 of the License, or (at your option)
|
||||||
|
# any later version.
|
||||||
|
#
|
||||||
|
# Entries in this file show the compile time defaults. Local configuration
|
||||||
|
# should be created by either modifying this file, or by creating "drop-ins" in
|
||||||
|
# the resolved.conf.d/ subdirectory. The latter is generally recommended.
|
||||||
|
# Defaults can be restored by simply deleting this file and all drop-ins.
|
||||||
|
#
|
||||||
|
# Use 'systemd-analyze cat-config systemd/resolved.conf' to display the full config.
|
||||||
|
# See resolved.conf(5) for details.
|
||||||
|
|
||||||
|
[Resolve]
|
||||||
|
# Some examples of DNS servers which may be used for DNS= and FallbackDNS=:
|
||||||
|
# Cloudflare: 1.1.1.1#cloudflare-dns.com 1.0.0.1#cloudflare-dns.com 2606:4700:4700::1111#cloudflare-dns.com 2606:4700:4700::1001#cloudflare-dns.com
|
||||||
|
# Google: 8.8.8.8#dns.google 8.8.4.4#dns.google 2001:4860:4860::8888#dns.google 2001:4860:4860::8844#dns.google
|
||||||
|
# Quad9: 9.9.9.9#dns.quad9.net 149.112.112.112#dns.quad9.net 2620:fe::fe#dns.quad9.net 2620:fe::9#dns.quad9.net
|
||||||
|
DNS=192.168.0.88
|
||||||
|
FallbackDNS=192.168.0.1
|
||||||
|
Domains=guaranteedstruggle.host,just-for-me.internal
|
||||||
|
#DNSSEC=no
|
||||||
|
#DNSOverTLS=no
|
||||||
|
#MulticastDNS=yes
|
||||||
|
#LLMNR=yes
|
||||||
|
#Cache=yes
|
||||||
|
#CacheFromLocalhost=no
|
||||||
|
DNSStubListener=yes
|
||||||
|
#DNSStubListenerExtra=
|
||||||
|
#ReadEtcHosts=yes
|
||||||
|
#ResolveUnicastSingleLabel=no
|
||||||
|
|
||||||
|
|
||||||
|
- name: Make fix for resolv-conf rewriting
|
||||||
|
copy:
|
||||||
|
dest: "/etc/dhcp/dhclient-enter-hooks.d/nodnsupdate"
|
||||||
|
content: |
|
||||||
|
#!/bin/sh
|
||||||
|
make_resolv_conf(){
|
||||||
|
:
|
||||||
|
}
|
||||||
|
mode: +x
|
||||||
|
|
||||||
|
- name: restart service
|
||||||
|
service:
|
||||||
|
name: systemd-resolved
|
||||||
|
state: restarted
|
||||||
|
when: systemd_resolved_conf.changed
|
133
playbooks/software/prometheus.yml
Normal file
133
playbooks/software/prometheus.yml
Normal file
@ -0,0 +1,133 @@
|
|||||||
|
---
|
||||||
|
- name: prom
|
||||||
|
hosts:
|
||||||
|
- prometheus.guaranteedstruggle.host
|
||||||
|
vars:
|
||||||
|
prom_version: '2.55.1'
|
||||||
|
gather_facts: yes
|
||||||
|
become: yes
|
||||||
|
tasks:
|
||||||
|
|
||||||
|
- name: Ensure group "prometheus" exists
|
||||||
|
ansible.builtin.group:
|
||||||
|
name: prometheus
|
||||||
|
state: present
|
||||||
|
- name: Add user "prometheus"
|
||||||
|
ansible.builtin.user:
|
||||||
|
name: prometheus
|
||||||
|
groups: prometheus
|
||||||
|
shell: /sbin/nologin
|
||||||
|
create_home: no
|
||||||
|
append: yes
|
||||||
|
comment: "prometheus nologin User"
|
||||||
|
state: present
|
||||||
|
|
||||||
|
- name: Creates directory
|
||||||
|
ansible.builtin.file:
|
||||||
|
path: /etc/prometheus
|
||||||
|
state: directory
|
||||||
|
group: prometheus
|
||||||
|
owner: prometheus
|
||||||
|
- name: Creates directory
|
||||||
|
ansible.builtin.file:
|
||||||
|
path: /usr/share/prometheus
|
||||||
|
state: directory
|
||||||
|
group: prometheus
|
||||||
|
owner: prometheus
|
||||||
|
- name: Creates directory
|
||||||
|
ansible.builtin.file:
|
||||||
|
path: /prometheus-data
|
||||||
|
state: directory
|
||||||
|
group: prometheus
|
||||||
|
owner: prometheus
|
||||||
|
|
||||||
|
- name: Unarchive a file that needs to be downloaded (added in 2.0)
|
||||||
|
ansible.builtin.unarchive:
|
||||||
|
src: https://github.com/prometheus/prometheus/releases/download/v{{prom_version}}/prometheus-{{prom_version}}.linux-amd64.tar.gz
|
||||||
|
dest: /usr/share/prometheus
|
||||||
|
creates: /usr/share/prometheus/prometheus-{{prom_version}}.linux-amd64
|
||||||
|
remote_src: yes
|
||||||
|
|
||||||
|
|
||||||
|
- name: Create a symbolic link
|
||||||
|
ansible.builtin.file:
|
||||||
|
src: /usr/share/prometheus/prometheus-{{prom_version}}.linux-amd64/prometheus
|
||||||
|
dest: /usr/sbin/prometheus
|
||||||
|
owner: prometheus
|
||||||
|
group: prometheus
|
||||||
|
state: link
|
||||||
|
- name: Create a symbolic link
|
||||||
|
ansible.builtin.file:
|
||||||
|
src: /usr/share/prometheus/prometheus-{{prom_version}}.linux-amd64/promtool
|
||||||
|
dest: /usr/sbin/promtool
|
||||||
|
owner: prometheus
|
||||||
|
group: prometheus
|
||||||
|
state: link
|
||||||
|
|
||||||
|
- name: Copy prometheus.yaml
|
||||||
|
register: prometheus_config_file
|
||||||
|
copy:
|
||||||
|
src: ../../files/prometheus/prometheus.yaml
|
||||||
|
dest: /etc/prometheus/prometheus.yaml
|
||||||
|
notify:
|
||||||
|
- reload prometheus
|
||||||
|
- name: Copy web-config
|
||||||
|
register: web_config_file
|
||||||
|
copy:
|
||||||
|
src: ../../files/prometheus/web-config.yaml
|
||||||
|
dest: /etc/prometheus/web-config.yaml
|
||||||
|
notify:
|
||||||
|
- reload prometheus
|
||||||
|
- name: Copy rules.yaml
|
||||||
|
register: rules_file
|
||||||
|
copy:
|
||||||
|
src: ../../files/prometheus/rules.yaml
|
||||||
|
dest: /etc/prometheus/rules.yaml
|
||||||
|
notify:
|
||||||
|
- reload prometheus
|
||||||
|
- name: Copy alerts.yaml
|
||||||
|
register: alerts_file
|
||||||
|
copy:
|
||||||
|
src: ../../files/prometheus/alerts.yaml
|
||||||
|
dest: /etc/prometheus/alerts.yaml
|
||||||
|
notify:
|
||||||
|
- reload prometheus
|
||||||
|
|
||||||
|
|
||||||
|
- name: Copy prometheus.service
|
||||||
|
register: prometheus_service_file
|
||||||
|
copy:
|
||||||
|
src: ../../files/prometheus/prometheus.service
|
||||||
|
dest: /etc/systemd/system/prometheus.service
|
||||||
|
|
||||||
|
|
||||||
|
- name: ensure service
|
||||||
|
ansible.builtin.systemd_service:
|
||||||
|
name: prometheus
|
||||||
|
state: started
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# - name: reload service
|
||||||
|
# ansible.builtin.systemd_service:
|
||||||
|
# name: prometheus
|
||||||
|
# state: reloaded
|
||||||
|
# when:
|
||||||
|
# - rules_file.changed
|
||||||
|
# - alerts_file.changed
|
||||||
|
# - prometheus_service_file.changed
|
||||||
|
# - web_config_file.changed
|
||||||
|
|
||||||
|
- name: Just force systemd to reread configs
|
||||||
|
ansible.builtin.systemd_service:
|
||||||
|
daemon_reload: true
|
||||||
|
when: prometheus_service_file.changed
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
handlers:
|
||||||
|
- name: reload prometheus
|
||||||
|
ansible.builtin.systemd_service:
|
||||||
|
name: prometheus
|
||||||
|
state: reloaded
|
||||||
|
|
||||||
|
#### TODO как откатывать неудачную проверку promtool'ом ?
|
@ -6,4 +6,4 @@ collections:
|
|||||||
version: 4.1.0
|
version: 4.1.0
|
||||||
|
|
||||||
- name: prometheus.prometheus
|
- name: prometheus.prometheus
|
||||||
version: 0.18.0
|
version: 0.22.0
|
Loading…
x
Reference in New Issue
Block a user