From f91004fff63de435bf58beb40ea940ba1603c193 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Sun, 19 Sep 2021 12:03:52 +0200 Subject: [PATCH] feat(monitoring): add more rules, run grafana in retiolum --- configs/monitoring/pull.nix | 65 +++++++++++++++++++++++++------- configs/monitoring/template.tmpl | 25 ------------ 2 files changed, 51 insertions(+), 39 deletions(-) delete mode 100644 configs/monitoring/template.tmpl diff --git a/configs/monitoring/pull.nix b/configs/monitoring/pull.nix index f439b29..2d47737 100644 --- a/configs/monitoring/pull.nix +++ b/configs/monitoring/pull.nix @@ -5,14 +5,12 @@ in { services.grafana = { enable = true; - domain = "monitoring.xn--kiern-0qa.de"; - port = 2342; + domain = "grafana.kmein.r"; + port = 9444; addr = "127.0.0.1"; }; services.nginx.virtualHosts.${config.services.grafana.domain} = { - enableACME = true; - forceSSL = true; locations."/" = { proxyPass = "http://127.0.0.1:${toString config.services.grafana.port}"; proxyWebsockets = true; @@ -25,32 +23,56 @@ in rules = [ { alert = "ServiceDown"; - for = "5m"; expr = ''node_systemd_unit_state{state="failed"} == 1''; - labels.severity = "warning"; annotations = { - summary = "{{ $labels.name }} is down."; + summary = "{{$labels.job}}: Service {{$labels.name}} failed to start."; }; } { alert = "RootPartitionFull"; - for = "30m"; - expr = ''(node_filesystem_avail_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < ${toString diskFreeThreshold}''; - labels.severity = "warning"; + for = "10m"; + expr = ''(node_filesystem_free_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < ${toString diskFreeThreshold}''; annotations = { - summary = "{{ $labels.job }} root disk full."; + summary = "{{ $labels.job }}: Filesystem is running out of space soon."; description = ''The root disk of {{ $labels.job }} has {{ $value | printf "%.2f" }}% free disk space (threshold at ${toString diskFreeThreshold}%).''; }; } + { + alert = "RootPartitionFullWeek"; + for = "1h"; + expr = ''node_filesystem_free_bytes{mountpoint="/"} '' + + ''and predict_linear(node_filesystem_free_bytes{mountpoint="/"}[2d], 7*24*3600) <= 0''; + annotations = { + summary = "{{$labels.job}}: Filesystem is running out of space in 7 days."; + }; + } + { + alert = "HighLoad"; + expr = ''node_load15 / on(job) count(node_cpu_seconds_total{mode="system"}) by (job) >= 1.0''; + for = "10m"; + annotations = { + summary = "{{$labels.job}}: Running on high load: {{$value}}"; + }; + } + { + alert = "HighRAM"; + expr = "node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1"; + for = "1h"; + annotations.summary = "{{$labels.job}}: Using lots of RAM."; + } { alert = "HostDown"; expr = ''up == 0''; for = "5m"; - labels.severity = "warning"; annotations = { summary = "Host {{ $labels.job }} down for 5 minutes."; }; } + { + alert = "Reboot"; + expr = "time() - node_boot_time_seconds < 300"; + annotations.summary = "{{$labels.alias}}: Reboot"; + } ]; }]; })]; @@ -83,7 +105,20 @@ in --alertmanager.url=http://localhost:9093 --log.level=info \ --store=bolt --bolt.path=/var/lib/alertbot/bot.db \ --listen.addr="0.0.0.0:16320" \ - --template.paths=${./template.tmpl}''; + --template.paths=${pkgs.writeText "template.tmpl" '' + {{ define "telegram.default" }} + {{range .Alerts -}} + {{ if eq .Status "firing" }} + ⚠ {{ index .Annotations "summary"}} + {{ index .Annotations "description" }} + + See on Grafana. + {{ else -}} + 😌 {{ index .Annotations "summary"}} + {{- end }} + {{end -}} + {{end}} + ''}''; }; }; @@ -126,7 +161,9 @@ in } ]; - networking.firewall.allowedTCPPorts = [ lokiConfig.server.http_listen_port ]; + networking.firewall.allowedTCPPorts = [ + lokiConfig.server.http_listen_port + ]; services.loki = { enable = true; diff --git a/configs/monitoring/template.tmpl b/configs/monitoring/template.tmpl deleted file mode 100644 index 9f9b72e..0000000 --- a/configs/monitoring/template.tmpl +++ /dev/null @@ -1,25 +0,0 @@ -{{ define "telegram.default" }} -{{range .Alerts -}} -{{ $severity := index .Labels "severity" }} -{{ $desc := "" }} -{{ $grafana := "d/alpUteInz/niveum" }} -{{ if eq .Status "firing" }} - {{ $desc = index .Annotations "description" }} - {{ $grafana = index .Annotations "url" }} - {{- if eq $severity "critical" -}} - 🔥 CRITICAL 🔥 - {{- else if eq $severity "warning" -}} - ⚠ WARNING ⚠ - {{- else -}} - {{ $severity }} - {{- end -}} -{{ else -}} - {{ $desc = "The issue has been resolved. 😌" }} - 🎉 RESOLVED 🎉 -{{- end }} -{{ index .Labels "alertname"}} -{{ index .Annotations "summary"}}: {{ $desc }} - -See on Grafana. -{{end -}} -{{end}}