From fe8af09148cb6a20bb0268691d9862769f92a8eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Fri, 28 Jan 2022 09:04:40 +0100 Subject: [PATCH] feat(monitoring): improve messages --- systems/makanek/monitoring/default.nix | 27 +++++++++++++------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/systems/makanek/monitoring/default.nix b/systems/makanek/monitoring/default.nix index c95f990..3054841 100644 --- a/systems/makanek/monitoring/default.nix +++ b/systems/makanek/monitoring/default.nix @@ -27,7 +27,7 @@ in alert = "ServiceDown"; expr = ''node_systemd_unit_state{state="failed"} == 1''; annotations = { - summary = "{{$labels.job}}: Service {{$labels.name}} failed to start."; + summary = "{{$labels.name}} failed on {{$labels.job}}"; }; } { @@ -35,8 +35,7 @@ in for = "10m"; expr = ''(node_filesystem_free_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < ${toString diskFreeThreshold}''; annotations = { - summary = "{{ $labels.job }}: Filesystem is running out of space soon."; - description = ''The root disk of {{ $labels.job }} has {{ $value | printf "%.2f" }}% free disk space (threshold at ${toString diskFreeThreshold}%).''; + summary = ''{{ $labels.job }} running out of space: {{ $value | printf "%.2f" }}% < ${toString diskFreeThreshold}%''; }; } { @@ -45,7 +44,7 @@ in expr = ''node_filesystem_free_bytes{mountpoint="/"} '' + ''and predict_linear(node_filesystem_free_bytes{mountpoint="/"}[2d], 7*24*3600) <= 0''; annotations = { - summary = "{{$labels.job}}: Filesystem is running out of space in 7 days."; + summary = "{{$labels.job}} running out of space in 7 days"; }; } { @@ -53,62 +52,62 @@ in expr = ''node_load15 / on(job) count(node_cpu_seconds_total{mode="system"}) by (job) >= 1.0''; for = "10m"; annotations = { - summary = "{{$labels.job}}: Running on high load: {{$value}}"; + summary = "{{$labels.job}} running on high load: {{$value}}"; }; } { alert = "HighRAM"; expr = "node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1"; for = "1h"; - annotations.summary = "{{$labels.job}}: Using lots of RAM."; + annotations.summary = "{{$labels.job}} using lots of RAM"; } { alert = "UptimeMonster"; expr = "time() - node_boot_time_seconds > 2592000"; - annotations.summary = "{{$labels.job}}: up for more than 30 days."; + annotations.summary = "uptime monster {{$labels.job}} up for more than 30 days"; } { alert = "HostDown"; expr = ''up == 0''; for = "5m"; annotations = { - summary = "Host {{ $labels.job }} down for 5 minutes."; + summary = "{{ $labels.job }} seeming down since 5 minutes"; }; } { alert = "Reboot"; expr = "time() - node_boot_time_seconds < 300"; - annotations.summary = "{{$labels.job}}: Reboot"; + annotations.summary = "{{$labels.job}} rebooted"; } { alert = "ProbeFailed"; expr = "probe_success == 0"; for = "5m"; - annotations.summary = "{{$labels.instance}}: probe failed"; + annotations.summary = "HTTP probe failed for {{$labels.instance}}"; } { alert = "SlowProbe"; expr = "avg_over_time(probe_http_duration_seconds[1m]) > 1"; for = "5m"; - annotations.summary = "{{$labels.instance}}: HTTP probe slow"; + annotations.summary = "HTTP probe slow for {{$labels.instance}}"; } { alert = "HttpStatusCode"; expr = "probe_http_status_code != 0 AND (probe_http_status_code <= 199 OR probe_http_status_code >= 400)"; for = "5m"; - annotations.summary = "{{$labels.instance}}: status code {{$value}}"; + annotations.summary = "status code {{$value}} for {{$labels.instance}}"; } { alert = "SslExpirySoon"; expr = "probe_ssl_earliest_cert_expiry - time() < 86400 * 30"; for = "5m"; - annotations.summary = "{{$labels.instance}}: SSL certificate expires in 30 days"; + annotations.summary = "SSL certificate for {{$labels.instance}} expires in 30 days"; } { alert = "SslExpiry"; expr = "probe_ssl_earliest_cert_expiry - time() <= 0"; for = "5m"; - annotations.summary = "{{$labels.instance}}: SSL certificate has expired"; + annotations.summary = "SSL certificate for {{$labels.instance}} has expired"; } ]; }];