From cf72c1df1f08765c4b956b5f94874278e106ee35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?=
Date: Sun, 27 Oct 2024 15:44:52 +0100
Subject: [PATCH] alertmanager: more alerts?

Rename ServiceDown to HostSystemdServiceCrashed, add host-level alerts
for network throughput, disk read/write rate and inode exhaustion, and
name the affected host via the `instance` label instead of `job` in
every summary.

HighLoad now matches on (instance, job) instead of on (job). In a
one-to-one vector match only the labels listed in on() end up in the
result, so with on(job) the alert carried no `instance` label and the
new summary would have rendered with an empty host name.
---
Notes (ignored by git am):
  * Leading whitespace inside the hunks was reconstructed. If the
    context does not match the tree, apply with
    `git am --ignore-whitespace`.
  * The `index` line still carries the blob ids of the original patch.

 systems/makanek/monitoring/default.nix | 58 +++++++++++++++++++++-----
 1 file changed, 47 insertions(+), 11 deletions(-)

diff --git a/systems/makanek/monitoring/default.nix b/systems/makanek/monitoring/default.nix
index 086ad13..4069707 100644
--- a/systems/makanek/monitoring/default.nix
+++ b/systems/makanek/monitoring/default.nix
@@ -89,10 +89,10 @@ in {
           name = "niveum";
           rules = [
             {
-              alert = "ServiceDown";
-              expr = ''node_systemd_unit_state{state="failed"} == 1'';
+              alert = "HostSystemdServiceCrashed";
+              expr = ''(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'';
               annotations = {
-                summary = "{{$labels.name}} failed on {{$labels.job}}";
+                summary = "{{$labels.name}} failed on {{$labels.instance}}";
               };
             }
             {
@@ -100,7 +100,7 @@ in {
               for = "10m";
               expr = ''(node_filesystem_free_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < ${toString diskFreeThreshold}'';
               annotations = {
-                summary = ''{{ $labels.job }} running out of space: {{ $value | printf "%.2f" }}% < ${toString diskFreeThreshold}%'';
+                summary = ''{{ $labels.instance }} running out of space: {{ $value | printf "%.2f" }}% < ${toString diskFreeThreshold}%'';
               };
             }
             {
@@ -110,7 +110,7 @@ in {
                 ''node_filesystem_free_bytes{mountpoint="/"} ''
                 + ''and predict_linear(node_filesystem_free_bytes{mountpoint="/"}[2d], 7*24*3600) <= 0'';
               annotations = {
-                summary = "{{$labels.job}} running out of space in 7 days";
+                summary = "{{$labels.instance}} running out of space in 7 days";
               };
             }
             {
@@ -118,32 +118,68 @@ in {
-              expr = ''node_load15 / on(job) count(node_cpu_seconds_total{mode="system"}) by (job) >= 1.0'';
+              expr = ''node_load15 / on(instance, job) count(node_cpu_seconds_total{mode="system"}) by (instance, job) >= 1.0'';
               for = "10m";
               annotations = {
-                summary = "{{$labels.job}} running on high load: {{$value}}";
+                summary = "{{$labels.instance}} running on high load: {{$value}}";
               };
             }
+            {
+              alert = "HostUnusualNetworkThroughputIn";
+              expr = ''(rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100'';
+              for = "5m";
+              annotations.summary = "Host unusual network throughput in (instance {{ $labels.instance }})";
+            }
+            {
+              alert = "HostUnusualNetworkThroughputOut";
+              expr = ''(rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100'';
+              for = "5m";
+              annotations.summary = "Host unusual network throughput out (instance {{ $labels.instance }})";
+            }
+            {
+              alert = "HostUnusualDiskReadRate";
+              expr = ''(rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50'';
+              for = "5m";
+              annotations.summary = "Host unusual disk read rate (instance {{ $labels.instance }})";
+            }
+            {
+              alert = "HostUnusualDiskWriteRate";
+              expr = ''(rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50'';
+              for = "2m";
+              annotations.summary = "Host unusual disk write rate (instance {{ $labels.instance }})";
+            }
+            {
+              alert = "HostOutOfInodes";
+              expr = ''node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'';
+              for = "2m";
+              annotations.summary = "Host out of inodes (instance {{ $labels.instance }})";
+            }
+            {
+              alert = "HostInodesWillFillIn24Hours";
+              expr = ''node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0'';
+              for = "2m";
+              annotations.summary = "Host inodes will fill in 24 hours (instance {{ $labels.instance }})";
+            }
             {
               alert = "HighRAM";
               expr = "node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1";
               for = "1h";
-              annotations.summary = "{{$labels.job}} using lots of RAM";
+              annotations.summary = "{{$labels.instance}} using lots of RAM";
             }
             {
               alert = "UptimeMonster";
               expr = "time() - node_boot_time_seconds > 2592000";
-              annotations.summary = "uptime monster {{$labels.job}} up for more than 30 days";
+              annotations.summary = "uptime monster {{$labels.instance}} up for more than 30 days";
             }
             {
               alert = "HostDown";
               expr = ''up == 0'';
               for = "5m";
               annotations = {
-                summary = "{{ $labels.job }} seeming down since 5 minutes";
+                summary = "{{ $labels.instance }} seeming down since 5 minutes";
               };
             }
             {
               alert = "Reboot";
               expr = "time() - node_boot_time_seconds < 300";
-              annotations.summary = "{{$labels.job}} rebooted";
+              annotations.summary = "{{$labels.instance}} rebooted";
             }
             {
               alert = "ProbeFailed";