From 6e0026ed5cb9809399e49ad04e31deecc5994824 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?=
Date: Thu, 7 Aug 2025 10:13:01 +0200
Subject: [PATCH] alertmanager: alert lassulus for mastodon outtime

Add a "Mastodon" alert rule for https://social.krebsco.de and route it
to a new "lassulus" Alertmanager receiver, backed by a second
matrix-hook instance listening on port 9089.

The rule compares probe_success against 0: a bare selector returns a
series whether the probe passes or fails, so without the comparison the
alert would fire for as long as the target is scraped.

The prometheus rules block is also re-indented by the formatter.

---
 systems/makanek/monitoring/default.nix | 329 ++++++++++++++-----------
 1 file changed, 190 insertions(+), 139 deletions(-)

diff --git a/systems/makanek/monitoring/default.nix b/systems/makanek/monitoring/default.nix
index 42f3654..1ec4ab6 100644
--- a/systems/makanek/monitoring/default.nix
+++ b/systems/makanek/monitoring/default.nix
@@ -3,11 +3,13 @@
   config,
   pkgs,
   ...
-}: let
+}:
+let
   lokiConfig = import ./loki.nix;
   blackboxConfig = import ./blackbox.nix;
   inherit (import ../../../lib) restic;
-in {
+in
+{
   services.grafana = {
     enable = true;
     settings = {
@@ -80,143 +82,150 @@ in {
     }
   ];
 
-  services.prometheus.rules = let
-    diskFreeThreshold = 10;
-  in [
-    (builtins.toJSON {
-      groups = [
-        {
-          name = "niveum";
-          rules = [
-            {
-              alert = "HostSystemdServiceCrashed";
-              expr = ''(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'';
-              annotations = {
-                description = "{{$labels.name}} failed on {{$labels.instance}}";
-              };
-            }
-            {
-              alert = "RootPartitionFull";
-              for = "10m";
-              expr = ''(node_filesystem_free_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < ${toString diskFreeThreshold}'';
-              annotations = {
-                description = ''{{ $labels.instance }} running out of space: {{ $value | printf "%.2f" }}% < ${toString diskFreeThreshold}%'';
-              };
-            }
-            {
-              alert = "RootPartitionFullWeek";
-              for = "1h";
-              expr =
-                ''node_filesystem_free_bytes{mountpoint="/"} ''
-                + ''and predict_linear(node_filesystem_free_bytes{mountpoint="/"}[2d], 7*24*3600) <= 0'';
-              annotations = {
-                description = "{{$labels.instance}} running out of space in 7 days";
-              };
-            }
-            {
-              alert = "HighLoad";
-              expr = ''node_load15 / on(job) count(node_cpu_seconds_total{mode="system"}) by (job) >= 1.0'';
-              for = "10m";
-              annotations = {
-                description = "{{$labels.instance}} running on high load: {{$value}}";
-              };
-            }
-            {
-              alert = "HostUnusualNetworkThroughputIn";
-              expr = ''(rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100'';
-              for = "5m";
-              annotations.description = "Host unusual network throughput in (instance {{ $labels.instance }})";
-            }
-            {
-              alert = "HostUnusualNetworkThroughputOut";
-              expr = ''(rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100'';
-              for = "5m";
-              annotations.description = "Host unusual network throughput out (instance {{ $labels.instance }})";
-            }
-            {
-              alert = "HostUnusualDiskReadRate";
-              expr = ''(rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50'';
-              for = "5m";
-              annotations.description = "Host unusual disk read rate (instance {{ $labels.instance }})";
-            }
-            {
-              alert = "HostUnusualDiskWriteRate";
-              expr = ''(rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50'';
-              for = "2m";
-              annotations.description = "Host unusual disk write rate (instance {{ $labels.instance }})";
-            }
-            {
-              alert = "HostOutOfInodes";
-              expr = ''node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'';
-              for = "2m";
-              annotations.description = "Host out of inodes (instance {{ $labels.instance }})";
-            }
-            {
-              alert = "HostInodesWillFillIn24Hours";
-              expr = ''node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0'';
-              for = "2m";
-              annotations.description = "Host inodes will fill in 24 hours (instance {{ $labels.instance }})";
-            }
-            {
-              alert = "HighRAM";
-              expr = "node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1";
-              for = "1h";
-              annotations.description = "{{$labels.instance}} using lots of RAM";
-            }
-            {
-              alert = "UptimeMonster";
-              expr = "time() - node_boot_time_seconds > 2592000";
-              annotations.description = "uptime monster {{$labels.instance}} up for more than 30 days";
-            }
-            {
-              alert = "HostDown";
-              expr = ''up == 0'';
-              for = "5m";
-              annotations = {
-                description = "{{ $labels.instance }} seeming down since 5 minutes";
-              };
-            }
-            {
-              alert = "Reboot";
-              expr = "time() - node_boot_time_seconds < 300";
-              annotations.description = "{{$labels.instance}} rebooted";
-            }
-            {
-              alert = "ProbeFailed";
-              expr = "probe_success == 0";
-              for = "5m";
-              annotations.description = "HTTP probe failed for {{$labels.instance}}";
-            }
-            {
-              alert = "SlowProbe";
-              expr = "avg_over_time(probe_http_duration_seconds[1m]) > 1";
-              for = "5m";
-              annotations.description = "HTTP probe slow for {{$labels.instance}}";
-            }
-            {
-              alert = "HttpStatusCode";
-              expr = "probe_http_status_code != 0 AND (probe_http_status_code <= 199 OR probe_http_status_code >= 400)";
-              for = "5m";
-              annotations.description = "status code {{$value}} for {{$labels.instance}}";
-            }
-            {
-              alert = "SslExpirySoon";
-              expr = "probe_ssl_earliest_cert_expiry - time() < 86400 * 30";
-              for = "5m";
-              annotations.description = "SSL certificate for {{$labels.instance}} expires in 30 days";
-            }
-            {
-              alert = "SslExpiry";
-              expr = "probe_ssl_earliest_cert_expiry - time() <= 0";
-              for = "5m";
-              annotations.description = "SSL certificate for {{$labels.instance}} has expired";
-            }
-          ];
-        }
-      ];
-    })
-  ];
-
+  services.prometheus.rules =
+    let
+      diskFreeThreshold = 10;
+    in
+    [
+      (builtins.toJSON {
+        groups = [
+          {
+            name = "niveum";
+            rules = [
+              {
+                alert = "HostSystemdServiceCrashed";
+                expr = ''(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'';
+                annotations = {
+                  description = "{{$labels.name}} failed on {{$labels.instance}}";
+                };
+              }
+              {
+                alert = "RootPartitionFull";
+                for = "10m";
+                expr = ''(node_filesystem_free_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < ${toString diskFreeThreshold}'';
+                annotations = {
+                  description = ''{{ $labels.instance }} running out of space: {{ $value | printf "%.2f" }}% < ${toString diskFreeThreshold}%'';
+                };
+              }
+              {
+                alert = "RootPartitionFullWeek";
+                for = "1h";
+                expr =
+                  ''node_filesystem_free_bytes{mountpoint="/"} ''
+                  + ''and predict_linear(node_filesystem_free_bytes{mountpoint="/"}[2d], 7*24*3600) <= 0'';
+                annotations = {
+                  description = "{{$labels.instance}} running out of space in 7 days";
+                };
+              }
+              {
+                alert = "HighLoad";
+                expr = ''node_load15 / on(job) count(node_cpu_seconds_total{mode="system"}) by (job) >= 1.0'';
+                for = "10m";
+                annotations = {
+                  description = "{{$labels.instance}} running on high load: {{$value}}";
+                };
+              }
+              {
+                alert = "HostUnusualNetworkThroughputIn";
+                expr = ''(rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100'';
+                for = "5m";
+                annotations.description = "Host unusual network throughput in (instance {{ $labels.instance }})";
+              }
+              {
+                alert = "HostUnusualNetworkThroughputOut";
+                expr = ''(rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100'';
+                for = "5m";
+                annotations.description = "Host unusual network throughput out (instance {{ $labels.instance }})";
+              }
+              {
+                alert = "HostUnusualDiskReadRate";
+                expr = ''(rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50'';
+                for = "5m";
+                annotations.description = "Host unusual disk read rate (instance {{ $labels.instance }})";
+              }
+              {
+                alert = "HostUnusualDiskWriteRate";
+                expr = ''(rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50'';
+                for = "2m";
+                annotations.description = "Host unusual disk write rate (instance {{ $labels.instance }})";
+              }
+              {
+                alert = "HostOutOfInodes";
+                expr = ''node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'';
+                for = "2m";
+                annotations.description = "Host out of inodes (instance {{ $labels.instance }})";
+              }
+              {
+                alert = "HostInodesWillFillIn24Hours";
+                expr = ''node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0'';
+                for = "2m";
+                annotations.description = "Host inodes will fill in 24 hours (instance {{ $labels.instance }})";
+              }
+              {
+                alert = "HighRAM";
+                expr = "node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1";
+                for = "1h";
+                annotations.description = "{{$labels.instance}} using lots of RAM";
+              }
+              {
+                alert = "UptimeMonster";
+                expr = "time() - node_boot_time_seconds > 2592000";
+                annotations.description = "uptime monster {{$labels.instance}} up for more than 30 days";
+              }
+              {
+                alert = "HostDown";
+                expr = ''up == 0'';
+                for = "5m";
+                annotations = {
+                  description = "{{ $labels.instance }} seeming down since 5 minutes";
+                };
+              }
+              {
+                alert = "Reboot";
+                expr = "time() - node_boot_time_seconds < 300";
+                annotations.description = "{{$labels.instance}} rebooted";
+              }
+              {
+                alert = "Mastodon";
+                expr = ''probe_success{instance="https://social.krebsco.de"} == 0'';
+                for = "5m";
+                annotations.description = "Mastodon instance {{$labels.instance}} is down";
+              }
+              {
+                alert = "ProbeFailed";
+                expr = "probe_success == 0";
+                for = "5m";
+                annotations.description = "HTTP probe failed for {{$labels.instance}}";
+              }
+              {
+                alert = "SlowProbe";
+                expr = "avg_over_time(probe_http_duration_seconds[1m]) > 1";
+                for = "5m";
+                annotations.description = "HTTP probe slow for {{$labels.instance}}";
+              }
+              {
+                alert = "HttpStatusCode";
+                expr = "probe_http_status_code != 0 AND (probe_http_status_code <= 199 OR probe_http_status_code >= 400)";
+                for = "5m";
+                annotations.description = "status code {{$value}} for {{$labels.instance}}";
+              }
+              {
+                alert = "SslExpirySoon";
+                expr = "probe_ssl_earliest_cert_expiry - time() < 86400 * 30";
+                for = "5m";
+                annotations.description = "SSL certificate for {{$labels.instance}} expires in 30 days";
+              }
+              {
+                alert = "SslExpiry";
+                expr = "probe_ssl_earliest_cert_expiry - time() <= 0";
+                for = "5m";
+                annotations.description = "SSL certificate for {{$labels.instance}} has expired";
+              }
+            ];
+          }
+        ];
+      })
+    ];
 
   # ref https://github.com/Mic92/dotfiles/blob/f44bac5dd6970ed3fbb4feb906917331ec3c2be5/machines/eva/modules/prometheus/default.nix
   systemd.services.matrix-hook = {
@@ -246,6 +255,33 @@ in {
     };
   };
 
+  systemd.services.matrix-hook-lassulus = {
+    description = "Matrix Hook";
+    after = [ "network.target" ];
+    wantedBy = [ "multi-user.target" ];
+    environment = {
+      HTTP_ADDRESS = "[::1]";
+      HTTP_PORT = "9089";
+      MX_HOMESERVER = "https://matrix.4d2.org";
+      MX_ID = "@lakai:4d2.org";
+      MX_ROOMID = "!MJAGqBAOKZGMywzwkI:lassul.us";
+      MX_MSG_TEMPLATE = "${pkgs.matrix-hook}/message.html.tmpl";
+    };
+    serviceConfig = {
+      EnvironmentFile = [
+        # format: MX_TOKEN=
+        config.age.secrets.matrix-token-lakai-env.path
+      ];
+      Type = "simple";
+      ExecStart = "${pkgs.matrix-hook}/bin/matrix-hook";
+      Restart = "always";
+      RestartSec = "10";
+      DynamicUser = true;
+      User = "matrix-hook";
+      Group = "matrix-hook";
+    };
+  };
+
   age.secrets = {
     matrix-token-lakai-env.file = ../../../secrets/matrix-token-lakai-env.age;
   };
@@ -260,8 +296,23 @@ in {
         group_wait = "30s";
         repeat_interval = "24h";
         receiver = "matrix";
+        routes = [
+          {
+            receiver = "lassulus";
+            matchers = [ "alertname = \"Mastodon\"" ];
+          }
+        ];
       };
       receivers = [
+        {
+          name = "lassulus";
+          webhook_configs = [
+            {
+              url = "http://localhost:9089/alert";
+              max_alerts = 5;
+            }
+          ];
+        }
         {
           name = "matrix";
           webhook_configs = [