1
0
mirror of https://github.com/kmein/niveum synced 2026-03-21 04:11:07 +01:00

alertmanager: alert lassulus for mastodon downtime

This commit is contained in:
2025-08-07 10:13:01 +02:00
parent d92f382b9a
commit 6e0026ed5c

View File

@@ -3,11 +3,13 @@
config, config,
pkgs, pkgs,
... ...
}: let }:
let
lokiConfig = import ./loki.nix; lokiConfig = import ./loki.nix;
blackboxConfig = import ./blackbox.nix; blackboxConfig = import ./blackbox.nix;
inherit (import ../../../lib) restic; inherit (import ../../../lib) restic;
in { in
{
services.grafana = { services.grafana = {
enable = true; enable = true;
settings = { settings = {
@@ -80,143 +82,150 @@ in {
} }
]; ];
services.prometheus.rules = let services.prometheus.rules =
diskFreeThreshold = 10; let
in [ diskFreeThreshold = 10;
(builtins.toJSON { in
groups = [ [
{ (builtins.toJSON {
name = "niveum"; groups = [
rules = [ {
{ name = "niveum";
alert = "HostSystemdServiceCrashed"; rules = [
expr = ''(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}''; {
annotations = { alert = "HostSystemdServiceCrashed";
description = "{{$labels.name}} failed on {{$labels.instance}}"; expr = ''(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'';
}; annotations = {
} description = "{{$labels.name}} failed on {{$labels.instance}}";
{ };
alert = "RootPartitionFull"; }
for = "10m"; {
expr = ''(node_filesystem_free_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < ${toString diskFreeThreshold}''; alert = "RootPartitionFull";
annotations = { for = "10m";
description = ''{{ $labels.instance }} running out of space: {{ $value | printf "%.2f" }}% < ${toString diskFreeThreshold}%''; expr = ''(node_filesystem_free_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < ${toString diskFreeThreshold}'';
}; annotations = {
} description = ''{{ $labels.instance }} running out of space: {{ $value | printf "%.2f" }}% < ${toString diskFreeThreshold}%'';
{ };
alert = "RootPartitionFullWeek"; }
for = "1h"; {
expr = alert = "RootPartitionFullWeek";
''node_filesystem_free_bytes{mountpoint="/"} '' for = "1h";
+ ''and predict_linear(node_filesystem_free_bytes{mountpoint="/"}[2d], 7*24*3600) <= 0''; expr =
annotations = { ''node_filesystem_free_bytes{mountpoint="/"} ''
description = "{{$labels.instance}} running out of space in 7 days"; + ''and predict_linear(node_filesystem_free_bytes{mountpoint="/"}[2d], 7*24*3600) <= 0'';
}; annotations = {
} description = "{{$labels.instance}} running out of space in 7 days";
{ };
alert = "HighLoad"; }
expr = ''node_load15 / on(job) count(node_cpu_seconds_total{mode="system"}) by (job) >= 1.0''; {
for = "10m"; alert = "HighLoad";
annotations = { expr = ''node_load15 / on(job) count(node_cpu_seconds_total{mode="system"}) by (job) >= 1.0'';
description = "{{$labels.instance}} running on high load: {{$value}}"; for = "10m";
}; annotations = {
} description = "{{$labels.instance}} running on high load: {{$value}}";
{ };
alert = "HostUnusualNetworkThroughputIn"; }
expr = ''(rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100''; {
for = "5m"; alert = "HostUnusualNetworkThroughputIn";
annotations.description = "Host unusual network throughput in (instance {{ $labels.instance }})"; expr = ''(rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100'';
} for = "5m";
{ annotations.description = "Host unusual network throughput in (instance {{ $labels.instance }})";
alert = "HostUnusualNetworkThroughputOut"; }
expr = ''(rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100''; {
for = "5m"; alert = "HostUnusualNetworkThroughputOut";
annotations.description = "Host unusual network throughput out (instance {{ $labels.instance }})"; expr = ''(rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100'';
} for = "5m";
{ annotations.description = "Host unusual network throughput out (instance {{ $labels.instance }})";
alert = "HostUnusualDiskReadRate"; }
expr = ''(rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50''; {
for = "5m"; alert = "HostUnusualDiskReadRate";
annotations.description = "Host unusual disk read rate (instance {{ $labels.instance }})"; expr = ''(rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50'';
} for = "5m";
{ annotations.description = "Host unusual disk read rate (instance {{ $labels.instance }})";
alert = "HostUnusualDiskWriteRate"; }
expr = ''(rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50''; {
for = "2m"; alert = "HostUnusualDiskWriteRate";
annotations.description = "Host unusual disk write rate (instance {{ $labels.instance }})"; expr = ''(rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50'';
} for = "2m";
{ annotations.description = "Host unusual disk write rate (instance {{ $labels.instance }})";
alert = "HostOutOfInodes"; }
expr = ''node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0''; {
for = "2m"; alert = "HostOutOfInodes";
annotations.description = "Host out of inodes (instance {{ $labels.instance }})"; expr = ''node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'';
} for = "2m";
{ annotations.description = "Host out of inodes (instance {{ $labels.instance }})";
alert = "HostInodesWillFillIn24Hours"; }
expr = ''node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0''; {
for = "2m"; alert = "HostInodesWillFillIn24Hours";
annotations.description = "Host inodes will fill in 24 hours (instance {{ $labels.instance }})"; expr = ''node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0'';
} for = "2m";
{ annotations.description = "Host inodes will fill in 24 hours (instance {{ $labels.instance }})";
alert = "HighRAM"; }
expr = "node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1"; {
for = "1h"; alert = "HighRAM";
annotations.description = "{{$labels.instance}} using lots of RAM"; expr = "node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1";
} for = "1h";
{ annotations.description = "{{$labels.instance}} using lots of RAM";
alert = "UptimeMonster"; }
expr = "time() - node_boot_time_seconds > 2592000"; {
annotations.description = "uptime monster {{$labels.instance}} up for more than 30 days"; alert = "UptimeMonster";
} expr = "time() - node_boot_time_seconds > 2592000";
{ annotations.description = "uptime monster {{$labels.instance}} up for more than 30 days";
alert = "HostDown"; }
expr = ''up == 0''; {
for = "5m"; alert = "HostDown";
annotations = { expr = ''up == 0'';
description = "{{ $labels.instance }} seeming down since 5 minutes"; for = "5m";
}; annotations = {
} description = "{{ $labels.instance }} seeming down since 5 minutes";
{ };
alert = "Reboot"; }
expr = "time() - node_boot_time_seconds < 300"; {
annotations.description = "{{$labels.instance}} rebooted"; alert = "Reboot";
} expr = "time() - node_boot_time_seconds < 300";
{ annotations.description = "{{$labels.instance}} rebooted";
alert = "ProbeFailed"; }
expr = "probe_success == 0"; {
for = "5m"; alert = "Mastodon";
annotations.description = "HTTP probe failed for {{$labels.instance}}"; expr = ''probe_success{instance="https://social.krebsco.de"} == 0''; # compare with 0: a bare selector returns the series on success too, so the alert would fire whenever the probe exists
} for = "5m";
{ annotations.description = "Mastodon instance {{$labels.instance}} is down";
alert = "SlowProbe"; }
expr = "avg_over_time(probe_http_duration_seconds[1m]) > 1"; {
for = "5m"; alert = "ProbeFailed";
annotations.description = "HTTP probe slow for {{$labels.instance}}"; expr = "probe_success == 0";
} for = "5m";
{ annotations.description = "HTTP probe failed for {{$labels.instance}}";
alert = "HttpStatusCode"; }
expr = "probe_http_status_code != 0 AND (probe_http_status_code <= 199 OR probe_http_status_code >= 400)"; {
for = "5m"; alert = "SlowProbe";
annotations.description = "status code {{$value}} for {{$labels.instance}}"; expr = "avg_over_time(probe_http_duration_seconds[1m]) > 1";
} for = "5m";
{ annotations.description = "HTTP probe slow for {{$labels.instance}}";
alert = "SslExpirySoon"; }
expr = "probe_ssl_earliest_cert_expiry - time() < 86400 * 30"; {
for = "5m"; alert = "HttpStatusCode";
annotations.description = "SSL certificate for {{$labels.instance}} expires in 30 days"; expr = "probe_http_status_code != 0 AND (probe_http_status_code <= 199 OR probe_http_status_code >= 400)";
} for = "5m";
{ annotations.description = "status code {{$value}} for {{$labels.instance}}";
alert = "SslExpiry"; }
expr = "probe_ssl_earliest_cert_expiry - time() <= 0"; {
for = "5m"; alert = "SslExpirySoon";
annotations.description = "SSL certificate for {{$labels.instance}} has expired"; expr = "probe_ssl_earliest_cert_expiry - time() < 86400 * 30";
} for = "5m";
]; annotations.description = "SSL certificate for {{$labels.instance}} expires in 30 days";
} }
]; {
}) alert = "SslExpiry";
]; expr = "probe_ssl_earliest_cert_expiry - time() <= 0";
for = "5m";
annotations.description = "SSL certificate for {{$labels.instance}} has expired";
}
];
}
];
})
];
# ref https://github.com/Mic92/dotfiles/blob/f44bac5dd6970ed3fbb4feb906917331ec3c2be5/machines/eva/modules/prometheus/default.nix # ref https://github.com/Mic92/dotfiles/blob/f44bac5dd6970ed3fbb4feb906917331ec3c2be5/machines/eva/modules/prometheus/default.nix
systemd.services.matrix-hook = { systemd.services.matrix-hook = {
@@ -246,6 +255,33 @@ in {
}; };
}; };
# Separate matrix-hook instance whose HTTP_PORT (9089) matches the port the
# Alertmanager "lassulus" webhook receiver posts to.
systemd.services.matrix-hook-lassulus = {
  description = "Matrix Hook";
  wantedBy = [ "multi-user.target" ];
  after = [ "network.target" ];

  # Settings handed to matrix-hook through its environment.
  environment.HTTP_ADDRESS = "[::1]";
  environment.HTTP_PORT = "9089";
  environment.MX_HOMESERVER = "https://matrix.4d2.org";
  environment.MX_ID = "@lakai:4d2.org";
  environment.MX_ROOMID = "!MJAGqBAOKZGMywzwkI:lassul.us";
  environment.MX_MSG_TEMPLATE = "${pkgs.matrix-hook}/message.html.tmpl";

  serviceConfig.Type = "simple";
  serviceConfig.ExecStart = "${pkgs.matrix-hook}/bin/matrix-hook";
  # Always restart the unit, with RestartSec set to 10.
  serviceConfig.Restart = "always";
  serviceConfig.RestartSec = "10";
  serviceConfig.DynamicUser = true;
  serviceConfig.User = "matrix-hook";
  serviceConfig.Group = "matrix-hook";
  # The age secret file supplies the token; format: MX_TOKEN=<token>
  serviceConfig.EnvironmentFile = [ config.age.secrets.matrix-token-lakai-env.path ];
};
age.secrets = { age.secrets = {
matrix-token-lakai-env.file = ../../../secrets/matrix-token-lakai-env.age; matrix-token-lakai-env.file = ../../../secrets/matrix-token-lakai-env.age;
}; };
@@ -260,8 +296,23 @@ in {
group_wait = "30s"; group_wait = "30s";
repeat_interval = "24h"; repeat_interval = "24h";
receiver = "matrix"; receiver = "matrix";
# Child routes of the enclosing route: alerts whose alertname is "Mastodon"
# are sent to the "lassulus" receiver.
# NOTE(review): no `continue` is set here, so per Alertmanager's defaults a
# matching alert should not also reach the parent receiver — confirm.
routes = [
{
receiver = "lassulus";
matchers = [ "alertname = \"Mastodon\"" ];
}
];
}; };
receivers = [ receivers = [
# Receiver named "lassulus": delivers alerts as a webhook to port 9089 on
# localhost, path /alert.
{
name = "lassulus";
webhook_configs = [
{
url = "http://localhost:9089/alert";
# Upper bound on the number of alerts included in one webhook message.
max_alerts = 5;
}
];
}
{ {
name = "matrix"; name = "matrix";
webhook_configs = [ webhook_configs = [