1
0
mirror of https://github.com/kmein/niveum synced 2026-03-20 12:01:06 +01:00

feat(monitoring): improve messages

This commit is contained in:
2022-01-28 09:04:40 +01:00
parent d9ca1e673d
commit fe8af09148

View File

@@ -27,7 +27,7 @@ in
alert = "ServiceDown"; alert = "ServiceDown";
expr = ''node_systemd_unit_state{state="failed"} == 1''; expr = ''node_systemd_unit_state{state="failed"} == 1'';
annotations = { annotations = {
summary = "{{$labels.job}}: Service {{$labels.name}} failed to start."; summary = "{{$labels.name}} failed on {{$labels.job}}";
}; };
} }
{ {
@@ -35,8 +35,7 @@ in
for = "10m"; for = "10m";
expr = ''(node_filesystem_free_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < ${toString diskFreeThreshold}''; expr = ''(node_filesystem_free_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < ${toString diskFreeThreshold}'';
annotations = { annotations = {
summary = "{{ $labels.job }}: Filesystem is running out of space soon."; summary = ''{{ $labels.job }} running out of space: {{ $value | printf "%.2f" }}% < ${toString diskFreeThreshold}%'';
description = ''The root disk of {{ $labels.job }} has {{ $value | printf "%.2f" }}% free disk space (threshold at ${toString diskFreeThreshold}%).'';
}; };
} }
{ {
@@ -45,7 +44,7 @@ in
expr = ''node_filesystem_free_bytes{mountpoint="/"} '' expr = ''node_filesystem_free_bytes{mountpoint="/"} ''
+ ''and predict_linear(node_filesystem_free_bytes{mountpoint="/"}[2d], 7*24*3600) <= 0''; + ''and predict_linear(node_filesystem_free_bytes{mountpoint="/"}[2d], 7*24*3600) <= 0'';
annotations = { annotations = {
summary = "{{$labels.job}}: Filesystem is running out of space in 7 days."; summary = "{{$labels.job}} running out of space in 7 days";
}; };
} }
{ {
@@ -53,62 +52,62 @@ in
expr = ''node_load15 / on(job) count(node_cpu_seconds_total{mode="system"}) by (job) >= 1.0''; expr = ''node_load15 / on(job) count(node_cpu_seconds_total{mode="system"}) by (job) >= 1.0'';
for = "10m"; for = "10m";
annotations = { annotations = {
summary = "{{$labels.job}}: Running on high load: {{$value}}"; summary = "{{$labels.job}} running on high load: {{$value}}";
}; };
} }
{ {
alert = "HighRAM"; alert = "HighRAM";
expr = "node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1"; expr = "node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1";
for = "1h"; for = "1h";
annotations.summary = "{{$labels.job}}: Using lots of RAM."; annotations.summary = "{{$labels.job}} using lots of RAM";
} }
{ {
alert = "UptimeMonster"; alert = "UptimeMonster";
expr = "time() - node_boot_time_seconds > 2592000"; expr = "time() - node_boot_time_seconds > 2592000";
annotations.summary = "{{$labels.job}}: up for more than 30 days."; annotations.summary = "uptime monster {{$labels.job}} up for more than 30 days";
} }
{ {
alert = "HostDown"; alert = "HostDown";
expr = ''up == 0''; expr = ''up == 0'';
for = "5m"; for = "5m";
annotations = { annotations = {
summary = "Host {{ $labels.job }} down for 5 minutes."; summary = "{{ $labels.job }} seeming down since 5 minutes";
}; };
} }
{ {
alert = "Reboot"; alert = "Reboot";
expr = "time() - node_boot_time_seconds < 300"; expr = "time() - node_boot_time_seconds < 300";
annotations.summary = "{{$labels.job}}: Reboot"; annotations.summary = "{{$labels.job}} rebooted";
} }
{ {
alert = "ProbeFailed"; alert = "ProbeFailed";
expr = "probe_success == 0"; expr = "probe_success == 0";
for = "5m"; for = "5m";
annotations.summary = "{{$labels.instance}}: probe failed"; annotations.summary = "HTTP probe failed for {{$labels.instance}}";
} }
{ {
alert = "SlowProbe"; alert = "SlowProbe";
expr = "avg_over_time(probe_http_duration_seconds[1m]) > 1"; expr = "avg_over_time(probe_http_duration_seconds[1m]) > 1";
for = "5m"; for = "5m";
annotations.summary = "{{$labels.instance}}: HTTP probe slow"; annotations.summary = "HTTP probe slow for {{$labels.instance}}";
} }
{ {
alert = "HttpStatusCode"; alert = "HttpStatusCode";
expr = "probe_http_status_code != 0 AND (probe_http_status_code <= 199 OR probe_http_status_code >= 400)"; expr = "probe_http_status_code != 0 AND (probe_http_status_code <= 199 OR probe_http_status_code >= 400)";
for = "5m"; for = "5m";
annotations.summary = "{{$labels.instance}}: status code {{$value}}"; annotations.summary = "status code {{$value}} for {{$labels.instance}}";
} }
{ {
alert = "SslExpirySoon"; alert = "SslExpirySoon";
expr = "probe_ssl_earliest_cert_expiry - time() < 86400 * 30"; expr = "probe_ssl_earliest_cert_expiry - time() < 86400 * 30";
for = "5m"; for = "5m";
annotations.summary = "{{$labels.instance}}: SSL certificate expires in 30 days"; annotations.summary = "SSL certificate for {{$labels.instance}} expires in 30 days";
} }
{ {
alert = "SslExpiry"; alert = "SslExpiry";
expr = "probe_ssl_earliest_cert_expiry - time() <= 0"; expr = "probe_ssl_earliest_cert_expiry - time() <= 0";
for = "5m"; for = "5m";
annotations.summary = "{{$labels.instance}}: SSL certificate has expired"; annotations.summary = "SSL certificate for {{$labels.instance}} has expired";
} }
]; ];
}]; }];