1
0
mirror of https://github.com/kmein/niveum synced 2026-03-21 04:11:07 +01:00

alertmanager: more alerts?

This commit is contained in:
2024-10-27 15:44:52 +01:00
parent 05b35bd9f0
commit cf72c1df1f

View File

@@ -89,10 +89,10 @@ in {
name = "niveum"; name = "niveum";
rules = [ rules = [
{ {
alert = "ServiceDown"; alert = "HostSystemdServiceCrashed";
expr = ''node_systemd_unit_state{state="failed"} == 1''; expr = ''(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'';
annotations = { annotations = {
summary = "{{$labels.name}} failed on {{$labels.job}}"; summary = "{{$labels.name}} failed on {{$labels.instance}}";
}; };
} }
{ {
@@ -100,7 +100,7 @@ in {
for = "10m"; for = "10m";
expr = ''(node_filesystem_free_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < ${toString diskFreeThreshold}''; expr = ''(node_filesystem_free_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < ${toString diskFreeThreshold}'';
annotations = { annotations = {
summary = ''{{ $labels.job }} running out of space: {{ $value | printf "%.2f" }}% < ${toString diskFreeThreshold}%''; summary = ''{{ $labels.instance }} running out of space: {{ $value | printf "%.2f" }}% < ${toString diskFreeThreshold}%'';
}; };
} }
{ {
@@ -110,7 +110,7 @@ in {
''node_filesystem_free_bytes{mountpoint="/"} '' ''node_filesystem_free_bytes{mountpoint="/"} ''
+ ''and predict_linear(node_filesystem_free_bytes{mountpoint="/"}[2d], 7*24*3600) <= 0''; + ''and predict_linear(node_filesystem_free_bytes{mountpoint="/"}[2d], 7*24*3600) <= 0'';
annotations = { annotations = {
summary = "{{$labels.job}} running out of space in 7 days"; summary = "{{$labels.instance}} running out of space in 7 days";
}; };
} }
{ {
@@ -118,32 +118,68 @@ in {
expr = ''node_load15 / on(job) count(node_cpu_seconds_total{mode="system"}) by (job) >= 1.0''; expr = ''node_load15 / on(job) count(node_cpu_seconds_total{mode="system"}) by (job) >= 1.0'';
for = "10m"; for = "10m";
annotations = { annotations = {
summary = "{{$labels.job}} running on high load: {{$value}}"; summary = "{{$labels.instance}} running on high load: {{$value}}";
}; };
} }
{
# Alert on sustained high inbound network traffic.
# The expression takes the per-second rate of node_network_receive_bytes_total
# over a 2-minute window and divides it by 1024 twice (bytes -> MiB per the
# metric name), then compares against 100.
# No aggregation is applied, so the condition is evaluated per series;
# the summary only mentions the instance label.
alert = "HostUnusualNetworkThroughputIn";
expr = ''(rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100'';
# The condition must hold continuously for 5 minutes before the alert fires.
for = "5m";
annotations.summary = "Host unusual network throughput in (instance {{ $labels.instance }})";
}
{
# Alert on sustained high outbound network traffic.
# The expression takes the per-second rate of node_network_transmit_bytes_total
# over a 2-minute window and divides it by 1024 twice (bytes -> MiB per the
# metric name), then compares against 100.
# No aggregation is applied, so the condition is evaluated per series;
# the summary only mentions the instance label.
alert = "HostUnusualNetworkThroughputOut";
expr = ''(rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100'';
# The condition must hold continuously for 5 minutes before the alert fires.
for = "5m";
annotations.summary = "Host unusual network throughput out (instance {{ $labels.instance }})";
}
{
# Alert on sustained high disk read throughput.
# The expression takes the per-second rate of node_disk_read_bytes_total over a
# 2-minute window and divides it by 1024 twice (bytes -> MiB per the metric
# name), then compares against 50.
# No aggregation is applied, so the condition is evaluated per series;
# the summary only mentions the instance label.
alert = "HostUnusualDiskReadRate";
expr = ''(rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50'';
# The condition must hold continuously for 5 minutes before the alert fires.
for = "5m";
annotations.summary = "Host unusual disk read rate (instance {{ $labels.instance }})";
}
{
# Alert on sustained high disk write throughput.
# The expression takes the per-second rate of node_disk_written_bytes_total
# over a 2-minute window and divides it by 1024 twice (bytes -> MiB per the
# metric name), then compares against 50.
# No aggregation is applied, so the condition is evaluated per series;
# the summary only mentions the instance label.
alert = "HostUnusualDiskWriteRate";
expr = ''(rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50'';
# The condition must hold continuously for 2 minutes before the alert fires.
# NOTE(review): the read-rate rule waits 5m while this one waits 2m — confirm
# the shorter hold time is intentional.
for = "2m";
annotations.summary = "Host unusual disk write rate (instance {{ $labels.instance }})";
}
{
# Alert when a filesystem has less than 10% of its inodes free.
# The left-hand side computes free inodes as a percentage of total inodes
# (node_filesystem_files_free / node_filesystem_files * 100), skipping series
# whose fstype label is "msdosfs".
# The `and ON (instance, device, mountpoint)` clause keeps only results that
# have a matching node_filesystem_readonly series equal to 0, i.e. read-only
# filesystems are excluded.
alert = "HostOutOfInodes";
expr = ''node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'';
# The condition must hold continuously for 2 minutes before the alert fires.
for = "2m";
annotations.summary = "Host out of inodes (instance {{ $labels.instance }})";
}
{
# Alert when a filesystem is low on inodes AND is trending towards exhaustion.
# All three conditions must hold for the same series:
#   1. free inodes are below 10% of total inodes (fstype "msdosfs" skipped);
#   2. a linear extrapolation of node_filesystem_files_free, fitted over the
#      last 1h of samples, predicts a value below 0 in 24 * 3600 seconds;
#   3. a matching node_filesystem_readonly series (joined on instance, device,
#      mountpoint) equals 0, i.e. the filesystem is not read-only.
alert = "HostInodesWillFillIn24Hours";
expr = ''node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0'';
# The condition must hold continuously for 2 minutes before the alert fires.
for = "2m";
annotations.summary = "Host inodes will fill in 24 hours (instance {{ $labels.instance }})";
}
{ {
alert = "HighRAM"; alert = "HighRAM";
expr = "node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1"; expr = "node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1";
for = "1h"; for = "1h";
annotations.summary = "{{$labels.job}} using lots of RAM"; annotations.summary = "{{$labels.instance}} using lots of RAM";
} }
{ {
alert = "UptimeMonster"; alert = "UptimeMonster";
expr = "time() - node_boot_time_seconds > 2592000"; expr = "time() - node_boot_time_seconds > 2592000";
annotations.summary = "uptime monster {{$labels.job}} up for more than 30 days"; annotations.summary = "uptime monster {{$labels.instance}} up for more than 30 days";
} }
{ {
alert = "HostDown"; alert = "HostDown";
expr = ''up == 0''; expr = ''up == 0'';
for = "5m"; for = "5m";
annotations = { annotations = {
summary = "{{ $labels.job }} seeming down since 5 minutes"; summary = "{{ $labels.instance }} seeming down since 5 minutes";
}; };
} }
{ {
alert = "Reboot"; alert = "Reboot";
expr = "time() - node_boot_time_seconds < 300"; expr = "time() - node_boot_time_seconds < 300";
annotations.summary = "{{$labels.job}} rebooted"; annotations.summary = "{{$labels.instance}} rebooted";
} }
{ {
alert = "ProbeFailed"; alert = "ProbeFailed";