mirror of
https://github.com/kmein/niveum
synced 2026-03-30 01:01:10 +02:00
Compare commits
4 Commits
630ec3d052
...
cbce724ade
| Author | SHA1 | Date | |
|---|---|---|---|
| cbce724ade | |||
| 00dfe27738 | |||
| 6e0026ed5c | |||
| d92f382b9a |
@@ -17,11 +17,7 @@ in {
|
|||||||
chmod o+rx ${stateDir}
|
chmod o+rx ${stateDir}
|
||||||
cd ${stateDir}
|
cd ${stateDir}
|
||||||
(${pkgs.curl}/bin/curl -s -o wallpaper.tmp -z wallpaper.tmp ${lib.escapeShellArg url} && cp wallpaper.tmp wallpaper) || :
|
(${pkgs.curl}/bin/curl -s -o wallpaper.tmp -z wallpaper.tmp ${lib.escapeShellArg url} && cp wallpaper.tmp wallpaper) || :
|
||||||
if [ -z $SWAYSOCK ]; then
|
${pkgs.feh}/bin/feh --no-fehbg --bg-scale wallpaper
|
||||||
${pkgs.feh}/bin/feh --no-fehbg --bg-scale wallpaper
|
|
||||||
else
|
|
||||||
${pkgs.sway}/bin/swaymsg -s $SWAYSOCK 'output * bg ${stateDir}/wallpaper fill'
|
|
||||||
fi
|
|
||||||
'';
|
'';
|
||||||
startAt = "*:00,10,20,30,40,50";
|
startAt = "*:00,10,20,30,40,50";
|
||||||
serviceConfig = {
|
serviceConfig = {
|
||||||
|
|||||||
@@ -3,11 +3,13 @@
|
|||||||
config,
|
config,
|
||||||
pkgs,
|
pkgs,
|
||||||
...
|
...
|
||||||
}: let
|
}:
|
||||||
|
let
|
||||||
lokiConfig = import ./loki.nix;
|
lokiConfig = import ./loki.nix;
|
||||||
blackboxConfig = import ./blackbox.nix;
|
blackboxConfig = import ./blackbox.nix;
|
||||||
inherit (import ../../../lib) restic;
|
inherit (import ../../../lib) restic;
|
||||||
in {
|
in
|
||||||
|
{
|
||||||
services.grafana = {
|
services.grafana = {
|
||||||
enable = true;
|
enable = true;
|
||||||
settings = {
|
settings = {
|
||||||
@@ -80,143 +82,150 @@ in {
|
|||||||
}
|
}
|
||||||
];
|
];
|
||||||
|
|
||||||
services.prometheus.rules = let
|
services.prometheus.rules =
|
||||||
diskFreeThreshold = 10;
|
let
|
||||||
in [
|
diskFreeThreshold = 10;
|
||||||
(builtins.toJSON {
|
in
|
||||||
groups = [
|
[
|
||||||
{
|
(builtins.toJSON {
|
||||||
name = "niveum";
|
groups = [
|
||||||
rules = [
|
{
|
||||||
{
|
name = "niveum";
|
||||||
alert = "HostSystemdServiceCrashed";
|
rules = [
|
||||||
expr = ''(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'';
|
{
|
||||||
annotations = {
|
alert = "HostSystemdServiceCrashed";
|
||||||
description = "{{$labels.name}} failed on {{$labels.instance}}";
|
expr = ''(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'';
|
||||||
};
|
annotations = {
|
||||||
}
|
description = "{{$labels.name}} failed on {{$labels.instance}}";
|
||||||
{
|
};
|
||||||
alert = "RootPartitionFull";
|
}
|
||||||
for = "10m";
|
{
|
||||||
expr = ''(node_filesystem_free_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < ${toString diskFreeThreshold}'';
|
alert = "RootPartitionFull";
|
||||||
annotations = {
|
for = "10m";
|
||||||
description = ''{{ $labels.instance }} running out of space: {{ $value | printf "%.2f" }}% < ${toString diskFreeThreshold}%'';
|
expr = ''(node_filesystem_free_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < ${toString diskFreeThreshold}'';
|
||||||
};
|
annotations = {
|
||||||
}
|
description = ''{{ $labels.instance }} running out of space: {{ $value | printf "%.2f" }}% < ${toString diskFreeThreshold}%'';
|
||||||
{
|
};
|
||||||
alert = "RootPartitionFullWeek";
|
}
|
||||||
for = "1h";
|
{
|
||||||
expr =
|
alert = "RootPartitionFullWeek";
|
||||||
''node_filesystem_free_bytes{mountpoint="/"} ''
|
for = "1h";
|
||||||
+ ''and predict_linear(node_filesystem_free_bytes{mountpoint="/"}[2d], 7*24*3600) <= 0'';
|
expr =
|
||||||
annotations = {
|
''node_filesystem_free_bytes{mountpoint="/"} ''
|
||||||
description = "{{$labels.instance}} running out of space in 7 days";
|
+ ''and predict_linear(node_filesystem_free_bytes{mountpoint="/"}[2d], 7*24*3600) <= 0'';
|
||||||
};
|
annotations = {
|
||||||
}
|
description = "{{$labels.instance}} running out of space in 7 days";
|
||||||
{
|
};
|
||||||
alert = "HighLoad";
|
}
|
||||||
expr = ''node_load15 / on(job) count(node_cpu_seconds_total{mode="system"}) by (job) >= 1.0'';
|
{
|
||||||
for = "10m";
|
alert = "HighLoad";
|
||||||
annotations = {
|
expr = ''node_load15 / on(job) count(node_cpu_seconds_total{mode="system"}) by (job) >= 1.0'';
|
||||||
description = "{{$labels.instance}} running on high load: {{$value}}";
|
for = "10m";
|
||||||
};
|
annotations = {
|
||||||
}
|
description = "{{$labels.instance}} running on high load: {{$value}}";
|
||||||
{
|
};
|
||||||
alert = "HostUnusualNetworkThroughputIn";
|
}
|
||||||
expr = ''(rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100'';
|
{
|
||||||
for = "5m";
|
alert = "HostUnusualNetworkThroughputIn";
|
||||||
annotations.description = "Host unusual network throughput in (instance {{ $labels.instance }})";
|
expr = ''(rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100'';
|
||||||
}
|
for = "5m";
|
||||||
{
|
annotations.description = "Host unusual network throughput in (instance {{ $labels.instance }})";
|
||||||
alert = "HostUnusualNetworkThroughputOut";
|
}
|
||||||
expr = ''(rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100'';
|
{
|
||||||
for = "5m";
|
alert = "HostUnusualNetworkThroughputOut";
|
||||||
annotations.description = "Host unusual network throughput out (instance {{ $labels.instance }})";
|
expr = ''(rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100'';
|
||||||
}
|
for = "5m";
|
||||||
{
|
annotations.description = "Host unusual network throughput out (instance {{ $labels.instance }})";
|
||||||
alert = "HostUnusualDiskReadRate";
|
}
|
||||||
expr = ''(rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50'';
|
{
|
||||||
for = "5m";
|
alert = "HostUnusualDiskReadRate";
|
||||||
annotations.description = "Host unusual disk read rate (instance {{ $labels.instance }})";
|
expr = ''(rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50'';
|
||||||
}
|
for = "5m";
|
||||||
{
|
annotations.description = "Host unusual disk read rate (instance {{ $labels.instance }})";
|
||||||
alert = "HostUnusualDiskWriteRate";
|
}
|
||||||
expr = ''(rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50'';
|
{
|
||||||
for = "2m";
|
alert = "HostUnusualDiskWriteRate";
|
||||||
annotations.description = "Host unusual disk write rate (instance {{ $labels.instance }})";
|
expr = ''(rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50'';
|
||||||
}
|
for = "2m";
|
||||||
{
|
annotations.description = "Host unusual disk write rate (instance {{ $labels.instance }})";
|
||||||
alert = "HostOutOfInodes";
|
}
|
||||||
expr = ''node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'';
|
{
|
||||||
for = "2m";
|
alert = "HostOutOfInodes";
|
||||||
annotations.description = "Host out of inodes (instance {{ $labels.instance }})";
|
expr = ''node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'';
|
||||||
}
|
for = "2m";
|
||||||
{
|
annotations.description = "Host out of inodes (instance {{ $labels.instance }})";
|
||||||
alert = "HostInodesWillFillIn24Hours";
|
}
|
||||||
expr = ''node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0'';
|
{
|
||||||
for = "2m";
|
alert = "HostInodesWillFillIn24Hours";
|
||||||
annotations.description = "Host inodes will fill in 24 hours (instance {{ $labels.instance }})";
|
expr = ''node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0'';
|
||||||
}
|
for = "2m";
|
||||||
{
|
annotations.description = "Host inodes will fill in 24 hours (instance {{ $labels.instance }})";
|
||||||
alert = "HighRAM";
|
}
|
||||||
expr = "node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1";
|
{
|
||||||
for = "1h";
|
alert = "HighRAM";
|
||||||
annotations.description = "{{$labels.instance}} using lots of RAM";
|
expr = "node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1";
|
||||||
}
|
for = "1h";
|
||||||
{
|
annotations.description = "{{$labels.instance}} using lots of RAM";
|
||||||
alert = "UptimeMonster";
|
}
|
||||||
expr = "time() - node_boot_time_seconds > 2592000";
|
{
|
||||||
annotations.description = "uptime monster {{$labels.instance}} up for more than 30 days";
|
alert = "UptimeMonster";
|
||||||
}
|
expr = "time() - node_boot_time_seconds > 2592000";
|
||||||
{
|
annotations.description = "uptime monster {{$labels.instance}} up for more than 30 days";
|
||||||
alert = "HostDown";
|
}
|
||||||
expr = ''up == 0'';
|
{
|
||||||
for = "5m";
|
alert = "HostDown";
|
||||||
annotations = {
|
expr = ''up == 0'';
|
||||||
description = "{{ $labels.instance }} seeming down since 5 minutes";
|
for = "5m";
|
||||||
};
|
annotations = {
|
||||||
}
|
description = "{{ $labels.instance }} seeming down since 5 minutes";
|
||||||
{
|
};
|
||||||
alert = "Reboot";
|
}
|
||||||
expr = "time() - node_boot_time_seconds < 300";
|
{
|
||||||
annotations.description = "{{$labels.instance}} rebooted";
|
alert = "Reboot";
|
||||||
}
|
expr = "time() - node_boot_time_seconds < 300";
|
||||||
{
|
annotations.description = "{{$labels.instance}} rebooted";
|
||||||
alert = "ProbeFailed";
|
}
|
||||||
expr = "probe_success == 0";
|
{
|
||||||
for = "5m";
|
alert = "Mastodon";
|
||||||
annotations.description = "HTTP probe failed for {{$labels.instance}}";
|
expr = ''probe_success{instance="https://social.krebsco.de"}'';
|
||||||
}
|
for = "5m";
|
||||||
{
|
annotations.description = "Mastodon instance {{$labels.instance}} is down";
|
||||||
alert = "SlowProbe";
|
}
|
||||||
expr = "avg_over_time(probe_http_duration_seconds[1m]) > 1";
|
{
|
||||||
for = "5m";
|
alert = "ProbeFailed";
|
||||||
annotations.description = "HTTP probe slow for {{$labels.instance}}";
|
expr = "probe_success == 0";
|
||||||
}
|
for = "5m";
|
||||||
{
|
annotations.description = "HTTP probe failed for {{$labels.instance}}";
|
||||||
alert = "HttpStatusCode";
|
}
|
||||||
expr = "probe_http_status_code != 0 AND (probe_http_status_code <= 199 OR probe_http_status_code >= 400)";
|
{
|
||||||
for = "5m";
|
alert = "SlowProbe";
|
||||||
annotations.description = "status code {{$value}} for {{$labels.instance}}";
|
expr = "avg_over_time(probe_http_duration_seconds[1m]) > 1";
|
||||||
}
|
for = "5m";
|
||||||
{
|
annotations.description = "HTTP probe slow for {{$labels.instance}}";
|
||||||
alert = "SslExpirySoon";
|
}
|
||||||
expr = "probe_ssl_earliest_cert_expiry - time() < 86400 * 30";
|
{
|
||||||
for = "5m";
|
alert = "HttpStatusCode";
|
||||||
annotations.description = "SSL certificate for {{$labels.instance}} expires in 30 days";
|
expr = "probe_http_status_code != 0 AND (probe_http_status_code <= 199 OR probe_http_status_code >= 400)";
|
||||||
}
|
for = "5m";
|
||||||
{
|
annotations.description = "status code {{$value}} for {{$labels.instance}}";
|
||||||
alert = "SslExpiry";
|
}
|
||||||
expr = "probe_ssl_earliest_cert_expiry - time() <= 0";
|
{
|
||||||
for = "5m";
|
alert = "SslExpirySoon";
|
||||||
annotations.description = "SSL certificate for {{$labels.instance}} has expired";
|
expr = "probe_ssl_earliest_cert_expiry - time() < 86400 * 30";
|
||||||
}
|
for = "5m";
|
||||||
];
|
annotations.description = "SSL certificate for {{$labels.instance}} expires in 30 days";
|
||||||
}
|
}
|
||||||
];
|
{
|
||||||
})
|
alert = "SslExpiry";
|
||||||
];
|
expr = "probe_ssl_earliest_cert_expiry - time() <= 0";
|
||||||
|
for = "5m";
|
||||||
|
annotations.description = "SSL certificate for {{$labels.instance}} has expired";
|
||||||
|
}
|
||||||
|
];
|
||||||
|
}
|
||||||
|
];
|
||||||
|
})
|
||||||
|
];
|
||||||
|
|
||||||
# ref https://github.com/Mic92/dotfiles/blob/f44bac5dd6970ed3fbb4feb906917331ec3c2be5/machines/eva/modules/prometheus/default.nix
|
# ref https://github.com/Mic92/dotfiles/blob/f44bac5dd6970ed3fbb4feb906917331ec3c2be5/machines/eva/modules/prometheus/default.nix
|
||||||
systemd.services.matrix-hook = {
|
systemd.services.matrix-hook = {
|
||||||
@@ -246,6 +255,33 @@ in {
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
systemd.services.matrix-hook-lassulus = {
|
||||||
|
description = "Matrix Hook";
|
||||||
|
after = [ "network.target" ];
|
||||||
|
wantedBy = [ "multi-user.target" ];
|
||||||
|
environment = {
|
||||||
|
HTTP_ADDRESS = "[::1]";
|
||||||
|
HTTP_PORT = "9089";
|
||||||
|
MX_HOMESERVER = "https://matrix.4d2.org";
|
||||||
|
MX_ID = "@lakai:4d2.org";
|
||||||
|
MX_ROOMID = "!MJAGqBAOKZGMywzwkI:lassul.us";
|
||||||
|
MX_MSG_TEMPLATE = "${pkgs.matrix-hook}/message.html.tmpl";
|
||||||
|
};
|
||||||
|
serviceConfig = {
|
||||||
|
EnvironmentFile = [
|
||||||
|
# format: MX_TOKEN=<token>
|
||||||
|
config.age.secrets.matrix-token-lakai-env.path
|
||||||
|
];
|
||||||
|
Type = "simple";
|
||||||
|
ExecStart = "${pkgs.matrix-hook}/bin/matrix-hook";
|
||||||
|
Restart = "always";
|
||||||
|
RestartSec = "10";
|
||||||
|
DynamicUser = true;
|
||||||
|
User = "matrix-hook";
|
||||||
|
Group = "matrix-hook";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
age.secrets = {
|
age.secrets = {
|
||||||
matrix-token-lakai-env.file = ../../../secrets/matrix-token-lakai-env.age;
|
matrix-token-lakai-env.file = ../../../secrets/matrix-token-lakai-env.age;
|
||||||
};
|
};
|
||||||
@@ -260,8 +296,23 @@ in {
|
|||||||
group_wait = "30s";
|
group_wait = "30s";
|
||||||
repeat_interval = "24h";
|
repeat_interval = "24h";
|
||||||
receiver = "matrix";
|
receiver = "matrix";
|
||||||
|
routes = [
|
||||||
|
{
|
||||||
|
receiver = "lassulus";
|
||||||
|
matchers = [ "alertname = \"Mastodon\"" ];
|
||||||
|
}
|
||||||
|
];
|
||||||
};
|
};
|
||||||
receivers = [
|
receivers = [
|
||||||
|
{
|
||||||
|
name = "lassulus";
|
||||||
|
webhook_configs = [
|
||||||
|
{
|
||||||
|
url = "http://localhost:9089/alert";
|
||||||
|
max_alerts = 5;
|
||||||
|
}
|
||||||
|
];
|
||||||
|
}
|
||||||
{
|
{
|
||||||
name = "matrix";
|
name = "matrix";
|
||||||
webhook_configs = [
|
webhook_configs = [
|
||||||
@@ -306,13 +357,21 @@ in {
|
|||||||
{
|
{
|
||||||
scheme = "http";
|
scheme = "http";
|
||||||
path_prefix = "/";
|
path_prefix = "/";
|
||||||
static_configs = [{targets = ["localhost:${toString config.services.prometheus.alertmanager.port}"];}];
|
static_configs = [
|
||||||
|
{ targets = [ "localhost:${toString config.services.prometheus.alertmanager.port}" ]; }
|
||||||
|
];
|
||||||
}
|
}
|
||||||
];
|
];
|
||||||
|
|
||||||
# otherwise bearer_token_file will fail
|
# otherwise bearer_token_file will fail
|
||||||
services.prometheus.checkConfig = "syntax-only";
|
services.prometheus.checkConfig = "syntax-only";
|
||||||
|
|
||||||
|
services.prometheus.extraFlags = [
|
||||||
|
"--storage.tsdb.retention.time=7d"
|
||||||
|
"--storage.tsdb.retention.size=2GB"
|
||||||
|
"--storage.tsdb.wal-compression"
|
||||||
|
];
|
||||||
|
|
||||||
services.prometheus.scrapeConfigs = [
|
services.prometheus.scrapeConfigs = [
|
||||||
{
|
{
|
||||||
job_name = "makanek";
|
job_name = "makanek";
|
||||||
@@ -328,14 +387,14 @@ in {
|
|||||||
scrape_interval = "5m";
|
scrape_interval = "5m";
|
||||||
job_name = "blackbox";
|
job_name = "blackbox";
|
||||||
metrics_path = "/probe";
|
metrics_path = "/probe";
|
||||||
params.module = ["http_2xx"];
|
params.module = [ "http_2xx" ];
|
||||||
relabel_configs = [
|
relabel_configs = [
|
||||||
{
|
{
|
||||||
source_labels = ["__address__"];
|
source_labels = [ "__address__" ];
|
||||||
target_label = "__param_target";
|
target_label = "__param_target";
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
source_labels = ["__param_target"];
|
source_labels = [ "__param_target" ];
|
||||||
target_label = "instance";
|
target_label = "instance";
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
@@ -393,7 +452,7 @@ in {
|
|||||||
scrape_interval = "60s";
|
scrape_interval = "60s";
|
||||||
metrics_path = "/api/prometheus";
|
metrics_path = "/api/prometheus";
|
||||||
scheme = "http";
|
scheme = "http";
|
||||||
static_configs = [{targets = ["zaatar.r:8123"];}];
|
static_configs = [ { targets = [ "zaatar.r:8123" ]; } ];
|
||||||
bearer_token_file = config.age.secrets.home-assistant-token.path;
|
bearer_token_file = config.age.secrets.home-assistant-token.path;
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
@@ -410,7 +469,7 @@ in {
|
|||||||
|
|
||||||
services.prometheus.exporters.blackbox = {
|
services.prometheus.exporters.blackbox = {
|
||||||
enable = true;
|
enable = true;
|
||||||
configFile = (pkgs.formats.yaml {}).generate "blackbox.yaml" blackboxConfig;
|
configFile = (pkgs.formats.yaml { }).generate "blackbox.yaml" blackboxConfig;
|
||||||
};
|
};
|
||||||
|
|
||||||
networking.firewall.allowedTCPPorts = [
|
networking.firewall.allowedTCPPorts = [
|
||||||
@@ -419,6 +478,6 @@ in {
|
|||||||
|
|
||||||
services.loki = {
|
services.loki = {
|
||||||
enable = true;
|
enable = true;
|
||||||
configFile = (pkgs.formats.yaml {}).generate "loki.yaml" lokiConfig;
|
configFile = (pkgs.formats.yaml { }).generate "loki.yaml" lokiConfig;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user