2022-03-10 21:52:12 +01:00
|
|
|
{
|
|
|
|
|
lib,
|
|
|
|
|
config,
|
|
|
|
|
pkgs,
|
|
|
|
|
...
|
|
|
|
|
}: let
|
2021-09-18 23:54:12 +02:00
|
|
|
lokiConfig = import ./loki.nix;
|
2021-12-13 17:29:09 +01:00
|
|
|
blackboxConfig = import ./blackbox.nix;
|
2022-01-18 23:28:53 +01:00
|
|
|
inherit (import <niveum/lib>) restic;
|
2022-03-10 21:52:12 +01:00
|
|
|
in {
|
2021-09-18 23:54:12 +02:00
|
|
|
# Grafana dashboard server. It listens on loopback only; the nginx virtual
# host named after `settings.server.domain` proxies to it.
services.grafana = {
  enable = true;
  settings = {
    server = {
      http_addr = "127.0.0.1";
      http_port = 9444;
      domain = "grafana.kmein.r";
    };
  };
};
|
|
|
|
|
|
2022-12-01 17:05:02 +01:00
|
|
|
# Reverse-proxy virtual hosts for the web UIs that listen on loopback.
services.nginx.virtualHosts = let
  # Virtual host that forwards every path (websockets included) to a
  # service listening on the given local TCP port.
  proxyTo = port: {
    locations."/" = {
      proxyWebsockets = true;
      proxyPass = "http://127.0.0.1:${toString port}";
    };
  };
in {
  # The host name follows whatever domain Grafana is configured with.
  ${config.services.grafana.settings.server.domain} = proxyTo config.services.grafana.settings.server.http_port;
  "alertmanager.kmein.r" = proxyTo config.services.prometheus.alertmanager.port;
};
|
|
|
|
|
|
2022-05-22 11:47:59 +02:00
|
|
|
# Human-readable descriptions of the monitoring services hosted here.
# NOTE(review): the entry schema (title / link / description) is defined by
# the niveum.passport module, which is not visible from this file.
niveum.passport.services = [
  {
    title = "Prometheus";
    description = "collects metrics from devices in the <i>niveum</i> network, blackbox monitors some websites.";
  }
  {
    title = "Loki";
    description = "aggregates logs of the <i>niveum</i> network.";
  }
  {
    title = "Grafana";
    # Read the domain from `settings.server`, which is where this file sets
    # it and where the nginx virtual host reads it. The former
    # `services.grafana.domain` path is the legacy option name.
    link = "http://${config.services.grafana.settings.server.domain}";
    description = "displays metrics from devices in the <i>niveum</i> network.";
  }
  {
    title = "Alertmanager bot";
    description = "notifies me when something goes wrong.";
  }
];
|
|
|
|
|
|
2022-03-10 21:52:12 +01:00
|
|
|
# Alerting rules, handed to Prometheus as a single JSON-encoded rule file
# that contains one group named "niveum".
services.prometheus.rules = let
  # Percentage of free space on "/" below which RootPartitionFull fires.
  minFreePercent = toString 10;

  niveumRules = [
    # --- systemd / host health ---
    {
      alert = "ServiceDown";
      expr = ''node_systemd_unit_state{state="failed"} == 1'';
      annotations.summary = "{{$labels.name}} failed on {{$labels.job}}";
    }
    {
      alert = "RootPartitionFull";
      expr = ''(node_filesystem_free_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < ${minFreePercent}'';
      for = "10m";
      annotations.summary = ''{{ $labels.job }} running out of space: {{ $value | printf "%.2f" }}% < ${minFreePercent}%'';
    }
    {
      alert = "RootPartitionFullWeek";
      # Extrapolates the last two days of free space seven days ahead.
      expr = ''node_filesystem_free_bytes{mountpoint="/"} and predict_linear(node_filesystem_free_bytes{mountpoint="/"}[2d], 7*24*3600) <= 0'';
      for = "1h";
      annotations.summary = "{{$labels.job}} running out of space in 7 days";
    }
    {
      alert = "HighLoad";
      # 15-minute load divided by the per-job count of CPU series.
      expr = ''node_load15 / on(job) count(node_cpu_seconds_total{mode="system"}) by (job) >= 1.0'';
      for = "10m";
      annotations.summary = "{{$labels.job}} running on high load: {{$value}}";
    }
    {
      alert = "HighRAM";
      expr = "node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1";
      for = "1h";
      annotations = {
        summary = "{{$labels.job}} using lots of RAM";
      };
    }
    {
      alert = "UptimeMonster";
      # 2592000 s = 30 days.
      expr = "time() - node_boot_time_seconds > 2592000";
      annotations = {
        summary = "uptime monster {{$labels.job}} up for more than 30 days";
      };
    }
    {
      alert = "HostDown";
      expr = ''up == 0'';
      for = "5m";
      annotations.summary = "{{ $labels.job }} seeming down since 5 minutes";
    }
    {
      alert = "Reboot";
      # Fires while the host has been up for less than five minutes.
      expr = "time() - node_boot_time_seconds < 300";
      annotations = {
        summary = "{{$labels.job}} rebooted";
      };
    }

    # --- blackbox HTTP probes ---
    {
      alert = "ProbeFailed";
      expr = "probe_success == 0";
      for = "5m";
      annotations = {
        summary = "HTTP probe failed for {{$labels.instance}}";
      };
    }
    {
      alert = "SlowProbe";
      expr = "avg_over_time(probe_http_duration_seconds[1m]) > 1";
      for = "5m";
      annotations = {
        summary = "HTTP probe slow for {{$labels.instance}}";
      };
    }
    {
      alert = "HttpStatusCode";
      expr = "probe_http_status_code != 0 AND (probe_http_status_code <= 199 OR probe_http_status_code >= 400)";
      for = "5m";
      annotations = {
        summary = "status code {{$value}} for {{$labels.instance}}";
      };
    }
    {
      alert = "SslExpirySoon";
      expr = "probe_ssl_earliest_cert_expiry - time() < 86400 * 30";
      for = "5m";
      annotations = {
        summary = "SSL certificate for {{$labels.instance}} expires in 30 days";
      };
    }
    {
      alert = "SslExpiry";
      expr = "probe_ssl_earliest_cert_expiry - time() <= 0";
      for = "5m";
      annotations = {
        summary = "SSL certificate for {{$labels.instance}} has expired";
      };
    }
  ];
in [
  (builtins.toJSON {
    groups = [
      {
        name = "niveum";
        rules = niveumRules;
      }
    ];
  })
];
|
2021-09-18 23:54:12 +02:00
|
|
|
|
|
|
|
|
# Alertmanager instance: every alert is routed to a single e-mail receiver.
services.prometheus.alertmanager = let
  # Recipient details and the SMTP account used for sending.
  inherit (import <niveum/lib>) kieran;
  inherit (import <niveum/lib/email.nix> {inherit lib;}) cock;
in {
  enable = true;
  listenAddress = "localhost";

  configuration.route = {
    receiver = "email";
    group_wait = "30s";
    repeat_interval = "24h";
  };

  configuration.receivers = [
    {
      name = "email";
      email_configs = [
        {
          to = kieran.email;
          from = cock.user;
          smarthost = "${cock.smtp}:587";
          auth_identity = cock.user;
          auth_username = cock.user;
          # NOTE(review): the password is embedded as a plain Nix value;
          # confirm it is acceptable for it to end up in the generated
          # configuration.
          auth_password = cock.password;
          # Also send a mail once an alert stops firing.
          send_resolved = true;
        }
      ];
    }
  ];
};
|
|
|
|
|
|
2022-03-10 21:52:12 +01:00
|
|
|
# Point Prometheus at the Alertmanager instance running on this host.
services.prometheus.alertmanagers = let
  alertmanagerPort = toString config.services.prometheus.alertmanager.port;
in [
  {
    path_prefix = "/";
    scheme = "http";
    static_configs = [
      {targets = ["localhost:${alertmanagerPort}"];}
    ];
  }
];
|
2022-01-27 17:17:43 +01:00
|
|
|
|
2021-09-18 23:54:12 +02:00
|
|
|
# Scrape jobs: node exporters of the individual hosts plus the blackbox
# exporter probing a list of websites.
services.prometheus.scrapeConfigs = let
  nodePort = toString config.services.prometheus.exporters.node.port;
  blackboxAddress = "127.0.0.1:${toString config.services.prometheus.exporters.blackbox.port}";

  # Scrape job with one static target group and otherwise default settings.
  staticJob = job_name: targets: {
    inherit job_name;
    static_configs = [
      {inherit targets;}
    ];
  };
in [
  (staticJob "makanek" ["127.0.0.1:${nodePort}"])
  {
    job_name = "blackbox";
    scrape_interval = "5m";
    metrics_path = "/probe";
    params.module = ["http_2xx"];
    static_configs = [
      {
        targets = [
          "alew.hu-berlin.de"
          "pad.kmein.de"
          "code.kmein.de"
          "radio.kmein.de"
          "tarot.kmein.de"
          "cloud.xn--kiern-0qa.de"
          "grafana.kmein.r"
          # "names.kmein.r"
          "rrm.r"
          "graph.r"
        ];
      }
    ];
    relabel_configs = [
      # Copy the listed target into `__param_target` ...
      {
        source_labels = ["__address__"];
        target_label = "__param_target";
      }
      # ... reuse it as the `instance` label ...
      {
        source_labels = ["__param_target"];
        target_label = "instance";
      }
      # ... and send the actual scrape to the local blackbox exporter.
      {
        target_label = "__address__";
        replacement = blackboxAddress;
      }
    ];
  }
  (staticJob "zaatar" [
    "zaatar.r:${nodePort}"
    "zaatar.r:${toString restic.port}"
  ])
  (staticJob "ful" ["ful.r:${nodePort}"])
];
|
|
|
|
|
|
2021-12-13 17:29:09 +01:00
|
|
|
# Blackbox exporter; its configuration is rendered to YAML from ./blackbox.nix
# (bound to `blackboxConfig` at the top of this module).
services.prometheus.exporters.blackbox = let
  yamlFormat = pkgs.formats.yaml {};
in {
  enable = true;
  configFile = yamlFormat.generate "blackbox.yaml" blackboxConfig;
};
|
|
|
|
|
|
2021-09-19 12:03:52 +02:00
|
|
|
# Open Loki's HTTP listen port in the firewall.
networking.firewall.allowedTCPPorts = [lokiConfig.server.http_listen_port];
|
2021-09-18 23:54:12 +02:00
|
|
|
|
|
|
|
|
# Loki log aggregation; its configuration is rendered to YAML from ./loki.nix
# (bound to `lokiConfig` at the top of this module).
services.loki = let
  yamlFormat = pkgs.formats.yaml {};
in {
  enable = true;
  configFile = yamlFormat.generate "loki.yaml" lokiConfig;
};
|
|
|
|
|
}
|