mirror of
https://github.com/kmein/niveum
synced 2026-03-19 19:41:08 +01:00
chore: format with alejandra
This commit is contained in:
@@ -8,7 +8,7 @@
|
||||
method = "GET";
|
||||
no_follow_redirects = false;
|
||||
preferred_ip_protocol = "ip4";
|
||||
valid_http_versions = [ "HTTP/1.1" "HTTP/2.0" ];
|
||||
valid_http_versions = ["HTTP/1.1" "HTTP/2.0"];
|
||||
tls_config.insecure_skip_verify = true;
|
||||
};
|
||||
prober = "http";
|
||||
|
||||
@@ -1,10 +1,13 @@
|
||||
{ lib, config, pkgs, ... }:
|
||||
let
|
||||
{
|
||||
lib,
|
||||
config,
|
||||
pkgs,
|
||||
...
|
||||
}: let
|
||||
lokiConfig = import ./loki.nix;
|
||||
blackboxConfig = import ./blackbox.nix;
|
||||
inherit (import <niveum/lib>) restic;
|
||||
in
|
||||
{
|
||||
in {
|
||||
services.grafana = {
|
||||
enable = true;
|
||||
domain = "grafana.kmein.r";
|
||||
@@ -19,103 +22,110 @@ in
|
||||
};
|
||||
};
|
||||
|
||||
services.prometheus.rules = let diskFreeThreshold = 10; in [(builtins.toJSON {
|
||||
groups = [{
|
||||
name = "niveum";
|
||||
rules = [
|
||||
services.prometheus.rules = let
|
||||
diskFreeThreshold = 10;
|
||||
in [
|
||||
(builtins.toJSON {
|
||||
groups = [
|
||||
{
|
||||
alert = "ServiceDown";
|
||||
expr = ''node_systemd_unit_state{state="failed"} == 1'';
|
||||
annotations = {
|
||||
summary = "{{$labels.name}} failed on {{$labels.job}}";
|
||||
};
|
||||
}
|
||||
{
|
||||
alert = "RootPartitionFull";
|
||||
for = "10m";
|
||||
expr = ''(node_filesystem_free_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < ${toString diskFreeThreshold}'';
|
||||
annotations = {
|
||||
summary = ''{{ $labels.job }} running out of space: {{ $value | printf "%.2f" }}% < ${toString diskFreeThreshold}%'';
|
||||
};
|
||||
}
|
||||
{
|
||||
alert = "RootPartitionFullWeek";
|
||||
for = "1h";
|
||||
expr = ''node_filesystem_free_bytes{mountpoint="/"} ''
|
||||
+ ''and predict_linear(node_filesystem_free_bytes{mountpoint="/"}[2d], 7*24*3600) <= 0'';
|
||||
annotations = {
|
||||
summary = "{{$labels.job}} running out of space in 7 days";
|
||||
};
|
||||
}
|
||||
{
|
||||
alert = "HighLoad";
|
||||
expr = ''node_load15 / on(job) count(node_cpu_seconds_total{mode="system"}) by (job) >= 1.0'';
|
||||
for = "10m";
|
||||
annotations = {
|
||||
summary = "{{$labels.job}} running on high load: {{$value}}";
|
||||
};
|
||||
}
|
||||
{
|
||||
alert = "HighRAM";
|
||||
expr = "node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1";
|
||||
for = "1h";
|
||||
annotations.summary = "{{$labels.job}} using lots of RAM";
|
||||
}
|
||||
{
|
||||
alert = "UptimeMonster";
|
||||
expr = "time() - node_boot_time_seconds > 2592000";
|
||||
annotations.summary = "uptime monster {{$labels.job}} up for more than 30 days";
|
||||
}
|
||||
{
|
||||
alert = "HostDown";
|
||||
expr = ''up == 0'';
|
||||
for = "5m";
|
||||
annotations = {
|
||||
summary = "{{ $labels.job }} seeming down since 5 minutes";
|
||||
};
|
||||
}
|
||||
{
|
||||
alert = "Reboot";
|
||||
expr = "time() - node_boot_time_seconds < 300";
|
||||
annotations.summary = "{{$labels.job}} rebooted";
|
||||
}
|
||||
{
|
||||
alert = "ProbeFailed";
|
||||
expr = "probe_success == 0";
|
||||
for = "5m";
|
||||
annotations.summary = "HTTP probe failed for {{$labels.instance}}";
|
||||
}
|
||||
{
|
||||
alert = "SlowProbe";
|
||||
expr = "avg_over_time(probe_http_duration_seconds[1m]) > 1";
|
||||
for = "5m";
|
||||
annotations.summary = "HTTP probe slow for {{$labels.instance}}";
|
||||
}
|
||||
{
|
||||
alert = "HttpStatusCode";
|
||||
expr = "probe_http_status_code != 0 AND (probe_http_status_code <= 199 OR probe_http_status_code >= 400)";
|
||||
for = "5m";
|
||||
annotations.summary = "status code {{$value}} for {{$labels.instance}}";
|
||||
}
|
||||
{
|
||||
alert = "SslExpirySoon";
|
||||
expr = "probe_ssl_earliest_cert_expiry - time() < 86400 * 30";
|
||||
for = "5m";
|
||||
annotations.summary = "SSL certificate for {{$labels.instance}} expires in 30 days";
|
||||
}
|
||||
{
|
||||
alert = "SslExpiry";
|
||||
expr = "probe_ssl_earliest_cert_expiry - time() <= 0";
|
||||
for = "5m";
|
||||
annotations.summary = "SSL certificate for {{$labels.instance}} has expired";
|
||||
name = "niveum";
|
||||
rules = [
|
||||
{
|
||||
alert = "ServiceDown";
|
||||
expr = ''node_systemd_unit_state{state="failed"} == 1'';
|
||||
annotations = {
|
||||
summary = "{{$labels.name}} failed on {{$labels.job}}";
|
||||
};
|
||||
}
|
||||
{
|
||||
alert = "RootPartitionFull";
|
||||
for = "10m";
|
||||
expr = ''(node_filesystem_free_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < ${toString diskFreeThreshold}'';
|
||||
annotations = {
|
||||
summary = ''{{ $labels.job }} running out of space: {{ $value | printf "%.2f" }}% < ${toString diskFreeThreshold}%'';
|
||||
};
|
||||
}
|
||||
{
|
||||
alert = "RootPartitionFullWeek";
|
||||
for = "1h";
|
||||
expr =
|
||||
''node_filesystem_free_bytes{mountpoint="/"} ''
|
||||
+ ''and predict_linear(node_filesystem_free_bytes{mountpoint="/"}[2d], 7*24*3600) <= 0'';
|
||||
annotations = {
|
||||
summary = "{{$labels.job}} running out of space in 7 days";
|
||||
};
|
||||
}
|
||||
{
|
||||
alert = "HighLoad";
|
||||
expr = ''node_load15 / on(job) count(node_cpu_seconds_total{mode="system"}) by (job) >= 1.0'';
|
||||
for = "10m";
|
||||
annotations = {
|
||||
summary = "{{$labels.job}} running on high load: {{$value}}";
|
||||
};
|
||||
}
|
||||
{
|
||||
alert = "HighRAM";
|
||||
expr = "node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1";
|
||||
for = "1h";
|
||||
annotations.summary = "{{$labels.job}} using lots of RAM";
|
||||
}
|
||||
{
|
||||
alert = "UptimeMonster";
|
||||
expr = "time() - node_boot_time_seconds > 2592000";
|
||||
annotations.summary = "uptime monster {{$labels.job}} up for more than 30 days";
|
||||
}
|
||||
{
|
||||
alert = "HostDown";
|
||||
expr = ''up == 0'';
|
||||
for = "5m";
|
||||
annotations = {
|
||||
summary = "{{ $labels.job }} seeming down since 5 minutes";
|
||||
};
|
||||
}
|
||||
{
|
||||
alert = "Reboot";
|
||||
expr = "time() - node_boot_time_seconds < 300";
|
||||
annotations.summary = "{{$labels.job}} rebooted";
|
||||
}
|
||||
{
|
||||
alert = "ProbeFailed";
|
||||
expr = "probe_success == 0";
|
||||
for = "5m";
|
||||
annotations.summary = "HTTP probe failed for {{$labels.instance}}";
|
||||
}
|
||||
{
|
||||
alert = "SlowProbe";
|
||||
expr = "avg_over_time(probe_http_duration_seconds[1m]) > 1";
|
||||
for = "5m";
|
||||
annotations.summary = "HTTP probe slow for {{$labels.instance}}";
|
||||
}
|
||||
{
|
||||
alert = "HttpStatusCode";
|
||||
expr = "probe_http_status_code != 0 AND (probe_http_status_code <= 199 OR probe_http_status_code >= 400)";
|
||||
for = "5m";
|
||||
annotations.summary = "status code {{$value}} for {{$labels.instance}}";
|
||||
}
|
||||
{
|
||||
alert = "SslExpirySoon";
|
||||
expr = "probe_ssl_earliest_cert_expiry - time() < 86400 * 30";
|
||||
for = "5m";
|
||||
annotations.summary = "SSL certificate for {{$labels.instance}} expires in 30 days";
|
||||
}
|
||||
{
|
||||
alert = "SslExpiry";
|
||||
expr = "probe_ssl_earliest_cert_expiry - time() <= 0";
|
||||
for = "5m";
|
||||
annotations.summary = "SSL certificate for {{$labels.instance}} has expired";
|
||||
}
|
||||
];
|
||||
}
|
||||
];
|
||||
}];
|
||||
})];
|
||||
})
|
||||
];
|
||||
|
||||
systemd.services.alertmanager-bot-telegram = {
|
||||
wantedBy = [ "multi-user.target" ];
|
||||
after = [ "ip-up.target" ];
|
||||
wantedBy = ["multi-user.target"];
|
||||
after = ["ip-up.target"];
|
||||
environment.TELEGRAM_ADMIN = "18980945";
|
||||
environment.TELEGRAM_TOKEN = lib.strings.fileContents <system-secrets/telegram/prometheus.token>;
|
||||
serviceConfig = {
|
||||
@@ -123,17 +133,19 @@ in
|
||||
RestartSec = "15s";
|
||||
DynamicUser = true;
|
||||
StateDirectory = "alertbot";
|
||||
ExecStart = ''${pkgs.alertmanager-bot-telegram}/bin/alertmanager-bot \
|
||||
--alertmanager.url=http://localhost:9093 --log.level=info \
|
||||
--store=bolt --bolt.path=/var/lib/alertbot/bot.db \
|
||||
--listen.addr="0.0.0.0:16320" \
|
||||
--template.paths=${pkgs.writeText "template.tmpl" ''
|
||||
{{ define "telegram.default" }}
|
||||
{{range .Alerts -}}
|
||||
{{.Status}}: {{ index .Annotations "summary"}}
|
||||
{{end -}}
|
||||
{{end}}
|
||||
''}'';
|
||||
ExecStart = '' ${pkgs.alertmanager-bot-telegram}/bin/alertmanager-bot \
|
||||
--alertmanager.url=http://localhost:9093 --log.level=info \
|
||||
--store=bolt --bolt.path=/var/lib/alertbot/bot.db \
|
||||
--listen.addr="0.0.0.0:16320" \
|
||||
--template.paths=${
|
||||
pkgs.writeText "template.tmpl" ''
|
||||
{{ define "telegram.default" }}
|
||||
{{range .Alerts -}}
|
||||
{{.Status}}: {{ index .Annotations "summary"}}
|
||||
{{end -}}
|
||||
{{end}}
|
||||
''
|
||||
}'';
|
||||
};
|
||||
};
|
||||
|
||||
@@ -146,64 +158,88 @@ in
|
||||
repeat_interval = "4h";
|
||||
receiver = "me";
|
||||
};
|
||||
receivers = [{
|
||||
name = "me";
|
||||
webhook_configs = [{
|
||||
url = "http://localhost:16320";
|
||||
send_resolved = true;
|
||||
}];
|
||||
}];
|
||||
receivers = [
|
||||
{
|
||||
name = "me";
|
||||
webhook_configs = [
|
||||
{
|
||||
url = "http://localhost:16320";
|
||||
send_resolved = true;
|
||||
}
|
||||
];
|
||||
}
|
||||
];
|
||||
};
|
||||
};
|
||||
|
||||
services.prometheus.alertmanagers = [{
|
||||
scheme = "http";
|
||||
path_prefix = "/";
|
||||
static_configs = [ { targets = [ "localhost:9093" ]; } ];
|
||||
}];
|
||||
services.prometheus.alertmanagers = [
|
||||
{
|
||||
scheme = "http";
|
||||
path_prefix = "/";
|
||||
static_configs = [{targets = ["localhost:9093"];}];
|
||||
}
|
||||
];
|
||||
|
||||
services.prometheus.scrapeConfigs = [
|
||||
{
|
||||
job_name = "makanek";
|
||||
static_configs = [ { targets = [
|
||||
"127.0.0.1:${toString config.services.prometheus.exporters.node.port}"
|
||||
]; } ];
|
||||
static_configs = [
|
||||
{
|
||||
targets = [
|
||||
"127.0.0.1:${toString config.services.prometheus.exporters.node.port}"
|
||||
];
|
||||
}
|
||||
];
|
||||
}
|
||||
{
|
||||
scrape_interval = "5m";
|
||||
job_name = "blackbox";
|
||||
metrics_path = "/probe";
|
||||
params.module = [ "http_2xx" ];
|
||||
params.module = ["http_2xx"];
|
||||
relabel_configs = [
|
||||
{ source_labels = ["__address__"]; target_label = "__param_target"; }
|
||||
{ source_labels = ["__param_target"]; target_label = "instance"; }
|
||||
{ replacement = "127.0.0.1:${toString config.services.prometheus.exporters.blackbox.port}"; target_label = "__address__"; }
|
||||
{
|
||||
source_labels = ["__address__"];
|
||||
target_label = "__param_target";
|
||||
}
|
||||
{
|
||||
source_labels = ["__param_target"];
|
||||
target_label = "instance";
|
||||
}
|
||||
{
|
||||
replacement = "127.0.0.1:${toString config.services.prometheus.exporters.blackbox.port}";
|
||||
target_label = "__address__";
|
||||
}
|
||||
];
|
||||
static_configs = [
|
||||
{
|
||||
targets = [
|
||||
"alew.hu-berlin.de"
|
||||
"pad.kmein.de"
|
||||
"code.kmein.de"
|
||||
"radio.kmein.de"
|
||||
"tarot.kmein.de"
|
||||
"cloud.xn--kiern-0qa.de"
|
||||
"grafana.kmein.r"
|
||||
"names.kmein.r"
|
||||
"rrm.r"
|
||||
"graph.r"
|
||||
];
|
||||
}
|
||||
];
|
||||
static_configs = [{
|
||||
targets = [
|
||||
"alew.hu-berlin.de"
|
||||
"pad.kmein.de"
|
||||
"code.kmein.de"
|
||||
"radio.kmein.de"
|
||||
"tarot.kmein.de"
|
||||
"cloud.xn--kiern-0qa.de"
|
||||
"grafana.kmein.r"
|
||||
"names.kmein.r"
|
||||
"rrm.r"
|
||||
"graph.r"
|
||||
];
|
||||
}];
|
||||
}
|
||||
{
|
||||
job_name = "zaatar";
|
||||
static_configs = [ { targets = [
|
||||
"zaatar.r:${toString config.services.prometheus.exporters.node.port}"
|
||||
"zaatar.r:${toString restic.port}"
|
||||
]; } ];
|
||||
static_configs = [
|
||||
{
|
||||
targets = [
|
||||
"zaatar.r:${toString config.services.prometheus.exporters.node.port}"
|
||||
"zaatar.r:${toString restic.port}"
|
||||
];
|
||||
}
|
||||
];
|
||||
}
|
||||
];
|
||||
|
||||
|
||||
services.prometheus.exporters.blackbox = {
|
||||
enable = true;
|
||||
configFile = (pkgs.formats.yaml {}).generate "blackbox.yaml" blackboxConfig;
|
||||
|
||||
@@ -17,11 +17,11 @@
|
||||
};
|
||||
final_sleep = "0s";
|
||||
};
|
||||
chunk_idle_period = "1h"; # Any chunk not receiving new logs in this time will be flushed
|
||||
max_chunk_age = "1h"; # All chunks will be flushed when they hit this age, default is 1h
|
||||
chunk_target_size = 1048576; # Loki will attempt to build chunks up to 1.5MB, flushing first if chunk_idle_period or max_chunk_age is reached first
|
||||
chunk_retain_period = "30s"; # Must be greater than index read cache TTL if using an index cache (Default index read cache TTL is 5m)
|
||||
max_transfer_retries = 0; # Chunk transfers disabled
|
||||
chunk_idle_period = "1h"; # Any chunk not receiving new logs in this time will be flushed
|
||||
max_chunk_age = "1h"; # All chunks will be flushed when they hit this age, default is 1h
|
||||
chunk_target_size = 1048576; # Loki will attempt to build chunks up to 1.5MB, flushing first if chunk_idle_period or max_chunk_age is reached first
|
||||
chunk_retain_period = "30s"; # Must be greater than index read cache TTL if using an index cache (Default index read cache TTL is 5m)
|
||||
max_transfer_retries = 0; # Chunk transfers disabled
|
||||
};
|
||||
schema_config.configs = [
|
||||
{
|
||||
@@ -39,7 +39,7 @@
|
||||
boltdb_shipper = {
|
||||
active_index_directory = "/tmp/loki/boltdb-shipper-active";
|
||||
cache_location = "/tmp/loki/boltdb-shipper-cache";
|
||||
cache_ttl = "24h"; # Can be increased for faster performance over longer query periods, uses more disk space
|
||||
cache_ttl = "24h"; # Can be increased for faster performance over longer query periods, uses more disk space
|
||||
shared_store = "filesystem";
|
||||
};
|
||||
filesystem.directory = "/tmp/loki/chunks";
|
||||
|
||||
Reference in New Issue
Block a user