# Review notes. This is free text placed ahead of the first "diff --git" header;
# the patch content below is unchanged.
#
# What the patch does:
#   * configs/monitoring/blackbox.nix (new file): an attribute set with a single
#     module, modules.http_2xx -- prober "http", timeout "15s", method "GET",
#     preferred_ip_protocol "ip4" with ip_protocol_fallback = false,
#     fail_if_not_ssl = true, no_follow_redirects = false, valid_http_versions
#     "HTTP/1.1" and "HTTP/2.0", and tls_config.insecure_skip_verify = true.
#   * configs/monitoring/pull.nix:
#       - imports ./blackbox.nix as blackboxConfig (next to lokiConfig);
#       - adds five alert rules, each with for = "5m": ProbeFailed, SlowProbe,
#         HttpStatusCode, SslExpirySoon and SslExpiry;
#       - removes the commented-out nginx exporter target from the "makanek" job;
#       - adds a "blackbox" scrape job (metrics_path "/probe", module "http_2xx").
#         Its relabel_configs copy __address__ into __param_target, copy
#         __param_target into the instance label, and then overwrite __address__
#         with 127.0.0.1:<services.prometheus.exporters.blackbox.port>;
#       - enables services.prometheus.exporters.blackbox, with configFile built
#         from blackboxConfig via (pkgs.formats.yaml {}).generate "blackbox.yaml".
#
# NOTE(review): the SslExpirySoon condition (remaining < 86400 * 30) is also true
#   whenever the SslExpiry condition (remaining <= 0) is true, so both
#   expressions match for an already-expired certificate -- confirm that is intended.
# NOTE(review): tls_config.insecure_skip_verify = true turns certificate
#   verification off for the probe; presumably taken over from the two configs
#   linked in blackbox.nix -- confirm it is wanted here.
# NOTE(review): the only target, "alew.hu-berlin.de", carries no URL scheme while
#   fail_if_not_ssl = true -- verify the probe actually ends up on HTTPS.
# NOTE(review): the patch text below appears to have lost its line breaks (the
#   SslExpiry expr string is split across two physical lines), so the hunk line
#   counts cannot be checked against this layout -- TODO restore from the
#   original commit before applying.
diff --git a/configs/monitoring/blackbox.nix b/configs/monitoring/blackbox.nix new file mode 100644 index 0000000..8a755c0 --- /dev/null +++ b/configs/monitoring/blackbox.nix @@ -0,0 +1,17 @@ +# https://github.com/Fluepke/nix-files/blob/2be70b76a198afaa7763132fed645a3c19d5af6e/configuration/common/blackbox-exporter.yml +# https://github.com/xHain-hackspace/xhain-nixfiles/blob/0d6e3b87a07317c2d54cccabf4f90da589319e2c/common/prometheus/blackbox-exporter.yml +{ + modules.http_2xx = { + http = { + fail_if_not_ssl = true; + ip_protocol_fallback = false; + method = "GET"; + no_follow_redirects = false; + preferred_ip_protocol = "ip4"; + valid_http_versions = [ "HTTP/1.1" "HTTP/2.0" ]; + tls_config.insecure_skip_verify = true; + }; + prober = "http"; + timeout = "15s"; + }; +} diff --git a/configs/monitoring/pull.nix b/configs/monitoring/pull.nix index a0e105b..73eeb03 100644 --- a/configs/monitoring/pull.nix +++ b/configs/monitoring/pull.nix @@ -1,6 +1,7 @@ { lib, config, pkgs, ... }: let lokiConfig = import ./loki.nix; + blackboxConfig = import ./blackbox.nix; in { services.grafana = { @@ -78,6 +79,36 @@ in expr = "time() - node_boot_time_seconds < 300"; annotations.summary = "{{$labels.job}}: Reboot"; } + { + alert = "ProbeFailed"; + expr = "probe_success == 0"; + for = "5m"; + annotations.summary = "{{$labels.instance}}: probe failed"; + } + { + alert = "SlowProbe"; + expr = "avg_over_time(probe_http_duration_seconds[1m]) > 1"; + for = "5m"; + annotations.summary = "{{$labels.instance}}: HTTP probe slow"; + } + { + alert = "HttpStatusCode"; + expr = "probe_http_status_code <= 199 OR probe_http_status_code >= 400"; + for = "5m"; + annotations.summary = "{{$labels.instance}}: returns {{$value}}"; + } + { + alert = "SslExpirySoon"; + expr = "probe_ssl_earliest_cert_expiry - time() < 86400 * 30"; + for = "5m"; + annotations.summary = "{{$labels.instance}}: SSL certificate expires in 30 days"; + } + { + alert = "SslExpiry"; + expr = "probe_ssl_earliest_cert_expiry - time() 
<= 0"; + for = "5m"; + annotations.summary = "{{$labels.instance}}: SSL certificate has expired"; + } ]; }]; })]; @@ -157,15 +188,35 @@ in job_name = "makanek"; static_configs = [ { targets = [ "127.0.0.1:${toString config.services.prometheus.exporters.node.port}" - # "127.0.0.1:${toString config.services.prometheus.exporters.nginx.port}" ]; } ]; } + { + job_name = "blackbox"; + metrics_path = "/probe"; + params.module = [ "http_2xx" ]; + relabel_configs = [ + { source_labels = ["__address__"]; target_label = "__param_target"; } + { source_labels = ["__param_target"]; target_label = "instance"; } + { replacement = "127.0.0.1:${toString config.services.prometheus.exporters.blackbox.port}"; target_label = "__address__"; } + ]; + static_configs = [{ + targets = [ + "alew.hu-berlin.de" + ]; + }]; + } { job_name = "zaatar"; static_configs = [ { targets = [ "zaatar.r:${toString config.services.prometheus.exporters.node.port}" ]; } ]; } ]; + + services.prometheus.exporters.blackbox = { + enable = true; + configFile = (pkgs.formats.yaml {}).generate "blackbox.yaml" blackboxConfig; + }; + networking.firewall.allowedTCPPorts = [ lokiConfig.server.http_listen_port ];