diff --git a/configs/monitoring/loki.nix b/configs/monitoring/loki.nix
new file mode 100644
index 0000000..4158bed
--- /dev/null
+++ b/configs/monitoring/loki.nix
@@ -0,0 +1,60 @@
+{
+  # Loki single-binary configuration, written as a Nix attribute set so other
+  # modules can read values from it (pull.nix renders it to YAML via
+  # pkgs.formats.yaml and reuses server.http_listen_port for the firewall).
+  auth_enabled = false;
+  server = {
+    http_listen_port = 3100;
+    grpc_listen_port = 9096;
+  };
+  ingester = {
+    wal = {
+      enabled = true;
+      # NOTE(review): a WAL under /tmp is lost on reboot, which defeats its
+      # purpose of surviving a crash/restart — confirm this is intended.
+      dir = "/tmp/wal";
+    };
+    lifecycler = {
+      address = "127.0.0.1";
+      ring = {
+        # Single-node setup: in-memory ring, no replication.
+        kvstore.store = "inmemory";
+        replication_factor = 1;
+      };
+      final_sleep = "0s";
+    };
+    chunk_idle_period = "1h"; # Any chunk not receiving new logs in this time will be flushed
+    max_chunk_age = "1h"; # All chunks will be flushed when they hit this age, default is 1h
+    chunk_target_size = 1048576; # Loki will attempt to build chunks up to 1.5MB, flushing first if chunk_idle_period or max_chunk_age is reached first
+    chunk_retain_period = "30s"; # Must be greater than index read cache TTL if using an index cache (Default index read cache TTL is 5m)
+    max_transfer_retries = 0; # Chunk transfers disabled
+  };
+  schema_config.configs = [
+    {
+      from = "2020-10-24";
+      store = "boltdb-shipper";
+      object_store = "filesystem";
+      schema = "v11";
+      index = {
+        prefix = "index_";
+        period = "24h";
+      };
+    }
+  ];
+  storage_config = {
+    boltdb_shipper = {
+      # NOTE(review): index and chunk storage also live under /tmp and will not
+      # survive a reboot — all ingested logs would be lost. Confirm intended.
+      active_index_directory = "/tmp/loki/boltdb-shipper-active";
+      cache_location = "/tmp/loki/boltdb-shipper-cache";
+      cache_ttl = "24h"; # Can be increased for faster performance over longer query periods, uses more disk space
+      shared_store = "filesystem";
+    };
+    filesystem.directory = "/tmp/loki/chunks";
+  };
+  compactor = {
+    working_directory = "/tmp/loki/boltdb-shipper-compactor";
+    shared_store = "filesystem";
+  };
+  limits_config = {
+    reject_old_samples = true;
+    reject_old_samples_max_age = "168h";
+  };
+  chunk_store_config.max_look_back_period = "0s";
+  table_manager = {
+    # Retention is disabled; logs are kept indefinitely (within /tmp lifetime).
+    retention_deletes_enabled = false;
+    retention_period = "0s";
+  };
+}
diff --git a/configs/monitoring/pull.nix b/configs/monitoring/pull.nix
new file mode 
100644
index 0000000..f439b29
--- /dev/null
+++ b/configs/monitoring/pull.nix
@@ -0,0 +1,135 @@
+# "Pull" half of the monitoring stack: Grafana behind nginx, Prometheus with
+# alert rules and Alertmanager, scrape targets, the Loki server, and a
+# Telegram bot that forwards Alertmanager notifications.
+{ lib, config, pkgs, ... }:
+let
+  lokiConfig = import ./loki.nix;
+in
+{
+  services.grafana = {
+    enable = true;
+    domain = "monitoring.xn--kiern-0qa.de";
+    # Grafana only listens on loopback; the nginx vhost below terminates TLS
+    # and proxies to it.
+    port = 2342;
+    addr = "127.0.0.1";
+  };
+
+  services.nginx.virtualHosts.${config.services.grafana.domain} = {
+    enableACME = true;
+    forceSSL = true;
+    locations."/" = {
+      proxyPass = "http://127.0.0.1:${toString config.services.grafana.port}";
+      proxyWebsockets = true;
+    };
+  };
+
+  # Alerting rules are built as JSON (valid YAML) via builtins.toJSON.
+  services.prometheus.rules = let diskFreeThreshold = 10; in [(builtins.toJSON {
+    groups = [{
+      name = "niveum";
+      rules = [
+        {
+          alert = "ServiceDown";
+          for = "5m";
+          expr = ''node_systemd_unit_state{state="failed"} == 1'';
+          labels.severity = "warning";
+          annotations = {
+            summary = "{{ $labels.name }} is down.";
+          };
+        }
+        {
+          alert = "RootPartitionFull";
+          for = "30m";
+          expr = ''(node_filesystem_avail_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < ${toString diskFreeThreshold}'';
+          labels.severity = "warning";
+          annotations = {
+            summary = "{{ $labels.job }} root disk full.";
+            description = ''The root disk of {{ $labels.job }} has {{ $value | printf "%.2f" }}% free disk space (threshold at ${toString diskFreeThreshold}%).'';
+          };
+        }
+        {
+          alert = "HostDown";
+          expr = ''up == 0'';
+          for = "5m";
+          labels.severity = "warning";
+          annotations = {
+            summary = "Host {{ $labels.job }} down for 5 minutes.";
+          };
+        }
+      ];
+    }];
+  })];
+
+  systemd.services.alertmanager-bot-telegram =
+    let
+      # Pinned build of metalmatze/alertmanager-bot (not packaged in nixpkgs).
+      alertmanager-bot-telegram = pkgs.buildGoModule rec {
+        pname = "alertmanager-bot";
+        version = "2020-07-13";
+        src = pkgs.fetchFromGitHub {
+          owner = "metalmatze";
+          repo = "alertmanager-bot";
+          rev = "5efc0bbbf8023d4324e9da98562f064a714a7206";
+          sha256 = "09cciml1j8x76jpm2v5v6h2q6j1fkhsz1kswslmx8wl4wk40xgp4";
+        };
+        vendorSha256 = "1v0fgin8dn81b559zz4lqmrl7hikr46g4gb18sci4riql5qs1isj";
+        postInstall = ''
+          install -D 
./default.tmpl $out/templates/default.tmpl
+        '';
+      };
+    in {
+      wantedBy = [ "multi-user.target" ];
+      after = [ "ip-up.target" ];
+      environment.TELEGRAM_ADMIN = "18980945";
+      # FIXME(review): the path argument after fileContents appears to have
+      # been stripped (likely an angle-bracket path such as
+      # <secrets/telegram-token>); as written this does not evaluate.
+      # NOTE(review): fileContents embeds the token in the built unit file /
+      # Nix store at evaluation time — confirm that is acceptable here.
+      environment.TELEGRAM_TOKEN = lib.strings.fileContents ;
+      serviceConfig = {
+        DynamicUser = true;
+        StateDirectory = "alertbot";
+        # Bot listens on 16320 for Alertmanager webhooks (see receiver below)
+        # and talks back to Alertmanager on 9093.
+        ExecStart = ''${alertmanager-bot-telegram}/bin/alertmanager-bot \
+          --alertmanager.url=http://localhost:9093 --log.level=info \
+          --store=bolt --bolt.path=/var/lib/alertbot/bot.db \
+          --listen.addr="0.0.0.0:16320" \
+          --template.paths=${./template.tmpl}'';
+      };
+    };
+
+  services.prometheus.alertmanager = {
+    enable = true;
+    listenAddress = "localhost";
+    configuration = {
+      route = {
+        group_wait = "30s";
+        repeat_interval = "4h";
+        receiver = "me";
+      };
+      receivers = [{
+        name = "me";
+        # Forward every alert to the Telegram bot's webhook listener.
+        webhook_configs = [{
+          url = "http://localhost:16320";
+          send_resolved = true;
+        }];
+      }];
+    };
+  };
+
+  services.prometheus.alertmanagers = [{
+    scheme = "http";
+    path_prefix = "/";
+    static_configs = [ { targets = [ "localhost:9093" ]; } ];
+  }];
+
+  services.prometheus.scrapeConfigs = [
+    {
+      # Local host (this machine) — node exporter only for now.
+      job_name = "makanek";
+      static_configs = [ { targets = [
+        "127.0.0.1:${toString config.services.prometheus.exporters.node.port}"
+        # "127.0.0.1:${toString config.services.prometheus.exporters.nginx.port}"
+      ]; } ];
+    }
+    {
+      job_name = "zaatar";
+      static_configs = [ { targets = [ "zaatar.r:${toString config.services.prometheus.exporters.node.port}" ]; } ];
+    }
+  ];
+
+  # Open Loki's HTTP port so remote promtail instances (push.nix) can push.
+  networking.firewall.allowedTCPPorts = [ lokiConfig.server.http_listen_port ];
+
+  services.loki = {
+    enable = true;
+    configFile = (pkgs.formats.yaml {}).generate "loki.yaml" lokiConfig;
+  };
+}
diff --git a/configs/monitoring/push.nix b/configs/monitoring/push.nix
new file mode 100644
index 0000000..5ceb1aa
--- /dev/null
+++ b/configs/monitoring/push.nix
@@ -0,0 +1,75 @@
+{ config, pkgs, ... 
}:
+# "Push" half of the monitoring stack, imported on every monitored host:
+# node exporter for Prometheus to scrape, and promtail shipping the systemd
+# journal to the Loki server.
+{
+  services.nginx.virtualHosts.default = {
+    # Expose nginx's stub_status (for the — currently disabled — nginx exporter).
+    locations."= /stub_status".extraConfig = "stub_status;";
+  };
+
+  services.prometheus = {
+    enable = true;
+    port = 9001;
+    exporters = {
+      nginx.enable = false;
+      node = {
+        enable = true;
+        enabledCollectors = [
+          "conntrack"
+          "diskstats"
+          "entropy"
+          "filefd"
+          "filesystem"
+          "loadavg"
+          "mdadm"
+          "meminfo"
+          "netdev"
+          "netstat"
+          "stat"
+          "time"
+          "vmstat"
+          "systemd"
+          "logind"
+          "interrupts"
+          "ksmd"
+        ];
+        port = 9002;
+      };
+    };
+  };
+
+  # Let the pull host scrape the node exporter remotely.
+  networking.firewall.allowedTCPPorts = [ config.services.prometheus.exporters.node.port ];
+
+  systemd.services.promtail = {
+    description = "Promtail service for Loki";
+    wantedBy = [ "multi-user.target" ];
+
+    serviceConfig = {
+      # Promtail config is generated inline as YAML and baked into ExecStart.
+      ExecStart = ''
+        ${pkgs.grafana-loki}/bin/promtail --config.file ${(pkgs.formats.yaml {}).generate "promtail.yaml" {
+          server = {
+            http_listen_port = 28183;
+            grpc_listen_port = 0;
+          };
+          # NOTE(review): positions file under /tmp resets on reboot, so the
+          # journal may be re-shipped from scratch — confirm intended.
+          positions.filename = "/tmp/positions.yaml";
+          clients = [
+            # Push locally on the Loki host itself, over the network otherwise.
+            # NOTE(review): 3100 duplicates loki.nix's server.http_listen_port;
+            # keep the two in sync.
+            { url = "http://${if config.networking.hostName == "makanek" then "127.0.0.1" else "makanek.r"}:3100/loki/api/v1/push"; }
+          ];
+          scrape_configs = [
+            {
+              job_name = "journal";
+              journal = {
+                max_age = "12h";
+                labels.job = "systemd-journal";
+                labels.host = config.networking.hostName;
+              };
+              # Copy the systemd unit name into a queryable "unit" label.
+              relabel_configs = [
+                {
+                  source_labels = [ "__journal__systemd_unit" ];
+                  target_label = "unit";
+                }
+              ];
+            }
+          ];
+        }}
+      '';
+    };
+  };
+}
diff --git a/configs/monitoring/template.tmpl b/configs/monitoring/template.tmpl
new file mode 100644
index 0000000..9f9b72e
--- /dev/null
+++ b/configs/monitoring/template.tmpl
@@ -0,0 +1,25 @@
+{{ define "telegram.default" }}
+{{range .Alerts -}}
+{{ $severity := index .Labels "severity" }}
+{{ $desc := "" }}
+{{ $grafana := "d/alpUteInz/niveum" }}
+{{ if eq .Status "firing" }}
+  {{ $desc = index .Annotations "description" }}
+  {{ $grafana = index .Annotations "url" }}
+  {{- if eq $severity "critical" -}}
+  🔥 CRITICAL 🔥
+  {{- else if eq $severity "warning" -}}
+  ⚠ WARNING ⚠ 
+  {{- else -}}
+    {{ $severity }}
+  {{- end -}}
+{{ else -}}
+  {{ $desc = "The issue has been resolved. 😌" }}
+  🎉 RESOLVED 🎉
+{{- end }}
+{{ index .Labels "alertname"}}
+{{ index .Annotations "summary"}}: {{ $desc }}
+
+{{/* FIXME(review): $grafana is assigned above but never referenced — link
+     markup (e.g. an <a href> built from $grafana) around "See on Grafana"
+     appears to have been stripped from this file. */}}
+See on Grafana.
+{{end -}}
+{{end}}
diff --git a/systems/makanek/configuration.nix b/systems/makanek/configuration.nix
index 3523aca..6e2ca45 100644
--- a/systems/makanek/configuration.nix
+++ b/systems/makanek/configuration.nix
@@ -53,6 +53,8 @@ in
+# FIXME(review): the two lines added by this hunk are empty — the import
+# entries (probably angle-bracket paths such as
+# <niveum/configs/monitoring/pull.nix>) appear to have been stripped.
+
+
diff --git a/systems/zaatar/configuration.nix b/systems/zaatar/configuration.nix
index dd1b543..2e23cf6 100644
--- a/systems/zaatar/configuration.nix
+++ b/systems/zaatar/configuration.nix
@@ -13,6 +13,7 @@
+# FIXME(review): the line added by this hunk is empty — likely a stripped
+# angle-bracket import such as <niveum/configs/monitoring/push.nix>.
+
 { nixpkgs.config.allowUnfree = true;