# NixOS module for the host named "prometheus".
#
# Configures Prometheus with Alertmanager, the collectd and nginx exporters,
# the alert2muc service, an nginx reverse proxy with HTTP basic auth in front
# of Prometheus and Alertmanager, and the sops secrets those services read.
#
# Arguments:
#   zentralwerk - provides `lib.config.site.net`, the per-net host tables used
#                 to derive scrape targets.
#   config      - the evaluated NixOS configuration (ports, secret paths, users).
#   lib         - nixpkgs lib (used for `mkForce`).
{ zentralwerk, config, lib, ... }:

{
  networking = {
    hostName = "prometheus";
    firewall = {
      allowedTCPPorts = [
        # nginx
        80
        443
      ];
      allowedUDPPorts = [
        # services.prometheus.exporters.collectd.collectdBinary
        25826
      ];
      enable = true;
    };
  };

  services.prometheus = {
    enable = true;
    retentionTime = "7d";

    alertmanager = {
      enable = true;
      webExternalUrl = "https://prometheus.serv.zentralwerk.org/alertmanager/";
      listenAddress = "[::1]";
      configuration = {
        global = { };
        # Every alert is routed to the single "xmpp" receiver defined below.
        route = {
          group_by = [ "instance" ];
          group_wait = "1m";
          group_interval = "1m";
          repeat_interval = "4h";
          receiver = "xmpp";
        };
        receivers = [{
          name = "xmpp";
          webhook_configs = [{
            url = "http://127.0.0.1:9022/";
          }];
        }];
      };
    };

    alertmanagers = [
      {
        static_configs = [{
          targets = [
            "localhost:${toString config.services.prometheus.alertmanager.port}"
          ];
        }];
        # Matches the "/alertmanager/" path of alertmanager.webExternalUrl.
        path_prefix = "/alertmanager";
      }
      {
        # Same port as the "xmpp" webhook receiver configured above.
        static_configs = [{
          targets = [ "localhost:9022" ];
        }];
      }
    ];

    rules = [
      ''
        groups:
        - name: default
          rules:
          - alert: oom_kills
            expr: increase(node_vmstat_oom_kill[7d]) > 0.999
            for: 10m
            labels:
              severity: error
            annotations:
              summary: service gets oom killed
          - alert: disk_free
            expr: collectd_df_df_complex{type="free"} < 1024*1024*1024
            for: 10m
            labels:
              severity: warn
            annotations:
              summary: filesystem has less than 1GB of free space
          - alert: load1
            expr: node_load1 > 100
            for: 1m
            labels:
              severity: warn
            annotations:
              summary: high loadavg
          - alert: systemd_unit_failed
            expr: node_systemd_unit_state{state="failed"} > 0
            for: 10m
            labels:
              severity: error
            annotations:
              summary: failed systemd units
        - name: network
          rules:
          - alert: load1
            expr: collectd_load_0 > 4
            for: 1m
            labels:
              severity: warn
            annotations:
              summary: high loadavg
          - alert: memory_free
            expr: collectd_memory{memory="free"} < 4*1024*1024
            for: 10m
            labels:
              severity: warn
            annotations:
              summary: memory full
          - alert: throughput0
            expr: increase(collectd_interface_if_octets_0_total[10m]) > 600 * 60 * 1024 * 1024
            for: 2h
            labels:
              severity: warn
            annotations:
              summary: sustained throughput
          - alert: throughput1
            expr: increase(collectd_interface_if_octets_1_total[10m]) > 600 * 60 * 1024 * 1024
            for: 2h
            labels:
              severity: warn
            annotations:
              summary: sustained throughput
      ''
    ];

    scrapeConfigs = [{
      # TODO: authorization?
      job_name = "node";
      scrape_interval = "1m";
      static_configs =
        let
          zwNets = zentralwerk.lib.config.site.net;

          # Build "<host>.<net>.zentralwerk.org:9100" scrape targets for the
          # hosts listed in `hosts4` of `net`, keeping only those host names
          # for which `keepHost` returns true.
          #
          # The predicate used to be ignored, so excluded hosts (e.g.
          # "flpk-gw") were scraped anyway; it is now applied before mapping.
          fromNet = net: keepHost:
            map
              (host: "${host}.${net}.zentralwerk.org:9100")
              (builtins.filter keepHost (builtins.attrNames zwNets.${net}.hosts4));
        in
        [
          {
            targets = fromNet "serv" (_: true);
            labels.__meta_net = "net-serv";
          }
          {
            targets = fromNet "flpk" (host: host != "flpk-gw");
            labels.__meta_net = "net-flpk";
          }
          {
            targets = fromNet "cluster" (host:
              builtins.elem host [ "server8" "server9" "server10" ]
            );
            # NOTE(review): the label says "net-flpk" although the net is
            # "cluster" - confirm whether "net-cluster" was intended.
            labels.__meta_net = "net-flpk";
          }
          {
            # The local collectd exporter is scraped as part of the same job.
            targets = [
              "localhost:${toString config.services.prometheus.exporters.collectd.port}"
            ];
          }
        ];
    }];

    exporters = {
      collectd = {
        enable = true;
        collectdBinary.enable = true;
      };
      # TODO: deploy with every nginx
      nginx = {
        enable = true;
        openFirewall = true;
      };
    };

    webExternalUrl = "https://prometheus.serv.zentralwerk.org/";
  };

  services.alert2muc = {
    enable = true;
    configFile = config.sops.secrets."alert2muc/config".path;
  };

  services.nginx = {
    enable = true;
    virtualHosts."prometheus.serv.zentralwerk.org" = {
      # serverAliases = [ "registry.serv.zentralwerk.org" ];
      enableACME = true;
      forceSSL = true;
      # Both locations require HTTP basic auth against the same sops-managed
      # user file.
      locations."/" = {
        proxyPass = "http://localhost:${toString config.services.prometheus.port}";
        extraConfig = ''
          auth_basic "Prometheus";
          auth_basic_user_file ${config.sops.secrets."nginx/httpAuth".path};
        '';
      };
      locations."/alertmanager" = {
        proxyPass = "http://localhost:${toString config.services.prometheus.alertmanager.port}";
        extraConfig = ''
          auth_basic "Prometheus";
          auth_basic_user_file ${config.sops.secrets."nginx/httpAuth".path};
        '';
      };
    };
  };

  sops = {
    defaultSopsFile = ./secrets.yaml;
    # Each secret is owned by the user of the service that reads it.
    secrets."nginx/httpAuth".owner =
      config.systemd.services.nginx.serviceConfig.User;
    secrets."alertmanager/xmpp-password".owner =
      config.systemd.services.prometheus-xmpp-alerts.serviceConfig.User;
    secrets."alert2muc/config".owner = config.services.alert2muc.user;
  };

  system.stateVersion = "22.11";

  # Run prometheus-xmpp-alerts as the static "prometheus" user instead of a
  # dynamic one; the sops secret owner above is taken from this User value.
  systemd.services.prometheus-xmpp-alerts.serviceConfig = {
    DynamicUser = lib.mkForce false;
    User = "prometheus";
  };
}