From b549947a397808823d10aa6870bc393bc3fc4640 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sandro=20J=C3=A4ckel?= Date: Sun, 18 Dec 2022 00:42:00 +0100 Subject: [PATCH] Extract rules to extra file, round oom kills --- hosts/prometheus/default.nix | 90 +----------------------------------- hosts/prometheus/rules.yaml | 84 +++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 88 deletions(-) create mode 100644 hosts/prometheus/rules.yaml diff --git a/hosts/prometheus/default.nix b/hosts/prometheus/default.nix index b87bf371..f19241d9 100644 --- a/hosts/prometheus/default.nix +++ b/hosts/prometheus/default.nix @@ -57,94 +57,8 @@ }]; }]; - rules = [ - /* yaml */ '' - groups: - - name: default - rules: - - alert: oom_kills - expr: increase(node_vmstat_oom_kill[7d]) >= 1 - for: 10m - labels: - severity: error - annotations: - summary: service gets oom killed - - - alert: disk_almost_full - expr: collectd_df_df_complex{type="free"} < 1024^3 - for: 10m - labels: - severity: warn - annotations: - summary: filesystem has less than 1GB of free space - - - alert: disk_full - expr: collectd_df_df_complex{type="free"} < 100 * 1024^2 - for: 10m - labels: - severity: warn - annotations: - summary: filesystem has less than 100MB of free space - - - alert: disk_free - expr: predict_linear(collectd_df_df_complex{type="free"}[1h], 8*3600) < 0 - for: 10m - labels: - severity: warn - annotations: - summary: filesystem will be full within 8h - - - alert: load1 - expr: avg_over_time(node_load1[15m]) > 50 - for: 10m - labels: - severity: warn - annotations: - summary: instance has high load avg - - - alert: systemd_unit_failed - expr: increase(node_systemd_unit_state{state="failed"}[7d]) >= 1 - for: 10m - labels: - severity: error - annotations: - summary: service fails - - - name: network - rules: - - alert: load1 - expr: avg_over_time(collectd_load_0[15m]) > 1 - for: 10m - labels: - severity: warn - annotations: - summary: network device has high load avg - - - alert: memory_free - expr: collectd_memory{memory="free"} < 4 * 1024^2 - for: 10m - labels: - severity: warn - annotations: - summary: memory full - - - alert: throughput0 - expr: increase(collectd_interface_if_octets_0_total[10m]) > 10 * 3600 * 1024^2 - for: 2h - labels: - severity: warn - annotations: - summary: sustained throughput - - - alert: throughput1 - expr: increase(collectd_interface_if_octets_1_total[10m]) > 10 * 3600 * 1024^2 - for: 2h - labels: - severity: warn - annotations: - summary: sustained throughput - '' - ]; + enableReload = true; + ruleFiles = [ ./rules.yaml ]; scrapeConfigs = [{ # TODO: authorization? diff --git a/hosts/prometheus/rules.yaml b/hosts/prometheus/rules.yaml new file mode 100644 index 00000000..88d730d7 --- /dev/null +++ b/hosts/prometheus/rules.yaml @@ -0,0 +1,84 @@ +groups: + - name: default + rules: + - alert: oom_kills + expr: round(increase(node_vmstat_oom_kill[7d])) >= 1 + for: 10m + labels: + severity: error + annotations: + summary: service gets oom killed + + - alert: disk_almost_full + expr: collectd_df_df_complex{type="free"} < 1024^3 + for: 10m + labels: + severity: warn + annotations: + summary: filesystem has less than 1GB of free space + + - alert: disk_full + expr: collectd_df_df_complex{type="free"} < 100 * 1024^2 + for: 10m + labels: + severity: warn + annotations: + summary: filesystem has less than 100MB of free space + + - alert: disk_free + expr: predict_linear(collectd_df_df_complex{type="free"}[1h], 8*3600) < 0 + for: 10m + labels: + severity: warn + annotations: + summary: filesystem will be full within 8h + + - alert: load1 + expr: avg_over_time(node_load1[15m]) > 50 + for: 10m + labels: + severity: warn + annotations: + summary: instance has high load avg + + - alert: systemd_unit_failed + expr: increase(node_systemd_unit_state{state="failed"}[7d]) >= 1 + for: 10m + labels: + severity: error + annotations: + summary: service fails + + - name: network + rules: + - alert: load1 + expr: avg_over_time(collectd_load_0[15m]) > 1 + for: 10m + labels: + severity: warn + annotations: + summary: network device has high load avg + + - alert: memory_free + expr: collectd_memory{memory="free"} < 4 * 1024^2 + for: 10m + labels: + severity: warn + annotations: + summary: memory full + + - alert: throughput0 + expr: increase(collectd_interface_if_octets_0_total[10m]) > 10 * 3600 * 1024^2 + for: 2h + labels: + severity: warn + annotations: + summary: sustained throughput + + - alert: throughput1 + expr: increase(collectd_interface_if_octets_1_total[10m]) > 10 * 3600 * 1024^2 + for: 2h + labels: + severity: warn + annotations: + summary: sustained throughput