From c2e19af7e8d5d30a558f7539fd81333f960561c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sandro=20J=C3=A4ckel?= Date: Sat, 17 Dec 2022 22:04:45 +0100 Subject: [PATCH] Cleanup and improve prometheus alerts --- hosts/prometheus/default.nix | 55 +++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/hosts/prometheus/default.nix b/hosts/prometheus/default.nix index 38a8c2d7..b87bf371 100644 --- a/hosts/prometheus/default.nix +++ b/hosts/prometheus/default.nix @@ -58,63 +58,86 @@ }]; rules = [ - '' + /* yaml */ '' groups: - name: default rules: - alert: oom_kills - expr: increase(node_vmstat_oom_kill[7d]) > 0.999 + expr: increase(node_vmstat_oom_kill[7d]) >= 1 for: 10m labels: severity: error annotations: summary: service gets oom killed - - alert: disk_free - expr: collectd_df_df_complex{type="free"} < 1024*1024*1024 + + - alert: disk_almost_full + expr: collectd_df_df_complex{type="free"} < 1024^3 for: 10m labels: severity: warn annotations: summary: filesystem has less than 1GB of free space - - alert: load1 - expr: node_load1 > 100 - for: 1m + + - alert: disk_full + expr: collectd_df_df_complex{type="free"} < 100 * 1024^2 + for: 10m labels: severity: warn annotations: - summary: high loadavg + summary: filesystem has less than 100MB of free space + + - alert: disk_free + expr: predict_linear(collectd_df_df_complex{type="free"}[1h], 8*3600) < 0 + for: 10m + labels: + severity: warn + annotations: + summary: filesystem will be full within 8h + + - alert: load1 + expr: avg_over_time(node_load1[15m]) > 50 + for: 10m + labels: + severity: warn + annotations: + summary: instance has high load avg + - alert: systemd_unit_failed - expr: node_systemd_unit_state{state="failed"} > 0 + expr: increase(node_systemd_unit_state{state="failed"}[7d]) >= 1 for: 10m labels: severity: error annotations: - summary: failed systemd units + summary: service fails + - name: network rules: - alert: load1 - expr: collectd_load_0 > 4 - for: 1m + expr: avg_over_time(collectd_load_0[15m]) > 1 + for: 10m labels: severity: warn annotations: - summary: high loadavg + summary: network device has high load avg + - alert: memory_free - expr: collectd_memory{memory="free"} < 4*1024*1024 + expr: collectd_memory{memory="free"} < 4 * 1024^2 for: 10m labels: severity: warn annotations: summary: memory full + - alert: throughput0 - expr: increase(collectd_interface_if_octets_0_total[10m]) > 600 * 60 * 1024 * 1024 + expr: increase(collectd_interface_if_octets_0_total[10m]) > 10 * 3600 * 1024^2 for: 2h labels: severity: warn annotations: summary: sustained throughput + - alert: throughput1 - expr: increase(collectd_interface_if_octets_1_total[10m]) > 600 * 60 * 1024 * 1024 + expr: increase(collectd_interface_if_octets_1_total[10m]) > 10 * 3600 * 1024^2 for: 2h labels: severity: warn