From fe4deea101bf75e50ba6b06938594a28604d5e72 Mon Sep 17 00:00:00 2001
From: Astro
Date: Mon, 12 Dec 2022 02:16:31 +0100
Subject: [PATCH] prometheus: add some alerts

---
Review notes. Everything between the "---" line above and the "diff --git"
line below is commentary and does not become part of the commit.

What the hunk adds to hosts/prometheus/default.nix:

  * Three rules appended to the rule group that is already open at the top
    of the hunk (its name is outside this hunk):
      disk_free            collectd_df_df_complex{type="free"} below
                           1024*1024*1024, held for 10m, severity warn
      load1                node_load1 above 100, held for 1m, severity warn
      systemd_unit_failed  node_systemd_unit_state{state="failed"} above 0,
                           held for 10m, severity error

  * A new rule group named "network" with four rules, all severity warn:
      load1                collectd_load_0 above 4, held for 1m
      memory_free          collectd_memory{memory="free"} below 4*1024*1024,
                           held for 10m
      throughput0          increase(collectd_interface_if_octets_0_total[10m])
      throughput1          increase(collectd_interface_if_octets_1_total[10m])
                           each above 600 * 60 * 1024 * 1024, held for 2h

  * The throughput threshold is written as 600 * 60 * 1024 * 1024 against a
    10m (600 second) window, i.e. an average of 60 * 1024 * 1024 per second
    over the window. Presumably the unit is bytes, going by "octets" in the
    metric name; confirm against the collectd exporter.

NOTE(review): the alert name "load1" is used twice, once in the preceding
group (node_load1 > 100) and once in "network" (collectd_load_0 > 4), with
the same summary text. Confirm this is intentional, since notifications can
only tell the two apart by their other labels.

NOTE(review): this copy of the patch had lost its line breaks and leading
whitespace. The line structure below is reconstructed and the indentation
of the context and added lines is a guess; verify it against
hosts/prometheus/default.nix before applying (or apply with
--ignore-whitespace).

NOTE(review): the hunk header declares 6 context lines but only 5 are
visible here (3 before, 2 after). A trailing context line, possibly blank,
and the usual format-patch signature appear to be missing; confirm against
the original patch.

 hosts/prometheus/default.nix | 51 ++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/hosts/prometheus/default.nix b/hosts/prometheus/default.nix
index c2ca5569..f994e9c1 100644
--- a/hosts/prometheus/default.nix
+++ b/hosts/prometheus/default.nix
@@ -61,6 +61,57 @@
             severity: error
           annotations:
             summary: service gets oom killed
+        - alert: disk_free
+          expr: collectd_df_df_complex{type="free"} < 1024*1024*1024
+          for: 10m
+          labels:
+            severity: warn
+          annotations:
+            summary: filesystem has less than 1GB of free space
+        - alert: load1
+          expr: node_load1 > 100
+          for: 1m
+          labels:
+            severity: warn
+          annotations:
+            summary: high loadavg
+        - alert: systemd_unit_failed
+          expr: node_systemd_unit_state{state="failed"} > 0
+          for: 10m
+          labels:
+            severity: error
+          annotations:
+            summary: failed systemd units
+      - name: network
+        rules:
+        - alert: load1
+          expr: collectd_load_0 > 4
+          for: 1m
+          labels:
+            severity: warn
+          annotations:
+            summary: high loadavg
+        - alert: memory_free
+          expr: collectd_memory{memory="free"} < 4*1024*1024
+          for: 10m
+          labels:
+            severity: warn
+          annotations:
+            summary: memory full
+        - alert: throughput0
+          expr: increase(collectd_interface_if_octets_0_total[10m]) > 600 * 60 * 1024 * 1024
+          for: 2h
+          labels:
+            severity: warn
+          annotations:
+            summary: sustained throughput
+        - alert: throughput1
+          expr: increase(collectd_interface_if_octets_1_total[10m]) > 600 * 60 * 1024 * 1024
+          for: 2h
+          labels:
+            severity: warn
+          annotations:
+            summary: sustained throughput
     ''
   ];