Clean up and improve Prometheus alerts

This commit is contained in:
Sandro - 2022-12-17 22:04:45 +01:00
parent 24329a062e
commit c2e19af7e8
Signed by: sandro
GPG Key ID: 3AF5A43A3EECC2E5
1 changed file with 39 additions and 16 deletions

View File

@ -58,63 +58,86 @@
}];
rules = [
''
/* yaml */ ''
groups:
- name: default
rules:
- alert: oom_kills
expr: increase(node_vmstat_oom_kill[7d]) > 0.999
expr: increase(node_vmstat_oom_kill[7d]) >= 1
for: 10m
labels:
severity: error
annotations:
summary: service gets oom killed
- alert: disk_free
expr: collectd_df_df_complex{type="free"} < 1024*1024*1024
- alert: disk_almost_full
expr: collectd_df_df_complex{type="free"} < 1024^3
for: 10m
labels:
severity: warn
annotations:
summary: filesystem has less than 1GB of free space
- alert: load1
expr: node_load1 > 100
for: 1m
- alert: disk_full
expr: collectd_df_df_complex{type="free"} < 100 * 1024^2
for: 10m
labels:
severity: warn
annotations:
summary: high loadavg
summary: filesystem has less than 100MB of free space
- alert: disk_free
expr: predict_linear(collectd_df_df_complex{type="free"}[1h], 8*3600) < 0
for: 10m
labels:
severity: warn
annotations:
summary: filesystem will be full within 8h
- alert: load1
expr: avg_over_time(node_load1[15m]) > 50
for: 10m
labels:
severity: warn
annotations:
summary: instance has high load avg
- alert: systemd_unit_failed
expr: node_systemd_unit_state{state="failed"} > 0
expr: increase(node_systemd_unit_state{state="failed"}[7d]) >= 1
for: 10m
labels:
severity: error
annotations:
summary: failed systemd units
summary: service fails
- name: network
rules:
- alert: load1
expr: collectd_load_0 > 4
for: 1m
expr: avg_over_time(collectd_load_0[15m]) > 1
for: 10m
labels:
severity: warn
annotations:
summary: high loadavg
summary: network device has high load avg
- alert: memory_free
expr: collectd_memory{memory="free"} < 4*1024*1024
expr: collectd_memory{memory="free"} < 4 * 1024^2
for: 10m
labels:
severity: warn
annotations:
summary: memory full
- alert: throughput0
expr: increase(collectd_interface_if_octets_0_total[10m]) > 600 * 60 * 1024 * 1024
expr: increase(collectd_interface_if_octets_0_total[10m]) > 10 * 3600 * 1024^2
for: 2h
labels:
severity: warn
annotations:
summary: sustained throughput
- alert: throughput1
expr: increase(collectd_interface_if_octets_1_total[10m]) > 600 * 60 * 1024 * 1024
expr: increase(collectd_interface_if_octets_1_total[10m]) > 10 * 3600 * 1024^2
for: 2h
labels:
severity: warn