2022-12-18 00:42:00 +01:00
|
|
|
groups:
|
|
|
|
- name: default
|
|
|
|
rules:
|
|
|
|
# Fires when the rounded increase of node_vmstat_oom_kill over the last day
# is at least 1 and that condition has held for 10 minutes.
- alert: oom_kills
  expr: 'round(increase(node_vmstat_oom_kill[1d])) >= 1'
  for: 10m
  labels: {severity: error}
  annotations:
    summary: "service gets oom killed"
|
|
|
|
|
|
|
|
# Low-space warning: the "free" series of collectd_df_df_complex is below
# 1024^3 bytes (1 GiB) for 10 minutes. Series whose df label is "boot" are
# excluded by the selector.
- alert: disk_almost_full
  expr: 'collectd_df_df_complex{type="free",df!="boot"} < 1024^3'
  for: 10m
  labels: {severity: warn}
  annotations:
    summary: "filesystem has less than 1GB of free space"
|
|
|
|
|
|
|
|
# Critical-space alert: the "free" series of collectd_df_df_complex is below
# 100 * 1024^2 bytes (100 MiB) for 10 minutes. Series whose df label is
# "boot" are excluded by the selector.
#
# Severity is "error" rather than "warn": any series matching this rule also
# matches the 1 GiB disk_almost_full rule, so keeping both at "warn" would
# make this stricter tier indistinguishable from the early warning.
- alert: disk_full
  expr: collectd_df_df_complex{type="free",df!="boot"} < 100 * 1024^2
  for: 10m
  labels:
    severity: error
  annotations:
    summary: filesystem has less than 100MB of free space
|
|
|
|
|
|
|
|
# Trend-based alert: a linear extrapolation of the last hour of free space,
# projected 8*3600 seconds (8 hours) ahead, drops below zero. The prediction
# must stay negative for 10 minutes. Series whose df label is "boot" are
# excluded by the selector.
- alert: disk_free
  expr: 'predict_linear(collectd_df_df_complex{type="free",df!="boot"}[1h], 8*3600) < 0'
  for: 10m
  labels: {severity: warn}
  annotations:
    summary: "filesystem will be full within 8h"
|
|
|
|
|
|
|
|
# High-load warning: the 15-minute average of node_load1 exceeds 50 and
# stays above that for 10 minutes.
- alert: load1
  expr: 'avg_over_time(node_load1[15m]) > 50'
  for: 10m
  labels: {severity: warn}
  annotations:
    summary: "instance has high load avg"
|
|
|
|
|
|
|
|
# Fires when a systemd unit has reported state="failed" at any point during
# the last 7 days, with the condition held for 10 minutes.
#
# node_systemd_unit_state is queried as a 0/1 gauge, so max_over_time is used
# instead of increase(): increase() is meant for counters and evaluates to 0
# for a unit that stays failed throughout the whole window, which would let
# the alert resolve while the unit is still broken. max_over_time keeps the
# same "sticky for 7 days" behaviour for units that failed and recovered.
- alert: systemd_unit_failed
  expr: max_over_time(node_systemd_unit_state{state="failed"}[7d]) >= 1
  for: 10m
  labels:
    severity: error
  annotations:
    summary: service fails
|
|
|
|
|
|
|
|
- name: network
|
|
|
|
rules:
|
|
|
|
# High-load warning for the network group: the 15-minute average of
# collectd_load_0 exceeds 2 and stays above that for 10 minutes.
# NOTE(review): collectd_load_0 is presumably the 1-minute load value, going
# by the alert name; confirm against the collectd exporter.
- alert: load1
  expr: 'avg_over_time(collectd_load_0[15m]) > 2'
  for: 10m
  labels: {severity: warn}
  annotations:
    summary: "network device has high load avg"
|
|
|
|
|
|
|
|
# Low-memory warning: the "free" series of collectd_memory is below
# 4 * 1024^2 (4 MiB if the metric is in bytes; confirm the unit) for
# 10 minutes.
- alert: memory_free
  expr: 'collectd_memory{memory="free"} < 4 * 1024^2'
  for: 10m
  labels: {severity: warn}
  annotations:
    summary: "memory full"
|
|
|
|
|
|
|
|
# Sustained-traffic warning on collectd_interface_if_octets_0_total: the
# 10-minute increase, rounded to the nearest multiple of 100000000, exceeds
# 10*3600*1024^2 and keeps doing so for 2 hours.
# NOTE(review): the threshold reads like 10 MiB/s over 3600 s, while the
# increase window is 10m; confirm the intended rate.
- alert: throughput0
  expr: 'round(increase(collectd_interface_if_octets_0_total[10m]),100000000) > 10*3600*1024^2'
  for: 2h
  labels: {severity: warn}
  annotations:
    summary: "sustained throughput"
|
|
|
|
|
|
|
|
# Sustained-traffic warning on collectd_interface_if_octets_1_total: the
# 10-minute increase, rounded to the nearest multiple of 100000000, exceeds
# 10*3600*1024^2 and keeps doing so for 2 hours.
# NOTE(review): the threshold reads like 10 MiB/s over 3600 s, while the
# increase window is 10m; confirm the intended rate.
- alert: throughput1
  expr: 'round(increase(collectd_interface_if_octets_1_total[10m]),100000000) > 10*3600*1024^2'
  for: 2h
  labels: {severity: warn}
  annotations:
    summary: "sustained throughput"
|