prometheus: add some alerts
This commit is contained in:
parent
8f192b81ff
commit
fe4deea101
|
@ -61,6 +61,57 @@
|
||||||
severity: error
|
severity: error
|
||||||
annotations:
|
annotations:
|
||||||
summary: service gets oom killed
|
summary: service gets oom killed
|
||||||
|
- alert: disk_free
|
||||||
|
expr: collectd_df_df_complex{type="free"} < 1024*1024*1024
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warn
|
||||||
|
annotations:
|
||||||
|
summary: filesystem has less than 1GB of free space
|
||||||
|
- alert: load1
|
||||||
|
expr: node_load1 > 100
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: warn
|
||||||
|
annotations:
|
||||||
|
summary: high loadavg
|
||||||
|
- alert: systemd_unit_failed
|
||||||
|
expr: node_systemd_unit_state{state="failed"} > 0
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: error
|
||||||
|
annotations:
|
||||||
|
summary: failed systemd units
|
||||||
|
- name: network
|
||||||
|
rules:
|
||||||
|
- alert: load1
|
||||||
|
expr: collectd_load_0 > 4
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: warn
|
||||||
|
annotations:
|
||||||
|
summary: high loadavg
|
||||||
|
- alert: memory_free
|
||||||
|
expr: collectd_memory{memory="free"} < 4*1024*1024
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warn
|
||||||
|
annotations:
|
||||||
|
summary: less than 4MiB of free memory
|
||||||
|
- alert: throughput0
|
||||||
|
expr: increase(collectd_interface_if_octets_0_total[10m]) > 600 * 60 * 1024 * 1024
|
||||||
|
for: 2h
|
||||||
|
labels:
|
||||||
|
severity: warn
|
||||||
|
annotations:
|
||||||
|
summary: sustained throughput above 60MiB/s (if_octets_0)
|
||||||
|
- alert: throughput1
|
||||||
|
expr: increase(collectd_interface_if_octets_1_total[10m]) > 600 * 60 * 1024 * 1024
|
||||||
|
for: 2h
|
||||||
|
labels:
|
||||||
|
severity: warn
|
||||||
|
annotations:
|
||||||
|
summary: sustained throughput above 60MiB/s (if_octets_1)
|
||||||
''
|
''
|
||||||
];
|
];
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue