Extract Prometheus alert rules into a separate rules.yaml file; round the OOM-kill increase
This commit is contained in:
parent
c2e19af7e8
commit
b549947a39
|
@ -57,94 +57,8 @@
|
|||
}];
|
||||
}];
|
||||
|
||||
rules = [
|
||||
/* yaml */ ''
|
||||
groups:
|
||||
- name: default
|
||||
rules:
|
||||
- alert: oom_kills
|
||||
expr: increase(node_vmstat_oom_kill[7d]) >= 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: error
|
||||
annotations:
|
||||
summary: service gets oom killed
|
||||
|
||||
- alert: disk_almost_full
|
||||
expr: collectd_df_df_complex{type="free"} < 1024^3
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warn
|
||||
annotations:
|
||||
summary: filesystem has less than 1GB of free space
|
||||
|
||||
- alert: disk_full
|
||||
expr: collectd_df_df_complex{type="free"} < 100 * 1024^2
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warn
|
||||
annotations:
|
||||
summary: filesystem has less than 100MB of free space
|
||||
|
||||
- alert: disk_free
|
||||
expr: predict_linear(collectd_df_df_complex{type="free"}[1h], 8*3600) < 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warn
|
||||
annotations:
|
||||
summary: filesystem will be full within 8h
|
||||
|
||||
- alert: load1
|
||||
expr: avg_over_time(node_load1[15m]) > 50
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warn
|
||||
annotations:
|
||||
summary: instance has high load avg
|
||||
|
||||
- alert: systemd_unit_failed
|
||||
expr: increase(node_systemd_unit_state{state="failed"}[7d]) >= 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: error
|
||||
annotations:
|
||||
summary: service fails
|
||||
|
||||
- name: network
|
||||
rules:
|
||||
- alert: load1
|
||||
expr: avg_over_time(collectd_load_0[15m]) > 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warn
|
||||
annotations:
|
||||
summary: network device has high load avg
|
||||
|
||||
- alert: memory_free
|
||||
expr: collectd_memory{memory="free"} < 4 * 1024^2
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warn
|
||||
annotations:
|
||||
summary: memory full
|
||||
|
||||
- alert: throughput0
|
||||
expr: increase(collectd_interface_if_octets_0_total[10m]) > 10 * 3600 * 1024^2
|
||||
for: 2h
|
||||
labels:
|
||||
severity: warn
|
||||
annotations:
|
||||
summary: sustained throughput
|
||||
|
||||
- alert: throughput1
|
||||
expr: increase(collectd_interface_if_octets_1_total[10m]) > 10 * 3600 * 1024^2
|
||||
for: 2h
|
||||
labels:
|
||||
severity: warn
|
||||
annotations:
|
||||
summary: sustained throughput
|
||||
''
|
||||
];
|
||||
enableReload = true;
|
||||
ruleFiles = [ ./rules.yaml ];
|
||||
|
||||
scrapeConfigs = [{
|
||||
# TODO: authorization?
|
||||
|
|
|
@ -0,0 +1,84 @@
|
|||
groups:
|
||||
- name: default
|
||||
rules:
|
||||
- alert: oom_kills
|
||||
expr: round(increase(node_vmstat_oom_kill[7d])) >= 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: error
|
||||
annotations:
|
||||
summary: service gets oom killed
|
||||
|
||||
- alert: disk_almost_full
|
||||
expr: collectd_df_df_complex{type="free"} < 1024^3
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warn
|
||||
annotations:
|
||||
summary: filesystem has less than 1GB of free space
|
||||
|
||||
- alert: disk_full
|
||||
expr: collectd_df_df_complex{type="free"} < 100 * 1024^2
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warn
|
||||
annotations:
|
||||
summary: filesystem has less than 100MB of free space
|
||||
|
||||
- alert: disk_free
|
||||
expr: predict_linear(collectd_df_df_complex{type="free"}[1h], 8*3600) < 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warn
|
||||
annotations:
|
||||
summary: filesystem will be full within 8h
|
||||
|
||||
- alert: load1
|
||||
expr: avg_over_time(node_load1[15m]) > 50
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warn
|
||||
annotations:
|
||||
summary: instance has high load avg
|
||||
|
||||
- alert: systemd_unit_failed
|
||||
expr: increase(node_systemd_unit_state{state="failed"}[7d]) >= 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: error
|
||||
annotations:
|
||||
summary: service fails
|
||||
|
||||
- name: network
|
||||
rules:
|
||||
- alert: load1
|
||||
expr: avg_over_time(collectd_load_0[15m]) > 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warn
|
||||
annotations:
|
||||
summary: network device has high load avg
|
||||
|
||||
- alert: memory_free
|
||||
expr: collectd_memory{memory="free"} < 4 * 1024^2
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warn
|
||||
annotations:
|
||||
summary: memory full
|
||||
|
||||
- alert: throughput0
|
||||
expr: increase(collectd_interface_if_octets_0_total[10m]) > 10 * 3600 * 1024^2
|
||||
for: 2h
|
||||
labels:
|
||||
severity: warn
|
||||
annotations:
|
||||
summary: sustained throughput
|
||||
|
||||
- alert: throughput1
|
||||
expr: increase(collectd_interface_if_octets_1_total[10m]) > 10 * 3600 * 1024^2
|
||||
for: 2h
|
||||
labels:
|
||||
severity: warn
|
||||
annotations:
|
||||
summary: sustained throughput
|
Loading…
Reference in New Issue