---
# Alerting rules, organised into two rule groups:
#   * default - rules built on node_* and collectd_df_* metrics
#   * network - rules built on collectd load/memory/interface metrics
#
# Every rule carries a `severity` label (`warn` or `error`) and a one-line
# `summary` annotation. `for` is how long the expression must keep matching
# before the alert fires.
groups:
  - name: default
    rules:
      # Fires when the rounded 7-day increase of the OOM-kill counter is at
      # least 1, i.e. an OOM kill was recorded within the last 7 days.
      - alert: oom_kills
        expr: round(increase(node_vmstat_oom_kill[7d])) >= 1
        for: 10m
        labels:
          severity: error
        annotations:
          summary: service gets oom killed

      # Free space below 1024^3 (the summary calls this 1GB).
      - alert: disk_almost_full
        expr: collectd_df_df_complex{type="free"} < 1024^3
        for: 10m
        labels:
          severity: warn
        annotations:
          summary: filesystem has less than 1GB of free space

      # Free space below 100 * 1024^2 (the summary calls this 100MB).
      # NOTE(review): same severity as disk_almost_full even though the
      # threshold is stricter - confirm whether this should escalate.
      - alert: disk_full
        expr: collectd_df_df_complex{type="free"} < 100 * 1024^2
        for: 10m
        labels:
          severity: warn
        annotations:
          summary: filesystem has less than 100MB of free space

      # Linear extrapolation of the last 1h of free space, 8*3600 seconds
      # (8h) ahead; fires when the predicted value drops below zero.
      - alert: disk_free
        expr: predict_linear(collectd_df_df_complex{type="free"}[1h], 8*3600) < 0
        for: 10m
        labels:
          severity: warn
        annotations:
          summary: filesystem will be full within 8h

      # 15-minute average of node_load1 above 50.
      # NOTE(review): the `network` group below also defines an alert named
      # `load1`; the two share an alertname but use different metrics and
      # thresholds.
      - alert: load1
        expr: avg_over_time(node_load1[15m]) > 50
        for: 10m
        labels:
          severity: warn
        annotations:
          summary: instance has high load avg

      # Fires when the `failed` state series of a systemd unit increased by
      # at least 1 over the last 7 days.
      # NOTE(review): node_systemd_unit_state looks like a 0/1 gauge, and
      # increase() is intended for counters - presumably used here to catch
      # transitions into `failed`; a unit that stays failed for the whole
      # window would show no increase. Confirm this is intended.
      - alert: systemd_unit_failed
        expr: increase(node_systemd_unit_state{state="failed"}[7d]) >= 1
        for: 10m
        labels:
          severity: error
        annotations:
          summary: service fails

  - name: network
    rules:
      # 15-minute average of collectd_load_0 above 1.
      # NOTE(review): presumably collectd_load_0 is the short-term load
      # value - verify against the collectd exporter.
      - alert: load1
        expr: avg_over_time(collectd_load_0[15m]) > 1
        for: 10m
        labels:
          severity: warn
        annotations:
          summary: network device has high load avg

      # Free memory below 4 * 1024^2.
      # NOTE(review): assumes the metric is reported in bytes (4 MiB) -
      # TODO confirm.
      - alert: memory_free
        expr: collectd_memory{memory="free"} < 4 * 1024^2
        for: 10m
        labels:
          severity: warn
        annotations:
          summary: memory full

      # Increase of the interface octet counter over 10 minutes exceeds
      # 10 * 3600 * 1024^2, and keeps doing so for 2h.
      # NOTE(review): the threshold multiplies by 3600 while the window is
      # 10m - confirm the intended rate.
      - alert: throughput0
        expr: increase(collectd_interface_if_octets_0_total[10m]) > 10 * 3600 * 1024^2
        for: 2h
        labels:
          severity: warn
        annotations:
          summary: sustained throughput

      # Same condition as throughput0, applied to the `_1_` octet counter.
      # NOTE(review): presumably _0_/_1_ are the two traffic directions
      # (rx/tx) - verify against the collectd exporter.
      - alert: throughput1
        expr: increase(collectd_interface_if_octets_1_total[10m]) > 10 * 3600 * 1024^2
        for: 2h
        labels:
          severity: warn
        annotations:
          summary: sustained throughput