main
1# Prometheus alert rules for homelab monitoring
2# Based on: ~/desktop/org/notes/*prometheus*.org
3{
4 groups = [
{
  # Host-level alerts: disk, CPU, memory and exporter reachability.
  name = "node_alerts";
  interval = "30s";
  rules = [
    # NOTE: the disk/CPU/memory expressions below already yield a 0-100
    # percentage. Annotations therefore format $value with printf and a
    # literal "%"; humanizePercentage expects a 0-1 ratio and would multiply
    # the number by 100 a second time (e.g. 12.3 -> "1230%").

    # Disk space warnings (pseudo filesystems are excluded by fstype)
    {
      alert = "DiskSpaceLow";
      expr = ''
        (node_filesystem_avail_bytes{fstype!="tmpfs",fstype!="ramfs",fstype!="squashfs"} / node_filesystem_size_bytes{fstype!="tmpfs",fstype!="ramfs",fstype!="squashfs"}) * 100 < 15
      '';
      for = "5m";
      labels = {
        severity = "warning";
      };
      annotations = {
        summary = "Disk space low on {{ $labels.instance }}";
        description = "Filesystem {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 15% space remaining ({{ printf \"%.1f\" $value }}%)";
      };
    }
    {
      alert = "DiskSpaceCritical";
      expr = ''
        (node_filesystem_avail_bytes{fstype!="tmpfs",fstype!="ramfs",fstype!="squashfs"} / node_filesystem_size_bytes{fstype!="tmpfs",fstype!="ramfs",fstype!="squashfs"}) * 100 < 5
      '';
      for = "2m";
      labels = {
        severity = "critical";
      };
      annotations = {
        summary = "Disk space critical on {{ $labels.instance }}";
        description = "Filesystem {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 5% space remaining ({{ printf \"%.1f\" $value }}%)";
      };
    }

    # High CPU usage: 100 minus the per-instance average idle percentage
    {
      alert = "HighCPUUsage";
      expr = ''
        100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
      '';
      for = "10m";
      labels = {
        severity = "warning";
      };
      annotations = {
        summary = "High CPU usage on {{ $labels.instance }}";
        description = "CPU usage on {{ $labels.instance }} is above 80% (current: {{ printf \"%.1f\" $value }}%)";
      };
    }

    # High memory usage: share of MemTotal that is not MemAvailable
    {
      alert = "HighMemoryUsage";
      expr = ''
        (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
      '';
      for = "5m";
      labels = {
        severity = "warning";
      };
      annotations = {
        summary = "High memory usage on {{ $labels.instance }}";
        description = "Memory usage on {{ $labels.instance }} is above 90% (current: {{ printf \"%.1f\" $value }}%)";
      };
    }

    # Node down: the "node" scrape job reports the target as unreachable
    {
      alert = "NodeDown";
      expr = "up{job=\"node\"} == 0";
      for = "2m";
      labels = {
        severity = "critical";
      };
      annotations = {
        summary = "Node exporter down on {{ $labels.instance }}";
        description = "Node exporter on {{ $labels.instance }} has been down for more than 2 minutes";
      };
    }
  ];
}
86
{
  # Reachability alerts for service-level scrape targets.
  name = "service_alerts";
  interval = "30s";
  rules = [
    # Any scrape job other than "node" whose target is down
    {
      alert = "ServiceExporterDown";
      for = "5m";
      labels.severity = "warning";
      expr = ''up{job!="node"} == 0'';
      annotations.summary = "Service exporter down: {{ $labels.job }}";
      annotations.description = "Service exporter {{ $labels.job }} on {{ $labels.instance }} has been down for more than 5 minutes";
    }

    # PostgreSQL: the exporter reports pg_up as 0
    {
      alert = "PostgreSQLDown";
      for = "2m";
      labels.severity = "critical";
      expr = "pg_up == 0";
      annotations.summary = "PostgreSQL down on {{ $labels.instance }}";
      annotations.description = "PostgreSQL database on {{ $labels.instance }} has been unreachable for more than 2 minutes";
    }

    # Traefik: the "traefik" scrape job is down
    {
      alert = "TraefikDown";
      for = "2m";
      labels.severity = "critical";
      expr = ''up{job="traefik"} == 0'';
      annotations.summary = "Traefik reverse proxy down";
      annotations.description = "Traefik on rhea.sbr.pm has been down for more than 2 minutes - all web services may be inaccessible";
    }
  ];
}
134
{
  # Alerts for the BIND DNS scrape job.
  name = "dns_alerts";
  interval = "30s";
  rules = [
    # The "bind" scrape target is down
    {
      alert = "DNSServiceDown";
      for = "2m";
      labels.severity = "critical";
      expr = ''up{job="bind"} == 0'';
      annotations.summary = "DNS service down on {{ $labels.instance }}";
      annotations.description = "BIND DNS service on {{ $labels.instance }} has been unreachable for more than 2 minutes - DNS resolution may fail";
    }

    # Per-second query error rate over 5 minutes exceeds 10
    {
      alert = "HighDNSQueryFailureRate";
      for = "5m";
      labels.severity = "warning";
      expr = "rate(bind_query_errors_total[5m]) > 10";
      annotations.summary = "High DNS query failure rate on {{ $labels.instance }}";
      annotations.description = "DNS query failure rate on {{ $labels.instance }} is above 10 queries/sec";
    }
  ];
}
168
{
  # Traefik health beyond plain reachability: certificates and 5xx rate.
  name = "traefik_alerts";
  interval = "30s";
  rules = [
    # Certificate expiry: (not_after - now) converted from seconds to days
    {
      alert = "TraefikCertificateExpiringSoon";
      for = "1h";
      labels.severity = "warning";
      expr = "(traefik_tls_certs_not_after - time()) / 86400 < 7";
      annotations.summary = "Traefik TLS certificate expiring soon";
      annotations.description = "TLS certificate for {{ $labels.cn }} will expire in less than 7 days";
    }

    # 5xx responses: per-second rate over 5 minutes exceeds 5
    {
      alert = "TraefikHighErrorRate";
      for = "5m";
      labels.severity = "warning";
      expr = ''rate(traefik_service_requests_total{code=~"5.."}[5m]) > 5'';
      annotations.summary = "High 5xx error rate on Traefik";
      annotations.description = "Service {{ $labels.service }} is returning 5xx errors at {{ $value }} req/sec";
    }
  ];
}
202
{
  # Alerts for the Caddy reverse proxy: reachability and error rates.
  # NOTE(review): several selectors below filter caddy_http_requests_total on
  # `code` and `host` labels - confirm the deployed Caddy actually exports
  # those labels on this metric, otherwise the rules match nothing.
  name = "caddy_alerts";
  interval = "30s";
  rules = [
    # The "caddy" scrape target is down
    {
      alert = "CaddyDown";
      for = "2m";
      labels.severity = "critical";
      expr = ''up{job="caddy"} == 0'';
      annotations.summary = "Caddy web server down on carthage";
      annotations.description = "Caddy reverse proxy on carthage has been down for more than 2 minutes - external access may be broken";
    }

    # 4xx responses above 10/sec (potential scanning/probing)
    {
      alert = "CaddyHigh4xxRate";
      for = "5m";
      labels.severity = "warning";
      expr = ''rate(caddy_http_requests_total{code=~"4.."}[5m]) > 10'';
      annotations.summary = "High 4xx error rate on Caddy";
      annotations.description = "Host {{ $labels.host }} is seeing {{ $value | humanize }} 4xx errors/sec - potential scanning or misconfiguration";
    }

    # 5xx responses above 1/sec (server errors)
    {
      alert = "CaddyHigh5xxRate";
      for = "3m";
      labels.severity = "critical";
      expr = ''rate(caddy_http_requests_total{code=~"5.."}[5m]) > 1'';
      annotations.summary = "High 5xx error rate on Caddy";
      annotations.description = "Host {{ $labels.host }} is returning {{ $value | humanize }} 5xx errors/sec - backend services may be failing";
    }

    # Request errors above 5/sec
    {
      alert = "CaddyHighErrorRate";
      for = "5m";
      labels.severity = "warning";
      expr = "rate(caddy_http_request_errors_total[5m]) > 5";
      annotations.summary = "High request error rate on Caddy";
      annotations.description = "Caddy is encountering {{ $value | humanize }} request errors/sec on {{ $labels.handler }}";
    }

    # 401/403 responses above 5/sec on the listed media hosts
    {
      alert = "CaddySuspiciousAuthActivity";
      for = "3m";
      labels.severity = "warning";
      expr = ''
        rate(caddy_http_requests_total{code=~"40[13]",host=~"immich.*|jellyfin.*|navidrome.*|audiobookshelf.*"}[5m]) > 5
      '';
      annotations.summary = "Suspicious authentication activity detected";
      annotations.description = "High rate of 401/403 errors on {{ $labels.host }} ({{ $value | humanize }} req/sec) - potential brute force attempt";
    }
  ];
}
280
{
  # Alerts for the *arr media stack, scraped through the "exportarr" job.
  name = "media_services_alerts";
  interval = "1m";
  rules = [
    # Exportarr target down (sonarr, radarr, lidarr, prowlarr, bazarr)
    {
      alert = "MediaServiceDown";
      for = "5m";
      labels.severity = "warning";
      expr = ''up{job="exportarr"} == 0'';
      annotations.summary = "Media service down on {{ $labels.instance }}";
      annotations.description = "Exportarr exporter for {{ $labels.instance }} has been unreachable for 5 minutes - check *arr services";
    }

    # Sonarr or Radarr queue holds more than 50 items for 30 minutes
    {
      alert = "MediaQueueBackingUp";
      for = "30m";
      labels.severity = "warning";
      expr = "sonarr_queue_total > 50 or radarr_queue_total > 50";
      annotations.summary = "Media download queue backing up";
      annotations.description = "Download queue has {{ $value }} items - may indicate stuck downloads";
    }
  ];
}
314
315 # MQTT alerts disabled - exporter package broken in nixpkgs
316 # {
317 # name = "mqtt_alerts";
318 # interval = "30s";
319 # rules = [
320 # # Mosquitto MQTT broker down
321 # {
322 # alert = "MQTTBrokerDown";
323 # expr = "up{job=\"mosquitto\"} == 0";
324 # for = "2m";
325 # labels = {
326 # severity = "critical";
327 # };
328 # annotations = {
329 # summary = "MQTT broker down on demeter";
330 # description = "Mosquitto MQTT broker has been unreachable for more than 2 minutes - home automation may be affected";
331 # };
332 # }
333
334 # # MQTT high connection rate (potential issue)
335 # {
336 # alert = "MQTTHighConnectionRate";
337 # expr = "rate(mosquitto_connect_received[5m]) > 10";
338 # for = "5m";
339 # labels = {
340 # severity = "warning";
341 # };
342 # annotations = {
343 # summary = "High MQTT connection rate";
344 # description = "MQTT broker is seeing {{ $value }} connections/sec - may indicate reconnection loops";
345 # };
346 # }
347 # ];
348 # }
349
{
  # Reachability alert for the "homeassistant" scrape job.
  name = "homeassistant_alerts";
  interval = "1m";
  rules = [
    # Target has been down for 5 minutes
    {
      alert = "HomeAssistantDown";
      for = "5m";
      labels.severity = "warning";
      expr = ''up{job="homeassistant"} == 0'';
      annotations.summary = "Home Assistant unreachable";
      annotations.description = "Home Assistant on hass.home has been unreachable for more than 5 minutes";
    }
  ];
}
369
{
  # Placeholder group: it is declared with an empty rule list.
  interval = "1h";
  name = "backup_alerts";
  rules = [
    # No backup rules yet - to be added once backup monitoring is implemented
    # (planned coverage: tarsnap, restic, rsync).
  ];
}
378
(
  let
    # Regex of the systemd units watched by every rule in this group. Defined
    # once so the three selectors below cannot drift apart.
    watchedUnits = "(prosody|jellyfin|audiobookshelf|navidrome|lidarr|sonarr|radarr|prowlarr|bazarr|traefik|grafana|prometheus|alertmanager|n8n|postgresql).service";
  in
  {
    # State alerts for the watched units, based on node_systemd_unit_state.
    name = "systemd_alerts";
    interval = "30s";
    rules = [
      # Critical systemd service failures
      {
        alert = "SystemdServiceFailed";
        expr = ''
          node_systemd_unit_state{name=~"${watchedUnits}",state="failed"} == 1
        '';
        for = "1m";
        labels = {
          severity = "critical";
        };
        annotations = {
          summary = "Systemd service failed on {{ $labels.instance }}";
          description = "Service {{ $labels.name }} on {{ $labels.instance }} is in failed state - check logs with: journalctl -u {{ $labels.name }}";
        };
      }

      # Service unexpectedly inactive (should be running but isn't)
      {
        alert = "SystemdServiceInactive";
        expr = ''
          node_systemd_unit_state{name=~"${watchedUnits}",state="inactive"} == 1
        '';
        for = "5m";
        labels = {
          severity = "warning";
        };
        annotations = {
          summary = "Systemd service inactive on {{ $labels.instance }}";
          description = "Service {{ $labels.name }} on {{ $labels.instance }} has been inactive for 5 minutes - may need restart";
        };
      }

      # Service restarting frequently (potential crash loop)
      # node_systemd_unit_state is a 0/1 indicator per state, not a counter,
      # so rate() is the wrong function for it: a per-second rate above 0.1
      # would need ~30 increases in 5 minutes, which a sampled 0/1 series
      # cannot produce. changes() counts value flips instead; one observed
      # pass through "activating" is 2 flips (0->1->0), so "> 2" means the
      # unit was seen activating more than once inside the window.
      # NOTE(review): very short activations can fall between scrapes; if the
      # exporter's restart counter is enabled, that would be more reliable.
      {
        alert = "SystemdServiceRestartingFrequently";
        expr = ''
          changes(node_systemd_unit_state{name=~"${watchedUnits}",state="activating"}[5m]) > 2
        '';
        for = "3m";
        labels = {
          severity = "warning";
        };
        annotations = {
          summary = "Systemd service restarting frequently on {{ $labels.instance }}";
          description = "Service {{ $labels.name }} on {{ $labels.instance }} is restarting frequently - possible crash loop";
        };
      }
    ];
  }
)
432 ];
433}