Commit e7fb878ba284

Vincent Demeester <vincent@sbr.pm>
2026-01-15 16:56:34
feat(prometheus): add systemd service failure alerting
Add comprehensive systemd service monitoring alerts to detect and notify when critical homelab services fail, become inactive, or enter crash loops. Monitors prosody, jellyfin, media services (*arr stack), traefik, grafana, prometheus, n8n, paperless, and postgresql. Alerts are delivered via Alertmanager to ntfy for immediate notification. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent db5196d
Changed files (1)
systems
systems/sakhalin/prometheus-alerts.nix
@@ -437,5 +437,59 @@
         # Placeholder for tarsnap, restic, rsync backup monitoring
       ];
     }
+
+    {
+      name = "systemd_alerts";
+      interval = "30s";
+      rules = [
+        # Critical: a monitored unit sits in systemd's "failed" state for 1m ("\\." keeps the dot before "service" literal)
+        {
+          alert = "SystemdServiceFailed";
+          expr = ''
+            node_systemd_unit_state{name=~"(prosody|jellyfin|audiobookshelf|navidrome|lidarr|sonarr|radarr|prowlarr|bazarr|traefik|grafana|prometheus|alertmanager|n8n|paperless|postgresql)\\.service",state="failed"} == 1
+          '';
+          for = "1m";
+          labels = {
+            severity = "critical";
+          };
+          annotations = {
+            summary = "Systemd service failed on {{ $labels.instance }}";
+            description = "Service {{ $labels.name }} on {{ $labels.instance }} is in failed state - check logs with: journalctl -u {{ $labels.name }}";
+          };
+        }
+
+        # Warning: a monitored unit that should be running reports "inactive" for 5m ("\\." keeps the dot before "service" literal)
+        {
+          alert = "SystemdServiceInactive";
+          expr = ''
+            node_systemd_unit_state{name=~"(prosody|jellyfin|audiobookshelf|navidrome|lidarr|sonarr|radarr|prowlarr|bazarr|traefik|grafana|prometheus|alertmanager|n8n|paperless|postgresql)\\.service",state="inactive"} == 1
+          '';
+          for = "5m";
+          labels = {
+            severity = "warning";
+          };
+          annotations = {
+            summary = "Systemd service inactive on {{ $labels.instance }}";
+            description = "Service {{ $labels.name }} on {{ $labels.instance }} has been inactive for 5 minutes - may need restart";
+          };
+        }
+
+        # Crash loop: count flips of the 0/1 "activating" gauge with changes() (rate() is counter-only); >4 flips in 15m is about 3 restarts
+        {
+          alert = "SystemdServiceRestartingFrequently";
+          expr = ''
+            changes(node_systemd_unit_state{name=~"(prosody|jellyfin|audiobookshelf|navidrome|lidarr|sonarr|radarr|prowlarr|bazarr|traefik|grafana|prometheus|alertmanager|n8n|paperless|postgresql)\\.service",state="activating"}[15m]) > 4
+          '';
+          for = "3m";
+          labels = {
+            severity = "warning";
+          };
+          annotations = {
+            summary = "Systemd service restarting frequently on {{ $labels.instance }}";
+            description = "Service {{ $labels.name }} on {{ $labels.instance }} is restarting frequently - possible crash loop";
+          };
+        }
+      ];
+    }
   ];
 }