Commit e7fb878ba284
Changed files (1)
systems
sakhalin
systems/sakhalin/prometheus-alerts.nix
@@ -437,5 +437,59 @@
# Placeholder for tarsnap, restic, rsync backup monitoring
];
}
+
+ # Rule group: alerts on the systemd state of the core services on this host.
+ # All rules read node_exporter's node_systemd_unit_state metric, a 0/1 gauge
+ # exposing one series per (unit name, state) pair.
+ (let
+   # Units covered by every rule in this group; kept in one place so the three
+   # alerts cannot drift apart. "[.]" matches a literal dot (a bare "." would
+   # match any character) without needing backslash escapes.
+   monitoredUnits = "(prosody|jellyfin|audiobookshelf|navidrome|lidarr|sonarr|radarr|prowlarr|bazarr|traefik|grafana|prometheus|alertmanager|n8n|paperless|postgresql)[.]service";
+
+   # PromQL instant-vector selector for the monitored units in the given
+   # systemd active state ("failed", "inactive", "activating", ...).
+   unitInState = state:
+     ''node_systemd_unit_state{name=~"${monitoredUnits}",state="${state}"}'';
+ in {
+   name = "systemd_alerts";
+   interval = "30s";
+   rules = [
+     # Critical: a monitored unit has been in the "failed" state for 1 minute.
+     {
+       alert = "SystemdServiceFailed";
+       expr = ''${unitInState "failed"} == 1'';
+       for = "1m";
+       labels = {
+         severity = "critical";
+       };
+       annotations = {
+         summary = "Systemd service failed on {{ $labels.instance }}";
+         description = "Service {{ $labels.name }} on {{ $labels.instance }} is in failed state - check logs with: journalctl -u {{ $labels.name }}";
+       };
+     }
+
+     # Warning: a monitored unit is "inactive" (stopped without failing) for
+     # 5 minutes although it is expected to be running.
+     {
+       alert = "SystemdServiceInactive";
+       expr = ''${unitInState "inactive"} == 1'';
+       for = "5m";
+       labels = {
+         severity = "warning";
+       };
+       annotations = {
+         summary = "Systemd service inactive on {{ $labels.instance }}";
+         description = "Service {{ $labels.name }} on {{ $labels.instance }} has been inactive for 5 minutes - may need restart";
+       };
+     }
+
+     # Warning: a monitored unit keeps flipping in and out of "activating"
+     # (potential crash loop).
+     #
+     # The series is a gauge, so changes() is used rather than rate(): rate()
+     # is meant for counters and, on a 0/1 gauge, would need dozens of flips
+     # inside the window to exceed a per-second threshold such as 0.1.
+     # More than 2 flips within 5 minutes, sustained for 3 minutes, fires.
+     # NOTE(review): if node_exporter runs with
+     # --collector.systemd.enable-restarts-metrics, an expression based on
+     # increase(node_systemd_service_restart_total[...]) would be a more
+     # direct signal - confirm the exporter flags before switching.
+     {
+       alert = "SystemdServiceRestartingFrequently";
+       expr = ''changes(${unitInState "activating"}[5m]) > 2'';
+       for = "3m";
+       labels = {
+         severity = "warning";
+       };
+       annotations = {
+         summary = "Systemd service restarting frequently on {{ $labels.instance }}";
+         description = "Service {{ $labels.name }} on {{ $labels.instance }} is restarting frequently - possible crash loop";
+       };
+     }
+   ];
+ })
];
}