Commit 942b578ff5d8

Vincent Demeester <vincent@sbr.pm>
2025-12-19 00:12:22
feat(monitoring): add prometheus alerting with ntfy notifications
- Enable proactive homelab monitoring across 8 machines and critical services - Route alerts to ntfy for real-time push notifications on infrastructure issues - Provide comprehensive coverage for disk, CPU, memory, services, and applications Co-Authored-By: Claude <noreply@anthropic.com> Signed-off-by: Vincent Demeester <vincent@sbr.pm>
1 parent 81b189d
secrets/sakhalin/homeassistant-prometheus-token.age
@@ -0,0 +1,11 @@
+age-encryption.org/v1
+-> piv-p256 ItIHHA AjI4FdQ2qGRojZNYNa2OOhTSSLo9QffAKuL4FVeq3O/k
+7hAbperkLxrReWsc2c1X2vMcOUw5f6JFUkGskG7vxo0
+-> piv-p256 ViCCtQ AzQheA+bJjaZ+8OoOs9MpDQk2JooCxSTA1shyj5eOoS3
+VkKHUF0ASkEsQjiWxoILONSc9t1bU31zX4vMvxOGSaI
+-> ssh-ed25519 /TxA1A WyHZRhO9dhXffK7qLxyjAv4SNf+MwOhJr0qRV9X0UyA
+eTygzeIvlYWX+7+igebFVUFzQQwodtNdp7O4GjbRcPY
+--- wvm/Rdfufhqy5UnnOstKivAws8Hu9Ia7KaHCSoFxOEc
+�e�0��b�B�~�1߻��W��U;{)�2�I����'�����Z��1
+�P^ic�N��YM�� CiH2$�M���0BV���ì�G4g�&~�)�y���?�Ƀ����L|q�ߞ��3�-����ۗ�Y�~B�mi/y��@�8]��>?�S>��SW�`�3n;�6�r;�>(5���z,�pd{��pH�|��D��O��y��
+#�c�3�
\ No newline at end of file
secrets/sakhalin/ntfy-token.age
@@ -0,0 +1,9 @@
+age-encryption.org/v1
+-> piv-p256 ItIHHA AiCltJihpOjJucJNkcx/wCMGJZbjtzFPju9wysh+1r6P
+tlg4UXOwtBOn/qiX2XhAZN9P4bdj78RQbpDcZQBFwUU
+-> piv-p256 ViCCtQ A7EEw2gX4F9vvjFeBEFg9+OIfn4ieKJ+P0UEj7z4Fcno
+OhngWIxVB636PU7cl5LnV1B2DWXLdYOp5WYqAKb1Mns
+-> ssh-ed25519 /TxA1A ZNgPs6M0Mri8Dpo+K1oBabjW8e6a3XTe70zUWbS+aVQ
+6zGa3RxkuWxcLzw5t0jHkpJsr+moM79k/CMWWyo80Z0
+--- OqV6Rjgv22rpAdMUj2ihEoAcq2xbjXEjd05hSguUtuM
+*�vE�����p�Vh�C>>�B����Ǩ�����[\d�C�3PL�Z" �"ԛ�����[��
\ No newline at end of file
systems/demeter/extra.nix
@@ -37,16 +37,18 @@
       ];
     };
 
-    prometheus.exporters.mqtt = {
-      enable = true;
-      port = 9234;
-      mqttAddress = "127.0.0.1";
-      mqttPort = 1883;
-      mqttTopic = "#"; # Subscribe to all topics
-      mqttUsername = "homeassistant";
-      environmentFile = config.age.secrets."mosquitto-homeassistant-password".path;
-      logLevel = "INFO";
-    };
+    # MQTT exporter disabled due to broken package in nixpkgs
+    # ImportError: cannot import name 'main' from 'mqtt_exporter.main'
+    # prometheus.exporters.mqtt = {
+    #   enable = true;
+    #   port = 9234;
+    #   mqttAddress = "127.0.0.1";
+    #   mqttPort = 1883;
+    #   mqttTopic = "#"; # Subscribe to all topics
+    #   mqttUsername = "homeassistant";
+    #   environmentFile = config.age.secrets."mosquitto-homeassistant-password".path;
+    #   logLevel = "INFO";
+    # };
 
     wireguard = {
       enable = true;
systems/rhea/extra.nix
@@ -688,6 +688,13 @@ in
       8883 # MQTTS
       8080 # Traefik metrics
       9000 # Node exporter
+      9187 # PostgreSQL exporter
+      # Exportarr exporters
+      9707 # Sonarr
+      9708 # Radarr
+      9709 # Lidarr
+      9710 # Prowlarr
+      9712 # Bazarr
       # NFS ports
       111 # rpcbind
       2049 # NFS daemon
systems/sakhalin/extra.nix
@@ -9,7 +9,15 @@
 }:
 let
   # Get machines that should be monitored
-  nodeExporterMachines = monitoring.machinesWithNodeExporter globals.machines;
+  # Hosts dropped from scraping even though they qualify for a node exporter:
+  # kyushu (laptop), shikoku (temporarily stopped), nagoya (not yet configured)
+  nodeExporterMachines =
+    builtins.removeAttrs (monitoring.machinesWithNodeExporter globals.machines)
+      [
+        "kyushu"
+        "shikoku"
+        "nagoya"
+      ];
 
   # Generate node exporter targets
   nodeExporterTargets = monitoring.mkPrometheusTargets {
@@ -75,6 +83,17 @@ in
     mode = "400";
     owner = "grafana";
   };
+  # Access token for ntfy. Readable by root user/group only: it is consumed by
+  # the root-run ExecStartPre script of the alertmanager-ntfy service, which
+  # copies it into that service's generated config file.
+  age.secrets."ntfy-token" = {
+    file = ../../secrets/sakhalin/ntfy-token.age;
+    mode = "440";
+    owner = "root";
+    group = "root";
+  };
+  # Token referenced as bearer_token_file by the "homeassistant" Prometheus
+  # scrape job, hence owned by (and only readable by) the prometheus user.
+  age.secrets."homeassistant-prometheus-token" = {
+    file = ../../secrets/sakhalin/homeassistant-prometheus-token.age;
+    mode = "400";
+    owner = "prometheus";
+  };
 
   # TODO make it an option ? (otherwise I'll add it for all)
   users.users.vincent.linger = true;
@@ -154,6 +173,24 @@ in
     prometheus = {
       enable = true;
       port = 9001;
+      # agenix secrets (e.g. the bearer_token_file below) do not exist inside
+      # the build sandbox, so a full promtool check would fail. Keep the
+      # syntax check rather than disabling validation entirely.
+      checkConfig = "syntax-only";
+
+      # Alert rules: ./prometheus-alerts.nix evaluates to an attrset that is
+      # serialised with builtins.toJSON and written to a store file named
+      # prometheus-alerts.yml, which is then handed to Prometheus as a rule file.
+      ruleFiles = [
+        (pkgs.writeText "prometheus-alerts.yml" (builtins.toJSON (import ./prometheus-alerts.nix)))
+      ];
+
+      # Alertmanager configuration: deliver fired alerts to the Alertmanager
+      # instance on localhost:9093.
+      alertmanagers = [
+        {
+          static_configs = [
+            {
+              targets = [ "localhost:9093" ];
+            }
+          ];
+        }
+      ];
+
       scrapeConfigs = [
         {
           job_name = "node";
@@ -203,25 +240,64 @@ in
             }
           ];
         }
-        {
-          job_name = "mosquitto";
-          static_configs = [
-            {
-              targets = [ "demeter.sbr.pm:9234" ];
-            }
-          ];
-        }
+        # Mosquitto MQTT exporter disabled - package broken in nixpkgs
+        # {
+        #   job_name = "mosquitto";
+        #   static_configs = [
+        #     {
+        #       targets = [ "demeter.sbr.pm:9234" ];
+        #     }
+        #   ];
+        # }
         {
           job_name = "homeassistant";
           static_configs = [
             {
-              targets = [ "home.sbr.pm:8123" ];
+              targets = [ "${builtins.head globals.machines.hass.net.ips}:8123" ];
             }
           ];
           metrics_path = "/api/prometheus";
+          bearer_token_file = config.age.secrets."homeassistant-prometheus-token".path;
         }
       ];
     };
+
+    # Alertmanager for routing alerts. Every alert goes to the single "ntfy"
+    # receiver, which forwards it as a webhook to the local alertmanager-ntfy
+    # bridge.
+    prometheus.alertmanager = {
+      enable = true;
+      port = 9093; # same port Prometheus is pointed at in `alertmanagers`
+      # NOTE(review): with a localhost URL, links generated back to
+      # Alertmanager will presumably only resolve on this host — confirm.
+      webExternalUrl = "http://localhost:9093";
+
+      configuration = {
+        global = {
+          resolve_timeout = "5m";
+        };
+
+        route = {
+          # One notification group per (alertname, instance) pair
+          group_by = [
+            "alertname"
+            "instance"
+          ];
+          group_wait = "30s"; # initial wait before the first notification of a new group
+          group_interval = "5m"; # wait before notifying about changes to an existing group
+          repeat_interval = "12h"; # re-send interval for alerts that are still firing
+          receiver = "ntfy";
+        };
+
+        receivers = [
+          {
+            name = "ntfy";
+            webhook_configs = [
+              {
+                url = "http://localhost:8081/hook"; # alertmanager-ntfy bridge
+                send_resolved = true; # also notify when an alert resolves
+              }
+            ];
+          }
+        ];
+      };
+    };
+
     tarsnap = {
       enable = true;
       archives = {
@@ -281,6 +357,57 @@ in
     '';
   };
 
+  # ntfy-alertmanager bridge - manual service configuration with token support.
+  # The bridge listens on 127.0.0.1:8081 and publishes to the "homelab" topic
+  # on https://ntfy.sbr.pm. Its config file is regenerated on every start so
+  # the ntfy token never has to live in the Nix store.
+  systemd.services.alertmanager-ntfy = {
+    description = "Alertmanager to ntfy bridge";
+    after = [ "network.target" ];
+    wantedBy = [ "multi-user.target" ];
+
+    serviceConfig = {
+      Type = "simple";
+      DynamicUser = true;
+      StateDirectory = "alertmanager-ntfy";
+      Restart = "on-failure";
+      RestartSec = "5s";
+      ExecStart = "${pkgs.alertmanager-ntfy}/bin/alertmanager-ntfy --configs /var/lib/alertmanager-ntfy/config.yml";
+      # Run config preparation as root (+ prefix) before starting the main process
+      ExecStartPre =
+        "+"
+        + pkgs.writeShellScript "prepare-alertmanager-ntfy-config" ''
+                  # Fail the unit if the secret cannot be read instead of
+                  # starting the bridge with an empty token.
+                  set -eu
+
+                  # Read the token from the secret file
+                  TOKEN=$(cat ${config.age.secrets."ntfy-token".path})
+
+                  # Generate config with the actual token. The heredoc delimiter
+                  # is deliberately unquoted so the shell expands $TOKEN in place;
+                  # this avoids a sed substitution that would break on tokens
+                  # containing "/", "&" or "\". Do not add other "$" or backtick
+                  # characters to the template without escaping them.
+                  cat > /var/lib/alertmanager-ntfy/config.yml <<EOF
+          http:
+            addr: 127.0.0.1:8081
+
+          ntfy:
+            baseurl: https://ntfy.sbr.pm
+            auth:
+              token: $TOKEN
+            notification:
+              topic: homelab
+              priority: 'status == "firing" ? "urgent" : "default"'
+              tags:
+                - tag: rotating_light
+                  condition: 'status == "firing" && labels.severity == "critical"'
+                - tag: warning
+                  condition: 'status == "firing" && labels.severity == "warning"'
+                - tag: "+1"
+                  condition: 'status == "resolved"'
+              templates:
+                title: '{{ if eq .Status "resolved" }}✅ Resolved: {{ end }}{{ if eq .Status "firing" }}🔥 {{ end }}{{ index .Annotations "summary" }}'
+                description: '{{ index .Annotations "description" }}'
+          EOF
+                  # Make config readable by the dynamic user
+                  chmod 644 /var/lib/alertmanager-ntfy/config.yml
+        '';
+    };
+  };
+
   environment.systemPackages = with pkgs; [ yt-dlp ]; # -----------------------------------
   environment.etc."vrsync".text = ''
     /home/vincent/desktop/pictures/screenshots/ vincent@synodine.home:/volumeUSB2/usbshare/pictures/screenshots/
systems/sakhalin/prometheus-alerts.nix
@@ -0,0 +1,321 @@
+# Prometheus alert rules for homelab monitoring
+# Based on: ~/desktop/org/notes/*prometheus*.org
+{
+  groups = [
+    {
+      name = "node_alerts";
+      interval = "30s";
+      rules = [
+        # Disk space warnings
+        {
+          # Warn when a non-tmpfs/ramfs/squashfs filesystem has had less than
+          # 15% of its size available for 5 minutes.
+          alert = "DiskSpaceLow";
+          expr = ''
+            (node_filesystem_avail_bytes{fstype!="tmpfs",fstype!="ramfs",fstype!="squashfs"} / node_filesystem_size_bytes{fstype!="tmpfs",fstype!="ramfs",fstype!="squashfs"}) * 100 < 15
+          '';
+          for = "5m";
+          labels = {
+            severity = "warning";
+          };
+          annotations = {
+            summary = "Disk space low on {{ $labels.instance }}";
+            # $value is already a 0-100 percentage; humanizePercentage expects
+            # a 0-1 ratio and would render 14.2 as "1420%", so format directly.
+            description = "Filesystem {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 15% space remaining ({{ printf \"%.1f\" $value }}%)";
+          };
+        }
+        {
+          # Critical when a non-tmpfs/ramfs/squashfs filesystem has had less
+          # than 5% of its size available for 2 minutes.
+          alert = "DiskSpaceCritical";
+          expr = ''
+            (node_filesystem_avail_bytes{fstype!="tmpfs",fstype!="ramfs",fstype!="squashfs"} / node_filesystem_size_bytes{fstype!="tmpfs",fstype!="ramfs",fstype!="squashfs"}) * 100 < 5
+          '';
+          for = "2m";
+          labels = {
+            severity = "critical";
+          };
+          annotations = {
+            summary = "Disk space critical on {{ $labels.instance }}";
+            # $value is already a 0-100 percentage; humanizePercentage expects
+            # a 0-1 ratio and would render 4.2 as "420%", so format directly.
+            description = "Filesystem {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 5% space remaining ({{ printf \"%.1f\" $value }}%)";
+          };
+        }
+
+        # High CPU usage
+        {
+          # Warn when the per-instance non-idle CPU share (5m rate) has stayed
+          # above 80% for 10 minutes.
+          alert = "HighCPUUsage";
+          expr = ''
+            100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
+          '';
+          for = "10m";
+          labels = {
+            severity = "warning";
+          };
+          annotations = {
+            summary = "High CPU usage on {{ $labels.instance }}";
+            # $value is already a 0-100 percentage; humanizePercentage expects
+            # a 0-1 ratio and would render 85 as "8500%", so format directly.
+            description = "CPU usage on {{ $labels.instance }} is above 80% (current: {{ printf \"%.1f\" $value }}%)";
+          };
+        }
+
+        # High memory usage
+        {
+          # Warn when used memory (1 - MemAvailable/MemTotal) has stayed above
+          # 90% for 5 minutes.
+          alert = "HighMemoryUsage";
+          expr = ''
+            (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
+          '';
+          for = "5m";
+          labels = {
+            severity = "warning";
+          };
+          annotations = {
+            summary = "High memory usage on {{ $labels.instance }}";
+            # $value is already a 0-100 percentage; humanizePercentage expects
+            # a 0-1 ratio and would render 93 as "9300%", so format directly.
+            description = "Memory usage on {{ $labels.instance }} is above 90% (current: {{ printf \"%.1f\" $value }}%)";
+          };
+        }
+
+        # Node down
+        {
+          # Critical once a target of the "node" job has reported up == 0 for
+          # 2 minutes.
+          alert = "NodeDown";
+          expr = ''up{job="node"} == 0'';
+          for = "2m";
+          labels.severity = "critical";
+          annotations.summary = "Node exporter down on {{ $labels.instance }}";
+          annotations.description = "Node exporter on {{ $labels.instance }} has been down for more than 2 minutes";
+        }
+      ];
+    }
+
+    {
+      name = "service_alerts";
+      interval = "30s";
+      rules = [
+        # Service exporters down
+        {
+          # Catch-all for scrape jobs without a dedicated "down" alert. Jobs
+          # that already have one in this file (node, traefik, bind, caddy,
+          # exportarr, homeassistant) are excluded so a single outage does not
+          # notify twice under two different alert names.
+          alert = "ServiceExporterDown";
+          expr = ''up{job!~"node|traefik|bind|caddy|exportarr|homeassistant"} == 0'';
+          for = "5m";
+          labels = {
+            severity = "warning";
+          };
+          annotations = {
+            summary = "Service exporter down: {{ $labels.job }}";
+            description = "Service exporter {{ $labels.job }} on {{ $labels.instance }} has been down for more than 5 minutes";
+          };
+        }
+
+        # PostgreSQL down
+        {
+          # Critical once pg_up has been 0 for 2 minutes.
+          alert = "PostgreSQLDown";
+          expr = ''pg_up == 0'';
+          for = "2m";
+          labels.severity = "critical";
+          annotations.summary = "PostgreSQL down on {{ $labels.instance }}";
+          annotations.description = "PostgreSQL database on {{ $labels.instance }} has been unreachable for more than 2 minutes";
+        }
+
+        # Traefik down
+        {
+          # Critical once the "traefik" job has reported up == 0 for 2 minutes.
+          alert = "TraefikDown";
+          expr = ''up{job="traefik"} == 0'';
+          for = "2m";
+          labels.severity = "critical";
+          annotations.summary = "Traefik reverse proxy down";
+          annotations.description = "Traefik on rhea.sbr.pm has been down for more than 2 minutes - all web services may be inaccessible";
+        }
+      ];
+    }
+
+    {
+      name = "dns_alerts";
+      interval = "30s";
+      rules = [
+        # The "bind" job has reported up == 0 for 2 minutes
+        {
+          alert = "DNSServiceDown";
+          expr = ''up{job="bind"} == 0'';
+          for = "2m";
+          labels.severity = "critical";
+          annotations.summary = "DNS service down on {{ $labels.instance }}";
+          annotations.description = "BIND DNS service on {{ $labels.instance }} has been unreachable for more than 2 minutes - DNS resolution may fail";
+        }
+
+        # Query error rate (5m window) above 10/s, sustained for 5 minutes
+        {
+          alert = "HighDNSQueryFailureRate";
+          expr = ''rate(bind_query_errors_total[5m]) > 10'';
+          for = "5m";
+          labels.severity = "warning";
+          annotations.summary = "High DNS query failure rate on {{ $labels.instance }}";
+          annotations.description = "DNS query failure rate on {{ $labels.instance }} is above 10 queries/sec";
+        }
+      ];
+    }
+
+    {
+      name = "traefik_alerts";
+      interval = "30s";
+      rules = [
+        # Certificate not_after is less than 7 days (7 * 86400 s) away, held for 1h
+        {
+          alert = "TraefikCertificateExpiringSoon";
+          expr = ''(traefik_tls_certs_not_after - time()) / 86400 < 7'';
+          for = "1h";
+          labels.severity = "warning";
+          annotations.summary = "Traefik TLS certificate expiring soon";
+          annotations.description = "TLS certificate for {{ $labels.cn }} will expire in less than 7 days";
+        }
+
+        # 5xx response rate (5m window) above 5/s, sustained for 5 minutes
+        {
+          alert = "TraefikHighErrorRate";
+          expr = ''rate(traefik_service_requests_total{code=~"5.."}[5m]) > 5'';
+          for = "5m";
+          labels.severity = "warning";
+          annotations.summary = "High 5xx error rate on Traefik";
+          annotations.description = "Service {{ $labels.service }} is returning 5xx errors at {{ $value }} req/sec";
+        }
+      ];
+    }
+
+    {
+      name = "caddy_alerts";
+      interval = "30s";
+      rules = [
+        # The "caddy" job has reported up == 0 for 2 minutes
+        {
+          alert = "CaddyDown";
+          expr = ''up{job="caddy"} == 0'';
+          for = "2m";
+          labels.severity = "critical";
+          annotations.summary = "Caddy web server down on kerkouane";
+          annotations.description = "Caddy reverse proxy has been down for more than 2 minutes - external access may be broken";
+        }
+      ];
+    }
+
+    {
+      name = "media_services_alerts";
+      interval = "1m";
+      rules = [
+        # Any target of the "exportarr" job (sonarr, radarr, lidarr, prowlarr,
+        # bazarr) has reported up == 0 for 5 minutes
+        {
+          alert = "MediaServiceDown";
+          expr = ''up{job="exportarr"} == 0'';
+          for = "5m";
+          labels.severity = "warning";
+          annotations.summary = "Media service down on {{ $labels.instance }}";
+          annotations.description = "Exportarr exporter for {{ $labels.instance }} has been unreachable for 5 minutes - check *arr services";
+        }
+
+        # Sonarr or Radarr queue above 50 items for 30 minutes
+        {
+          alert = "MediaQueueBackingUp";
+          expr = ''sonarr_queue_total > 50 or radarr_queue_total > 50'';
+          for = "30m";
+          labels.severity = "warning";
+          annotations.summary = "Media download queue backing up";
+          annotations.description = "Download queue has {{ $value }} items - may indicate stuck downloads";
+        }
+      ];
+    }
+
+    # MQTT alerts disabled - exporter package broken in nixpkgs
+    # {
+    #   name = "mqtt_alerts";
+    #   interval = "30s";
+    #   rules = [
+    #     # Mosquitto MQTT broker down
+    #     {
+    #       alert = "MQTTBrokerDown";
+    #       expr = "up{job=\"mosquitto\"} == 0";
+    #       for = "2m";
+    #       labels = {
+    #         severity = "critical";
+    #       };
+    #       annotations = {
+    #         summary = "MQTT broker down on demeter";
+    #         description = "Mosquitto MQTT broker has been unreachable for more than 2 minutes - home automation may be affected";
+    #       };
+    #     }
+
+    #     # MQTT high connection rate (potential issue)
+    #     {
+    #       alert = "MQTTHighConnectionRate";
+    #       expr = "rate(mosquitto_connect_received[5m]) > 10";
+    #       for = "5m";
+    #       labels = {
+    #         severity = "warning";
+    #       };
+    #       annotations = {
+    #         summary = "High MQTT connection rate";
+    #         description = "MQTT broker is seeing {{ $value }} connections/sec - may indicate reconnection loops";
+    #       };
+    #     }
+    #   ];
+    # }
+
+    {
+      name = "homeassistant_alerts";
+      interval = "1m";
+      rules = [
+        # The "homeassistant" job has reported up == 0 for 5 minutes
+        {
+          alert = "HomeAssistantDown";
+          expr = ''up{job="homeassistant"} == 0'';
+          for = "5m";
+          labels.severity = "warning";
+          annotations.summary = "Home Assistant unreachable";
+          annotations.description = "Home Assistant on hass.home has been unreachable for more than 5 minutes";
+        }
+      ];
+    }
+
+    {
+      # Intentionally empty placeholder group: it defines no rules yet and
+      # exists only to reserve the name and evaluation interval.
+      name = "backup_alerts";
+      interval = "1h";
+      rules = [
+        # Backup failures (to be added once backup monitoring is implemented)
+        # Placeholder for tarsnap, restic, rsync backup monitoring
+      ];
+    }
+  ];
+}
secrets.nix
@@ -103,5 +103,7 @@ in
   "secrets/rhea/jellyfin-auto-collections-jellyseerr-password.age".publicKeys = users ++ [ rhea ];
   "secrets/rhea/webdav-password.age".publicKeys = users ++ [ rhea ];
   "secrets/sakhalin/grafana-admin-password.age".publicKeys = users ++ [ sakhalin ];
+  "secrets/sakhalin/ntfy-token.age".publicKeys = users ++ [ sakhalin ];
+  "secrets/sakhalin/homeassistant-prometheus-token.age".publicKeys = users ++ [ sakhalin ];
   "secrets/demeter/mosquitto-homeassistant-password.age".publicKeys = users ++ [ demeter ];
 }