Commit 942b578ff5d8
Changed files (7)
secrets
systems
secrets/sakhalin/homeassistant-prometheus-token.age
@@ -0,0 +1,11 @@
+age-encryption.org/v1
+-> piv-p256 ItIHHA AjI4FdQ2qGRojZNYNa2OOhTSSLo9QffAKuL4FVeq3O/k
+7hAbperkLxrReWsc2c1X2vMcOUw5f6JFUkGskG7vxo0
+-> piv-p256 ViCCtQ AzQheA+bJjaZ+8OoOs9MpDQk2JooCxSTA1shyj5eOoS3
+VkKHUF0ASkEsQjiWxoILONSc9t1bU31zX4vMvxOGSaI
+-> ssh-ed25519 /TxA1A WyHZRhO9dhXffK7qLxyjAv4SNf+MwOhJr0qRV9X0UyA
+eTygzeIvlYWX+7+igebFVUFzQQwodtNdp7O4GjbRcPY
+--- wvm/Rdfufhqy5UnnOstKivAws8Hu9Ia7KaHCSoFxOEc
+�e�0��b�B�~�1��W��U;{)�2�I����'�����Z��1
+�P^ic�N��YM�� CiH2$�M���0BV���ì�G4g�&~�)�y���?�Ƀ����L|q�ߞ��3�-����ۗ�Y�~B�mi/y��@�8]��>?�S>��SW�`�3n;�6�r;�>(5���z,�pd{��pH�|��D��O��y��
+#�c�3�
\ No newline at end of file
secrets/sakhalin/ntfy-token.age
@@ -0,0 +1,9 @@
+age-encryption.org/v1
+-> piv-p256 ItIHHA AiCltJihpOjJucJNkcx/wCMGJZbjtzFPju9wysh+1r6P
+tlg4UXOwtBOn/qiX2XhAZN9P4bdj78RQbpDcZQBFwUU
+-> piv-p256 ViCCtQ A7EEw2gX4F9vvjFeBEFg9+OIfn4ieKJ+P0UEj7z4Fcno
+OhngWIxVB636PU7cl5LnV1B2DWXLdYOp5WYqAKb1Mns
+-> ssh-ed25519 /TxA1A ZNgPs6M0Mri8Dpo+K1oBabjW8e6a3XTe70zUWbS+aVQ
+6zGa3RxkuWxcLzw5t0jHkpJsr+moM79k/CMWWyo80Z0
+--- OqV6Rjgv22rpAdMUj2ihEoAcq2xbjXEjd05hSguUtuM
+*�vE�����p�Vh�C>>�B����Ǩ�����[\d�C�3PL�Z" �"ԛ�����[��
\ No newline at end of file
systems/demeter/extra.nix
@@ -37,16 +37,18 @@
];
};
- prometheus.exporters.mqtt = {
- enable = true;
- port = 9234;
- mqttAddress = "127.0.0.1";
- mqttPort = 1883;
- mqttTopic = "#"; # Subscribe to all topics
- mqttUsername = "homeassistant";
- environmentFile = config.age.secrets."mosquitto-homeassistant-password".path;
- logLevel = "INFO";
- };
+ # MQTT exporter disabled due to broken package in nixpkgs
+ # ImportError: cannot import name 'main' from 'mqtt_exporter.main'
+ # prometheus.exporters.mqtt = {
+ # enable = true;
+ # port = 9234;
+ # mqttAddress = "127.0.0.1";
+ # mqttPort = 1883;
+ # mqttTopic = "#"; # Subscribe to all topics
+ # mqttUsername = "homeassistant";
+ # environmentFile = config.age.secrets."mosquitto-homeassistant-password".path;
+ # logLevel = "INFO";
+ # };
wireguard = {
enable = true;
systems/rhea/extra.nix
@@ -688,6 +688,13 @@ in
8883 # MQTTS
8080 # Traefik metrics
9000 # Node exporter
+ 9187 # PostgreSQL exporter
+ # Exportarr exporters
+ 9707 # Sonarr
+ 9708 # Radarr
+ 9709 # Lidarr
+ 9710 # Prowlarr
+ 9712 # Bazarr
# NFS ports
111 # rpcbind
2049 # NFS daemon
systems/sakhalin/extra.nix
@@ -9,7 +9,15 @@
}:
let
# Get machines that should be monitored
- nodeExporterMachines = monitoring.machinesWithNodeExporter globals.machines;
+ # Exclude: kyushu (laptop), shikoku (temporarily stopped), nagoya (not yet configured)
+ # builtins.removeAttrs drops the listed names and ignores any that are not present.
+ nodeExporterMachines =
+ builtins.removeAttrs (monitoring.machinesWithNodeExporter globals.machines)
+ [
+ "kyushu"
+ "shikoku"
+ "nagoya"
+ ];
# Generate node exporter targets
nodeExporterTargets = monitoring.mkPrometheusTargets {
@@ -75,6 +83,17 @@ in
mode = "400";
owner = "grafana";
};
+ age.secrets."ntfy-token" = {
+ file = ../../secrets/sakhalin/ntfy-token.age;
+ mode = "440";
+ owner = "root";
+ group = "root";
+ };
+ age.secrets."homeassistant-prometheus-token" = {
+ file = ../../secrets/sakhalin/homeassistant-prometheus-token.age;
+ mode = "400";
+ owner = "prometheus";
+ };
# TODO make it an option ? (otherwise I'll add it for all)
users.users.vincent.linger = true;
@@ -154,6 +173,24 @@ in
prometheus = {
enable = true;
port = 9001;
+ checkConfig = "syntax-only"; # Keep promtool's syntax check; only skip file-existence checks, since agenix secrets (bearer_token_file) are not available at build time
+
+ # Alert rules
+ ruleFiles = [
+ (pkgs.writeText "prometheus-alerts.yml" (builtins.toJSON (import ./prometheus-alerts.nix)))
+ ];
+
+ # Alertmanager configuration
+ alertmanagers = [
+ {
+ static_configs = [
+ {
+ targets = [ "localhost:9093" ];
+ }
+ ];
+ }
+ ];
+
scrapeConfigs = [
{
job_name = "node";
@@ -203,25 +240,64 @@ in
}
];
}
- {
- job_name = "mosquitto";
- static_configs = [
- {
- targets = [ "demeter.sbr.pm:9234" ];
- }
- ];
- }
+ # Mosquitto MQTT exporter disabled - package broken in nixpkgs
+ # {
+ # job_name = "mosquitto";
+ # static_configs = [
+ # {
+ # targets = [ "demeter.sbr.pm:9234" ];
+ # }
+ # ];
+ # }
{
job_name = "homeassistant";
static_configs = [
{
- targets = [ "home.sbr.pm:8123" ];
+ targets = [ "${builtins.head globals.machines.hass.net.ips}:8123" ];
}
];
metrics_path = "/api/prometheus";
+ bearer_token_file = config.age.secrets."homeassistant-prometheus-token".path;
}
];
};
+
+ # Alertmanager for routing alerts
+ prometheus.alertmanager = {
+ enable = true;
+ port = 9093;
+ webExternalUrl = "http://localhost:9093";
+
+ configuration = {
+ global = {
+ resolve_timeout = "5m";
+ };
+
+ route = {
+ group_by = [
+ "alertname"
+ "instance"
+ ];
+ group_wait = "30s";
+ group_interval = "5m";
+ repeat_interval = "12h";
+ receiver = "ntfy";
+ };
+
+ receivers = [
+ {
+ name = "ntfy";
+ webhook_configs = [
+ {
+ url = "http://localhost:8081/hook"; # alertmanager-ntfy bridge
+ send_resolved = true;
+ }
+ ];
+ }
+ ];
+ };
+ };
+
tarsnap = {
enable = true;
archives = {
@@ -281,6 +357,57 @@ in
'';
};
+ # ntfy-alertmanager bridge - manual service configuration with token support
+ systemd.services.alertmanager-ntfy = {
+ description = "Alertmanager to ntfy bridge";
+ after = [ "network.target" ];
+ wantedBy = [ "multi-user.target" ];
+
+ serviceConfig = {
+ Type = "simple";
+ DynamicUser = true;
+ StateDirectory = "alertmanager-ntfy";
+ Restart = "on-failure";
+ RestartSec = "5s";
+ ExecStart = "${pkgs.alertmanager-ntfy}/bin/alertmanager-ntfy --configs /var/lib/alertmanager-ntfy/config.yml";
+ # Run config preparation as root (+ prefix) before starting the main process
+ ExecStartPre =
+ "+"
+ + pkgs.writeShellScript "prepare-alertmanager-ntfy-config" ''
+ # Abort on the first error so an unreadable secret never yields a config with an empty token
+ set -eu
+ TOKEN=$(cat ${config.age.secrets."ntfy-token".path})
+ # Generate config with a placeholder; the quoted heredoc stops the shell from expanding anything inside it
+ cat > /var/lib/alertmanager-ntfy/config.yml <<'EOF'
+ http:
+ addr: 127.0.0.1:8081
+
+ ntfy:
+ baseurl: https://ntfy.sbr.pm
+ auth:
+ token: TOKEN_PLACEHOLDER
+ notification:
+ topic: homelab
+ priority: 'status == "firing" ? "urgent" : "default"'
+ tags:
+ - tag: rotating_light
+ condition: 'status == "firing" && labels.severity == "critical"'
+ - tag: warning
+ condition: 'status == "firing" && labels.severity == "warning"'
+ - tag: "+1"
+ condition: 'status == "resolved"'
+ templates:
+ title: '{{ if eq .Status "resolved" }}✅ Resolved: {{ end }}{{ if eq .Status "firing" }}🔥 {{ end }}{{ index .Annotations "summary" }}'
+ description: '{{ index .Annotations "description" }}'
+ EOF
+ # Replace placeholder with actual token; escape &, | and \ first so sed inserts the token literally
+ ${pkgs.gnused}/bin/sed -i "s|TOKEN_PLACEHOLDER|$(printf '%s' "$TOKEN" | ${pkgs.gnused}/bin/sed 's/[&|\]/\\&/g')|" /var/lib/alertmanager-ntfy/config.yml
+ # Make config readable by the dynamic user
+ chmod 644 /var/lib/alertmanager-ntfy/config.yml
+ '';
+ };
+ };
+
environment.systemPackages = with pkgs; [ yt-dlp ]; # -----------------------------------
environment.etc."vrsync".text = ''
/home/vincent/desktop/pictures/screenshots/ vincent@synodine.home:/volumeUSB2/usbshare/pictures/screenshots/
systems/sakhalin/prometheus-alerts.nix
@@ -0,0 +1,321 @@
+# Prometheus alert rules for homelab monitoring
+# Based on: ~/desktop/org/notes/*prometheus*.org
+{
+ groups = [
+ {
+ name = "node_alerts";
+ interval = "30s";
+ rules = [
+ # Disk space warnings
+ {
+ alert = "DiskSpaceLow";
+ expr = ''
+ (node_filesystem_avail_bytes{fstype!="tmpfs",fstype!="ramfs",fstype!="squashfs"} / node_filesystem_size_bytes{fstype!="tmpfs",fstype!="ramfs",fstype!="squashfs"}) * 100 < 15
+ '';
+ for = "5m";
+ labels = {
+ severity = "warning";
+ };
+ annotations = {
+ summary = "Disk space low on {{ $labels.instance }}";
+ description = "Filesystem {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 15% space remaining ({{ $value | printf \"%.1f\" }}%)"; # expr already yields 0-100; humanizePercentage expects a 0-1 ratio
+ };
+ }
+ {
+ alert = "DiskSpaceCritical";
+ expr = ''
+ (node_filesystem_avail_bytes{fstype!="tmpfs",fstype!="ramfs",fstype!="squashfs"} / node_filesystem_size_bytes{fstype!="tmpfs",fstype!="ramfs",fstype!="squashfs"}) * 100 < 5
+ '';
+ for = "2m";
+ labels = {
+ severity = "critical";
+ };
+ annotations = {
+ summary = "Disk space critical on {{ $labels.instance }}";
+ description = "Filesystem {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 5% space remaining ({{ $value | printf \"%.1f\" }}%)"; # expr already yields 0-100; humanizePercentage expects a 0-1 ratio
+ };
+ }
+
+ # High CPU usage
+ {
+ alert = "HighCPUUsage";
+ expr = ''
+ 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
+ '';
+ for = "10m";
+ labels = {
+ severity = "warning";
+ };
+ annotations = {
+ summary = "High CPU usage on {{ $labels.instance }}";
+ description = "CPU usage on {{ $labels.instance }} is above 80% (current: {{ $value | printf \"%.1f\" }}%)"; # expr already yields 0-100; humanizePercentage expects a 0-1 ratio
+ };
+ }
+
+ # High memory usage
+ {
+ alert = "HighMemoryUsage";
+ expr = ''
+ (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
+ '';
+ for = "5m";
+ labels = {
+ severity = "warning";
+ };
+ annotations = {
+ summary = "High memory usage on {{ $labels.instance }}";
+ description = "Memory usage on {{ $labels.instance }} is above 90% (current: {{ $value | printf \"%.1f\" }}%)"; # expr already yields 0-100; humanizePercentage expects a 0-1 ratio
+ };
+ }
+
+ # Node down
+ {
+ alert = "NodeDown";
+ expr = "up{job=\"node\"} == 0";
+ for = "2m";
+ labels = {
+ severity = "critical";
+ };
+ annotations = {
+ summary = "Node exporter down on {{ $labels.instance }}";
+ description = "Node exporter on {{ $labels.instance }} has been down for more than 2 minutes";
+ };
+ }
+ ];
+ }
+
+ {
+ name = "service_alerts";
+ interval = "30s";
+ rules = [
+ # Service exporters down
+ {
+ alert = "ServiceExporterDown";
+ expr = "up{job!=\"node\"} == 0";
+ for = "5m";
+ labels = {
+ severity = "warning";
+ };
+ annotations = {
+ summary = "Service exporter down: {{ $labels.job }}";
+ description = "Service exporter {{ $labels.job }} on {{ $labels.instance }} has been down for more than 5 minutes";
+ };
+ }
+
+ # PostgreSQL down
+ {
+ alert = "PostgreSQLDown";
+ expr = "pg_up == 0";
+ for = "2m";
+ labels = {
+ severity = "critical";
+ };
+ annotations = {
+ summary = "PostgreSQL down on {{ $labels.instance }}";
+ description = "PostgreSQL database on {{ $labels.instance }} has been unreachable for more than 2 minutes";
+ };
+ }
+
+ # Traefik down
+ {
+ alert = "TraefikDown";
+ expr = "up{job=\"traefik\"} == 0";
+ for = "2m";
+ labels = {
+ severity = "critical";
+ };
+ annotations = {
+ summary = "Traefik reverse proxy down";
+ description = "Traefik on rhea.sbr.pm has been down for more than 2 minutes - all web services may be inaccessible";
+ };
+ }
+ ];
+ }
+
+ {
+ name = "dns_alerts";
+ interval = "30s";
+ rules = [
+ # BIND DNS service down
+ {
+ alert = "DNSServiceDown";
+ expr = "up{job=\"bind\"} == 0";
+ for = "2m";
+ labels = {
+ severity = "critical";
+ };
+ annotations = {
+ summary = "DNS service down on {{ $labels.instance }}";
+ description = "BIND DNS service on {{ $labels.instance }} has been unreachable for more than 2 minutes - DNS resolution may fail";
+ };
+ }
+
+ # High DNS query failure rate
+ {
+ alert = "HighDNSQueryFailureRate";
+ expr = "rate(bind_query_errors_total[5m]) > 10";
+ for = "5m";
+ labels = {
+ severity = "warning";
+ };
+ annotations = {
+ summary = "High DNS query failure rate on {{ $labels.instance }}";
+ description = "DNS query failure rate on {{ $labels.instance }} is above 10 queries/sec";
+ };
+ }
+ ];
+ }
+
+ {
+ name = "traefik_alerts";
+ interval = "30s";
+ rules = [
+ # Traefik certificate expiration warning
+ {
+ alert = "TraefikCertificateExpiringSoon";
+ expr = "(traefik_tls_certs_not_after - time()) / 86400 < 7";
+ for = "1h";
+ labels = {
+ severity = "warning";
+ };
+ annotations = {
+ summary = "Traefik TLS certificate expiring soon";
+ description = "TLS certificate for {{ $labels.cn }} will expire in less than 7 days";
+ };
+ }
+
+ # High error rate (5xx responses)
+ {
+ alert = "TraefikHighErrorRate";
+ expr = "sum by (instance, service) (rate(traefik_service_requests_total{code=~\"5..\"}[5m])) > 5"; # aggregate the per-series rates so the threshold applies to the whole service, as the description states
+ for = "5m";
+ labels = {
+ severity = "warning";
+ };
+ annotations = {
+ summary = "High 5xx error rate on Traefik";
+ description = "Service {{ $labels.service }} is returning 5xx errors at {{ $value }} req/sec";
+ };
+ }
+ ];
+ }
+
+ {
+ name = "caddy_alerts";
+ interval = "30s";
+ rules = [
+ # Caddy down
+ {
+ alert = "CaddyDown";
+ expr = "up{job=\"caddy\"} == 0";
+ for = "2m";
+ labels = {
+ severity = "critical";
+ };
+ annotations = {
+ summary = "Caddy web server down on kerkouane";
+ description = "Caddy reverse proxy has been down for more than 2 minutes - external access may be broken";
+ };
+ }
+ ];
+ }
+
+ {
+ name = "media_services_alerts";
+ interval = "1m";
+ rules = [
+ # Exportarr services down (sonarr, radarr, lidarr, prowlarr, bazarr)
+ {
+ alert = "MediaServiceDown";
+ expr = "up{job=\"exportarr\"} == 0";
+ for = "5m";
+ labels = {
+ severity = "warning";
+ };
+ annotations = {
+ summary = "Media service down on {{ $labels.instance }}";
+ description = "Exportarr exporter for {{ $labels.instance }} has been unreachable for 5 minutes - check *arr services";
+ };
+ }
+
+ # Sonarr/Radarr queue backing up
+ {
+ alert = "MediaQueueBackingUp";
+ expr = "sonarr_queue_total > 50 or radarr_queue_total > 50";
+ for = "30m";
+ labels = {
+ severity = "warning";
+ };
+ annotations = {
+ summary = "Media download queue backing up";
+ description = "Download queue has {{ $value }} items - may indicate stuck downloads";
+ };
+ }
+ ];
+ }
+
+ # MQTT alerts disabled - exporter package broken in nixpkgs
+ # {
+ # name = "mqtt_alerts";
+ # interval = "30s";
+ # rules = [
+ # # Mosquitto MQTT broker down
+ # {
+ # alert = "MQTTBrokerDown";
+ # expr = "up{job=\"mosquitto\"} == 0";
+ # for = "2m";
+ # labels = {
+ # severity = "critical";
+ # };
+ # annotations = {
+ # summary = "MQTT broker down on demeter";
+ # description = "Mosquitto MQTT broker has been unreachable for more than 2 minutes - home automation may be affected";
+ # };
+ # }
+
+ # # MQTT high connection rate (potential issue)
+ # {
+ # alert = "MQTTHighConnectionRate";
+ # expr = "rate(mosquitto_connect_received[5m]) > 10";
+ # for = "5m";
+ # labels = {
+ # severity = "warning";
+ # };
+ # annotations = {
+ # summary = "High MQTT connection rate";
+ # description = "MQTT broker is seeing {{ $value }} connections/sec - may indicate reconnection loops";
+ # };
+ # }
+ # ];
+ # }
+
+ {
+ name = "homeassistant_alerts";
+ interval = "1m";
+ rules = [
+ # Home Assistant unreachable
+ {
+ alert = "HomeAssistantDown";
+ expr = "up{job=\"homeassistant\"} == 0";
+ for = "5m";
+ labels = {
+ severity = "warning";
+ };
+ annotations = {
+ summary = "Home Assistant unreachable";
+ description = "Home Assistant on hass.home has been unreachable for more than 5 minutes";
+ };
+ }
+ ];
+ }
+
+ {
+ name = "backup_alerts";
+ interval = "1h";
+ rules = [
+ # Backup failures (we'll add these when we implement backup monitoring)
+ # Placeholder for tarsnap, restic, rsync backup monitoring
+ ];
+ }
+ ];
+}
secrets.nix
@@ -103,5 +103,7 @@ in
"secrets/rhea/jellyfin-auto-collections-jellyseerr-password.age".publicKeys = users ++ [ rhea ];
"secrets/rhea/webdav-password.age".publicKeys = users ++ [ rhea ];
"secrets/sakhalin/grafana-admin-password.age".publicKeys = users ++ [ sakhalin ];
+ "secrets/sakhalin/ntfy-token.age".publicKeys = users ++ [ sakhalin ];
+ "secrets/sakhalin/homeassistant-prometheus-token.age".publicKeys = users ++ [ sakhalin ];
"secrets/demeter/mosquitto-homeassistant-password.age".publicKeys = users ++ [ demeter ];
}