fedora-csb-system-manager
  1{
  2  libx,
  3  globals,
  4  lib,
  5  pkgs,
  6  monitoring,
  7  config,
  8  ...
  9}:
 10let
 11  # Get machines that should be monitored
 12  # Exclude: kyushu (laptop), shikoku (temporarily stopped), nagoya (not yet configured)
 13  nodeExporterMachines = lib.filterAttrs (
 14    name: _machine:
 15    !builtins.elem name [
 16      "kyushu"
 17      "shikoku"
 18      "nagoya"
 19    ]
 20  ) (monitoring.machinesWithNodeExporter globals.machines);
 21
 22  # Generate node exporter targets
 23  nodeExporterTargets = monitoring.mkPrometheusTargets {
 24    machines = nodeExporterMachines;
 25    port = 9000;
 26  };
 27
 28  # Machines with BIND DNS
 29  bindMachines = lib.filterAttrs (
 30    _name: _machine:
 31    builtins.elem _name [
 32      "demeter"
 33      "athena"
 34    ]
 35  ) globals.machines;
 36  bindTargets = monitoring.mkPrometheusTargets {
 37    machines = bindMachines;
 38    port = 9009;
 39  };
 40
 41  # PostgreSQL hosts
 42  postgresTargets = map (host: "${host}.sbr.pm:9187") [
 43    "rhea"
 44    "sakhalin"
 45  ];
 46
 47  # Exportarr services configuration
 48  exportarrServices = {
 49    sonarr = {
 50      port = 9707;
 51    };
 52    radarr = {
 53      port = 9708;
 54    };
 55    lidarr = {
 56      port = 9709;
 57    };
 58    prowlarr = {
 59      port = 9710;
 60    };
 61    bazarr = {
 62      port = 9712;
 63    };
 64  };
 65  exportarrTargets = lib.mapAttrsToList (
 66    _name: cfg: "rhea.sbr.pm:${toString cfg.port}"
 67  ) exportarrServices;
 68
 69  # Docker hosts with metrics enabled
 70  dockerMachines = lib.filterAttrs (
 71    _name: _machine:
 72    builtins.elem _name [
 73      "sakhalin"
 74      "aomi"
 75    ]
 76  ) globals.machines;
 77  dockerTargets = monitoring.mkPrometheusTargets {
 78    machines = dockerMachines;
 79    port = 9323;
 80  };
 81in
 82{
 83
 84  imports = [
 85    ../common/services/containers.nix
 86    ../common/services/docker.nix
 87    ../common/services/binfmt.nix
 88    ../common/services/prometheus-exporters-node.nix
 89    ../common/services/prometheus-exporters-postgres.nix
 90  ];
 91
 92  # Disable TPM2 (hardware has no TPM chip)
 93  security.tpm2.enable = lib.mkForce false;
 94
 95  # Age secrets
 96  age.secrets."grafana-admin-password" = {
 97    file = ../../secrets/sakhalin/grafana-admin-password.age;
 98    mode = "400";
 99    owner = "grafana";
100  };
101  age.secrets."ntfy-token" = {
102    file = ../../secrets/sakhalin/ntfy-token.age;
103    mode = "440";
104    owner = "root";
105    group = "root";
106  };
107  age.secrets."homeassistant-prometheus-token" = {
108    file = ../../secrets/sakhalin/homeassistant-prometheus-token.age;
109    mode = "400";
110    owner = "prometheus";
111  };
112
113  # TODO make it an option ? (otherwise I'll add it for all)
114  users.users.vincent.linger = true;
115
116  systemd.services.n8n.environment = {
117    N8N_SECURE_COOKIE = "false";
118    PATH = lib.mkForce "/run/current-system/sw/bin";
119  };
120
121  services = {
122    atuin = {
123      enable = true;
124      host = "0.0.0.0";
125      openRegistration = false;
126    };
127
128    n8n = {
129      enable = true;
130      openFirewall = true;
131      # webhookUrl = "";
132    };
133    paperless = {
134      enable = true;
135      address = "0.0.0.0"; # Listen on all interfaces for access via LAN and VPN
136      port = 8000;
137      settings = {
138        PAPERLESS_URL = "https://paperless.sbr.pm";
139      };
140    };
141    # PostgreSQL backups
142    postgresqlBackup = {
143      enable = true;
144      databases = [ ];
145      location = "/var/backup/postgresql";
146      startAt = "*-*-* 02:15:00"; # Daily at 2:15 AM
147    };
148
149    grafana = {
150      enable = true;
151      settings = {
152        server = {
153          http_addr = "0.0.0.0";
154          http_port = 3000;
155          domain = "grafana.sbr.pm";
156          root_url = "https://grafana.sbr.pm";
157        };
158      };
159
160      provision = {
161        enable = true;
162        datasources.settings = {
163          apiVersion = 1;
164          datasources = [
165            {
166              name = "Prometheus";
167              type = "prometheus";
168              access = "proxy";
169              url = "http://localhost:9001";
170              isDefault = true;
171              jsonData = {
172                timeInterval = "30s";
173              };
174            }
175          ];
176        };
177
178        dashboards.settings = {
179          apiVersion = 1;
180          providers = [
181            {
182              name = "Default";
183              type = "file";
184              disableDeletion = false;
185              allowUiUpdates = true;
186              options.path = "/var/lib/grafana/dashboards";
187            }
188          ];
189        };
190      };
191    };
192    prometheus = {
193      enable = true;
194      port = 9001;
195      checkConfig = false; # Disable config check due to agenix secrets not available at build time
196
197      # Alert rules
198      ruleFiles = [
199        (pkgs.writeText "prometheus-alerts.yml" (builtins.toJSON (import ./prometheus-alerts.nix)))
200      ];
201
202      # Alertmanager configuration
203      alertmanagers = [
204        {
205          static_configs = [
206            {
207              targets = [ "localhost:9093" ];
208            }
209          ];
210        }
211      ];
212
213      scrapeConfigs = [
214        {
215          job_name = "node";
216          static_configs = [
217            {
218              targets = nodeExporterTargets;
219            }
220          ];
221        }
222        {
223          job_name = "bind";
224          static_configs = [
225            {
226              targets = bindTargets;
227            }
228          ];
229        }
230        {
231          job_name = "postgres";
232          static_configs = [
233            {
234              targets = postgresTargets;
235            }
236          ];
237        }
238        {
239          job_name = "traefik";
240          static_configs = [
241            {
242              targets = [ "rhea.sbr.pm:8080" ];
243            }
244          ];
245        }
246        {
247          job_name = "caddy";
248          static_configs = [
249            {
250              targets = [ "${builtins.head globals.machines.kerkouane.net.vpn.ips}:2019" ];
251            }
252          ];
253        }
254        {
255          job_name = "exportarr";
256          static_configs = [
257            {
258              targets = exportarrTargets;
259            }
260          ];
261        }
262        # Mosquitto MQTT exporter disabled - package broken in nixpkgs
263        # {
264        #   job_name = "mosquitto";
265        #   static_configs = [
266        #     {
267        #       targets = [ "demeter.sbr.pm:9234" ];
268        #     }
269        #   ];
270        # }
271        {
272          job_name = "homeassistant";
273          static_configs = [
274            {
275              targets = [ "${builtins.head globals.machines.hass.net.ips}:8123" ];
276            }
277          ];
278          metrics_path = "/api/prometheus";
279          bearer_token_file = config.age.secrets."homeassistant-prometheus-token".path;
280        }
281        {
282          job_name = "docker";
283          static_configs = [
284            {
285              targets = dockerTargets;
286            }
287          ];
288        }
289        {
290          job_name = "ollama";
291          static_configs = [
292            {
293              targets = [ "${builtins.head globals.machines.aomi.net.ips}:8000" ];
294            }
295          ];
296        }
297        {
298          job_name = "restic";
299          static_configs = [
300            {
301              targets = [ "aion.sbr.pm:9753" ];
302            }
303          ];
304        }
305      ];
306    };
307
308    # Alertmanager for routing alerts
309    prometheus.alertmanager = {
310      enable = true;
311      port = 9093;
312      webExternalUrl = "http://localhost:9093";
313
314      configuration = {
315        global = {
316          resolve_timeout = "5m";
317        };
318
319        route = {
320          group_by = [
321            "alertname"
322            "instance"
323          ];
324          group_wait = "30s";
325          group_interval = "5m";
326          repeat_interval = "12h";
327          receiver = "ntfy";
328        };
329
330        receivers = [
331          {
332            name = "ntfy";
333            webhook_configs = [
334              {
335                url = "http://localhost:8081/hook"; # alertmanager-ntfy bridge
336                send_resolved = true;
337              }
338            ];
339          }
340        ];
341      };
342    };
343
344    tarsnap = {
345      enable = true;
346      archives = {
347        documents = {
348          directories = [ "/home/vincent/desktop/documents" ];
349          period = "daily";
350          keyfile = "/etc/nixos/assets/tarsnap.documents.key";
351        };
352        org = {
353          directories = [ "/home/vincent/desktop/org" ];
354          period = "daily";
355          keyfile = "/etc/nixos/assets/tarsnap.org.key";
356        };
357      };
358    };
359    nfs.server = {
360      enable = true;
361      exports = ''
362        /export                      192.168.1.0/24(rw,fsid=0,no_subtree_check) 10.100.0.0/24(rw,fsid=0,no_subtree_check)
363        /export/gaia                 192.168.1.0/24(rw,fsid=1,no_subtree_check) 10.100.0.0/24(rw,fsid=1,no_subtree_check)
364        /export/toshito              192.168.1.0/24(rw,fsid=2,no_subtree_check) 10.100.0.0/24(rw,fsid=2,no_subtree_check)
365      '';
366    };
367
368    wireguard = {
369      enable = true;
370      ips = libx.wg-ips globals.machines.sakhalin.net.vpn.ips;
371      endpoint = "${globals.net.vpn.endpoint}";
372      endpointPublicKey = "${globals.machines.kerkouane.net.vpn.pubkey}";
373    };
374  };
375
376  # Create Grafana dashboard directory and deploy Ollama dashboards
377  systemd.tmpfiles.rules = [
378    "d /var/lib/grafana/dashboards 0755 grafana grafana -"
379    "C /var/lib/grafana/dashboards/ollama-metrics.json 0644 grafana grafana - ${../../tools/ollama-exporter/grafana-dashboard.json}"
380    "C /var/lib/grafana/dashboards/ollama-performance.json 0644 grafana grafana - ${../../tools/ollama-exporter/grafana-dashboard-custom.json}"
381  ];
382
383  # Set Grafana admin password from secret file
384  systemd.services.grafana-set-admin-password = {
385    description = "Set Grafana admin password from secret file";
386    after = [ "grafana.service" ];
387    wantedBy = [ "multi-user.target" ];
388    serviceConfig = {
389      Type = "oneshot";
390      User = "grafana";
391      RemainAfterExit = true;
392    };
393    script = ''
394      # Only set password if admin user exists (database initialized)
395      if ${pkgs.grafana}/bin/grafana-cli --homepath /var/lib/grafana admin reset-admin-password --password-from-stdin < ${
396        config.age.secrets."grafana-admin-password".path
397      } 2>/dev/null; then
398        echo "Admin password updated successfully"
399      else
400        echo "Failed to update password or admin user doesn't exist yet"
401      fi
402    '';
403  };
404
405  # ntfy-alertmanager bridge - manual service configuration with token support
406  systemd.services.alertmanager-ntfy = {
407    description = "Alertmanager to ntfy bridge";
408    after = [ "network.target" ];
409    wantedBy = [ "multi-user.target" ];
410
411    serviceConfig = {
412      Type = "simple";
413      DynamicUser = true;
414      StateDirectory = "alertmanager-ntfy";
415      Restart = "on-failure";
416      RestartSec = "5s";
417      ExecStart = "${pkgs.alertmanager-ntfy}/bin/alertmanager-ntfy --configs /var/lib/alertmanager-ntfy/config.yml";
418      # Run config preparation as root (+ prefix) before starting the main process
419      ExecStartPre =
420        "+"
421        + pkgs.writeShellScript "prepare-alertmanager-ntfy-config" ''
422                  # Read the token from the secret file
423                  TOKEN=$(cat ${config.age.secrets."ntfy-token".path})
424
425                  # Generate config with the actual token
426                  cat > /var/lib/alertmanager-ntfy/config.yml <<'EOF'
427          http:
428            addr: 127.0.0.1:8081
429
430          ntfy:
431            baseurl: https://ntfy.sbr.pm
432            auth:
433              token: TOKEN_PLACEHOLDER
434            notification:
435              topic: homelab
436              priority: 'status == "firing" ? "urgent" : "default"'
437              tags:
438                - tag: rotating_light
439                  condition: 'status == "firing" && labels.severity == "critical"'
440                - tag: warning
441                  condition: 'status == "firing" && labels.severity == "warning"'
442                - tag: "+1"
443                  condition: 'status == "resolved"'
444              templates:
445                title: '{{ if eq .Status "resolved" }} Resolved: {{ end }}{{ if eq .Status "firing" }}🔥 {{ end }}{{ index .Annotations "summary" }}'
446                description: '{{ index .Annotations "description" }}'
447          EOF
448                  # Replace placeholder with actual token
449                  ${pkgs.gnused}/bin/sed -i "s/TOKEN_PLACEHOLDER/$TOKEN/" /var/lib/alertmanager-ntfy/config.yml
450                  # Make config readable by the dynamic user
451                  chmod 644 /var/lib/alertmanager-ntfy/config.yml
452        '';
453    };
454  };
455
456  environment.systemPackages = with pkgs; [ yt-dlp ];
457  # mr -i u daily
458  systemd.services.mr = {
459    description = "Update configs daily";
460    requires = [ "network-online.target" ];
461    after = [ "network-online.target" ];
462
463    restartIfChanged = false;
464    unitConfig.X-StopOnRemoval = false;
465
466    serviceConfig = {
467      Type = "oneshot";
468      User = "vincent";
469      OnFailure = "status-email-root@%n.service";
470    };
471
472    path = with pkgs; [
473      git
474      mr
475    ];
476    script = ''
477      set -e
478       cd /mnt/gaia/src/configs/
479       mr -t run git reset --hard
480       mr -t u
481    '';
482
483    startAt = "daily";
484  };
485  # Kiwix serve
486  systemd.services.kiwix-serve = {
487    description = "Kiwix offline content server";
488    wantedBy = [ "multi-user.target" ];
489    after = [ "network.target" ];
490
491    serviceConfig = {
492      Type = "simple";
493      User = "vincent";
494      ExecStart = "${pkgs.bash}/bin/bash -c '${pkgs.kiwix-tools}/bin/kiwix-serve --port=8080 /mnt/gaia/kiwix/*.zim'";
495      Restart = "on-failure";
496      RestartSec = "5s";
497    };
498  };
499
500  # Open firewall for services accessible from the network
501  networking.firewall.allowedTCPPorts = [
502    8000 # Paperless-ngx web interface
503  ];
504}