fedora-csb-system-manager
1{
2 libx,
3 globals,
4 lib,
5 pkgs,
6 monitoring,
7 config,
8 ...
9}:
10let
# Machines scraped by the "node" job: everything returned by
# monitoring.machinesWithNodeExporter, minus the hosts excluded below.
nodeExporterMachines =
  let
    # kyushu (laptop), shikoku (temporarily stopped), nagoya (not yet configured)
    unmonitored = [
      "kyushu"
      "shikoku"
      "nagoya"
    ];
    isMonitored = name: _machine: !(builtins.elem name unmonitored);
  in
  lib.filterAttrs isMonitored (monitoring.machinesWithNodeExporter globals.machines);

# Prometheus targets for the monitored machines, on port 9000
nodeExporterTargets = monitoring.mkPrometheusTargets {
  port = 9000;
  machines = nodeExporterMachines;
};
27
# Hosts running BIND DNS, selected by name from globals.machines
bindMachines =
  let
    bindHosts = [
      "demeter"
      "athena"
    ];
  in
  lib.filterAttrs (name: _machine: builtins.elem name bindHosts) globals.machines;

# Prometheus targets for the BIND hosts, on port 9009
bindTargets = monitoring.mkPrometheusTargets {
  port = 9009;
  machines = bindMachines;
};
40
# PostgreSQL hosts, each turned into a "<host>.sbr.pm:9187" target
postgresTargets =
  let
    toTarget = host: "${host}.sbr.pm:9187";
  in
  map toTarget [
    "rhea"
    "sakhalin"
  ];
46
# Exportarr services and the port each one is scraped on
exportarrServices = {
  sonarr.port = 9707;
  radarr.port = 9708;
  lidarr.port = 9709;
  prowlarr.port = 9710;
  bazarr.port = 9712;
};

# One "rhea.sbr.pm:<port>" target per service above; only the port is used,
# the service name is not part of the target.
exportarrTargets = map (svc: "rhea.sbr.pm:${toString svc.port}") (
  builtins.attrValues exportarrServices
);
68
# Docker hosts with metrics enabled, selected by name from globals.machines
dockerMachines =
  let
    dockerHosts = [
      "sakhalin"
      "aomi"
    ];
  in
  lib.filterAttrs (name: _machine: builtins.elem name dockerHosts) globals.machines;

# Prometheus targets for the Docker hosts, on port 9323
dockerTargets = monitoring.mkPrometheusTargets {
  port = 9323;
  machines = dockerMachines;
};
81in
82{
83
84 imports = [
85 ../common/services/containers.nix
86 ../common/services/docker.nix
87 ../common/services/binfmt.nix
88 ../common/services/prometheus-exporters-node.nix
89 ../common/services/prometheus-exporters-postgres.nix
90 ];
91
# The hardware has no TPM chip, so TPM2 support is forced off
security.tpm2.enable = lib.mkForce false;

# Age secrets, each with the owner and mode required by its consumer
age.secrets = {
  "grafana-admin-password" = {
    owner = "grafana";
    mode = "400";
    file = ../../secrets/sakhalin/grafana-admin-password.age;
  };
  "ntfy-token" = {
    owner = "root";
    group = "root";
    mode = "440";
    file = ../../secrets/sakhalin/ntfy-token.age;
  };
  "homeassistant-prometheus-token" = {
    owner = "prometheus";
    mode = "400";
    file = ../../secrets/sakhalin/homeassistant-prometheus-token.age;
  };
};

# TODO make it an option ? (otherwise I'll add it for all)
users.users.vincent.linger = true;

# Extra environment for the n8n unit; PATH is forced to the system profile
systemd.services.n8n.environment = {
  PATH = lib.mkForce "/run/current-system/sw/bin";
  N8N_SECURE_COOKIE = "false";
};
120
121 services = {
# Atuin server, listening on all interfaces with registration closed
atuin = {
  enable = true;
  openRegistration = false;
  host = "0.0.0.0";
};

n8n = {
  enable = true;
  openFirewall = true;
  # webhookUrl = "";
};

paperless = {
  enable = true;
  # Listen on all interfaces for access via LAN and VPN
  address = "0.0.0.0";
  port = 8000;
  settings.PAPERLESS_URL = "https://paperless.sbr.pm";
};

# PostgreSQL backups, written daily at 2:15 AM
postgresqlBackup = {
  enable = true;
  startAt = "*-*-* 02:15:00";
  location = "/var/backup/postgresql";
  # NOTE(review): empty list; presumably the module then backs up every
  # database - confirm against the postgresqlBackup module defaults.
  databases = [ ];
};
148
grafana = {
  enable = true;

  # HTTP listener and the public URL Grafana is served under
  settings.server = {
    http_addr = "0.0.0.0";
    http_port = 3000;
    domain = "grafana.sbr.pm";
    root_url = "https://grafana.sbr.pm";
  };

  provision = {
    enable = true;

    # Single default datasource pointing at localhost:9001
    datasources.settings = {
      apiVersion = 1;
      datasources = [
        {
          name = "Prometheus";
          type = "prometheus";
          url = "http://localhost:9001";
          access = "proxy";
          isDefault = true;
          jsonData.timeInterval = "30s";
        }
      ];
    };

    # File-based dashboard provider reading /var/lib/grafana/dashboards
    dashboards.settings = {
      apiVersion = 1;
      providers = [
        {
          name = "Default";
          type = "file";
          options.path = "/var/lib/grafana/dashboards";
          allowUiUpdates = true;
          disableDeletion = false;
        }
      ];
    };
  };
};
prometheus = {
  enable = true;
  port = 9001;
  # Disable config check due to agenix secrets not available at build time
  checkConfig = false;

  # Alert rules: ./prometheus-alerts.nix serialised with builtins.toJSON
  ruleFiles = [
    (pkgs.writeText "prometheus-alerts.yml" (builtins.toJSON (import ./prometheus-alerts.nix)))
  ];

  # Alertmanager instance that receives the alerts
  alertmanagers = [
    { static_configs = [ { targets = [ "localhost:9093" ]; } ]; }
  ];

  scrapeConfigs =
    let
      # A scrape job made of a single static target group
      staticJob = job_name: targets: {
        inherit job_name;
        static_configs = [ { inherit targets; } ];
      };
      # "<first address of ips>:<port>"
      firstIpOn = port: ips: "${builtins.head ips}:${toString port}";
      inherit (globals) machines;
    in
    [
      (staticJob "node" nodeExporterTargets)
      (staticJob "bind" bindTargets)
      (staticJob "postgres" postgresTargets)
      (staticJob "traefik" [ "rhea.sbr.pm:8080" ])
      (staticJob "caddy" [ (firstIpOn 2019 machines.kerkouane.net.vpn.ips) ])
      (staticJob "exportarr" exportarrTargets)
      # Mosquitto MQTT exporter disabled - package broken in nixpkgs
      # {
      #   job_name = "mosquitto";
      #   static_configs = [
      #     {
      #       targets = [ "demeter.sbr.pm:9234" ];
      #     }
      #   ];
      # }
      (
        staticJob "homeassistant" [ (firstIpOn 8123 machines.hass.net.ips) ]
        // {
          # Scraped with a bearer token read from the agenix secret file
          metrics_path = "/api/prometheus";
          bearer_token_file = config.age.secrets."homeassistant-prometheus-token".path;
        }
      )
      (staticJob "docker" dockerTargets)
      (staticJob "ollama" [ (firstIpOn 8000 machines.aomi.net.ips) ])
      (staticJob "restic" [ "aion.sbr.pm:9753" ])
    ];
};
307
# Alertmanager: every alert is routed to the single "ntfy" receiver
prometheus.alertmanager = {
  enable = true;
  port = 9093;
  webExternalUrl = "http://localhost:9093";

  configuration = {
    global.resolve_timeout = "5m";

    route = {
      receiver = "ntfy";
      group_by = [
        "alertname"
        "instance"
      ];
      group_wait = "30s";
      group_interval = "5m";
      repeat_interval = "12h";
    };

    receivers = [
      {
        name = "ntfy";
        webhook_configs = [
          {
            # alertmanager-ntfy bridge
            url = "http://localhost:8081/hook";
            send_resolved = true;
          }
        ];
      }
    ];
  };
};
343
# Daily tarsnap archives; each archive <name> backs up
# /home/vincent/desktop/<name> with its own key file.
tarsnap = {
  enable = true;
  archives =
    lib.genAttrs
      [
        "documents"
        "org"
      ]
      (name: {
        directories = [ "/home/vincent/desktop/${name}" ];
        period = "daily";
        keyfile = "/etc/nixos/assets/tarsnap.${name}.key";
      });
};
# NFS exports, offered read-write to 192.168.1.0/24 and 10.100.0.0/24
nfs.server = {
  exports = ''
    /export 192.168.1.0/24(rw,fsid=0,no_subtree_check) 10.100.0.0/24(rw,fsid=0,no_subtree_check)
    /export/gaia 192.168.1.0/24(rw,fsid=1,no_subtree_check) 10.100.0.0/24(rw,fsid=1,no_subtree_check)
    /export/toshito 192.168.1.0/24(rw,fsid=2,no_subtree_check) 10.100.0.0/24(rw,fsid=2,no_subtree_check)
  '';
  enable = true;
};
367
# WireGuard client: this host's VPN addresses, plus the endpoint and the
# public key recorded for kerkouane in globals.
wireguard = {
  enable = true;
  ips = libx.wg-ips globals.machines.sakhalin.net.vpn.ips;
  endpoint = globals.net.vpn.endpoint;
  endpointPublicKey = globals.machines.kerkouane.net.vpn.pubkey;
};
374 };
375
# Create the Grafana dashboard directory and copy the Ollama dashboards into it
systemd.tmpfiles.rules =
  let
    dashboardsDir = "/var/lib/grafana/dashboards";
    # tmpfiles "C" rule copying `src` to <dashboardsDir>/<name>
    copyDashboard = name: src: "C ${dashboardsDir}/${name} 0644 grafana grafana - ${src}";
  in
  [
    "d ${dashboardsDir} 0755 grafana grafana -"
    (copyDashboard "ollama-metrics.json" ../../tools/ollama-exporter/grafana-dashboard.json)
    (copyDashboard "ollama-performance.json" ../../tools/ollama-exporter/grafana-dashboard-custom.json)
  ];
382
# One-shot unit, ordered after grafana.service, that feeds the agenix secret
# to grafana-cli on stdin. A failed reset is only logged; the script itself
# still exits successfully.
systemd.services.grafana-set-admin-password =
  let
    grafanaCli = "${pkgs.grafana}/bin/grafana-cli";
    passwordFile = config.age.secrets."grafana-admin-password".path;
  in
  {
    description = "Set Grafana admin password from secret file";
    wantedBy = [ "multi-user.target" ];
    after = [ "grafana.service" ];
    serviceConfig = {
      User = "grafana";
      Type = "oneshot";
      RemainAfterExit = true;
    };
    script = ''
      # Only set password if admin user exists (database initialized)
      if ${grafanaCli} --homepath /var/lib/grafana admin reset-admin-password --password-from-stdin < ${passwordFile} 2>/dev/null; then
        echo "Admin password updated successfully"
      else
        echo "Failed to update password or admin user doesn't exist yet"
      fi
    '';
  };
404
# ntfy-alertmanager bridge - manual service configuration with token support.
# The bridge listens on 127.0.0.1:8081 and forwards to https://ntfy.sbr.pm.
# Its config file is regenerated on every start so the token never enters
# the Nix store.
systemd.services.alertmanager-ntfy = {
  description = "Alertmanager to ntfy bridge";
  after = [ "network.target" ];
  wantedBy = [ "multi-user.target" ];

  serviceConfig = {
    Type = "simple";
    DynamicUser = true;
    StateDirectory = "alertmanager-ntfy";
    Restart = "on-failure";
    RestartSec = "5s";
    ExecStart = "${pkgs.alertmanager-ntfy}/bin/alertmanager-ntfy --configs /var/lib/alertmanager-ntfy/config.yml";
    # Run config preparation as root (+ prefix) before starting the main process
    ExecStartPre =
      "+"
      + pkgs.writeShellScript "prepare-alertmanager-ntfy-config" ''
        # Abort on any error or unset variable so the bridge never starts
        # with a half-written config.
        set -eu

        # Read the token from the secret file; refuse to continue without one
        TOKEN=$(cat ${config.age.secrets."ntfy-token".path})
        if [ -z "$TOKEN" ]; then
          echo "ntfy token is empty, refusing to write config" >&2
          exit 1
        fi

        # Generate the config with the token expanded directly by the shell.
        # The heredoc delimiter is unquoted on purpose: $TOKEN is the only
        # expansion in the body, so the token is inserted verbatim (no sed,
        # hence no trouble with '/', '&' or '\' in the token). Keep the body
        # free of other '$', backtick and backslash characters.
        cat > /var/lib/alertmanager-ntfy/config.yml <<EOF
        http:
          addr: 127.0.0.1:8081

        ntfy:
          baseurl: https://ntfy.sbr.pm
          auth:
            token: $TOKEN
          notification:
            topic: homelab
            priority: 'status == "firing" ? "urgent" : "default"'
            tags:
              - tag: rotating_light
                condition: 'status == "firing" && labels.severity == "critical"'
              - tag: warning
                condition: 'status == "firing" && labels.severity == "warning"'
              - tag: "+1"
                condition: 'status == "resolved"'
            templates:
              title: '{{ if eq .Status "resolved" }}✅ Resolved: {{ end }}{{ index .Annotations "summary" }}'
              description: '{{ index .Annotations "description" }}'
        EOF
        # Make config readable by the dynamic user.
        # NOTE(review): the file is written by root and made world-readable;
        # this presumably relies on the DynamicUser state directory being
        # inaccessible to other users - confirm.
        chmod 644 /var/lib/alertmanager-ntfy/config.yml
      '';
  };
};
455
# Extra packages installed system-wide on this host
environment.systemPackages = [ pkgs.yt-dlp ];
# mr -i u daily
# Daily job, run as vincent, that hard-resets and then updates every
# repository mr manages under /mnt/gaia/src/configs/.
systemd.services.mr = {
  description = "Update configs daily";
  requires = [ "network-online.target" ];
  after = [ "network-online.target" ];

  restartIfChanged = false;
  unitConfig = {
    X-StopOnRemoval = false;
    # OnFailure= is a [Unit] setting. It used to sit in serviceConfig
    # ([Service]), where systemd ignores it as an unknown key, so the
    # failure notification unit was never triggered.
    OnFailure = "status-email-root@%n.service";
  };

  serviceConfig = {
    Type = "oneshot";
    User = "vincent";
  };

  path = with pkgs; [
    git
    mr
  ];
  script = ''
    set -e
    cd /mnt/gaia/src/configs/
    mr -t run git reset --hard
    mr -t u
  '';

  startAt = "daily";
};
# Kiwix serve: offline content server on port 8080, running as vincent
systemd.services.kiwix-serve =
  let
    kiwixServe = "${pkgs.kiwix-tools}/bin/kiwix-serve";
    bash = "${pkgs.bash}/bin/bash";
  in
  {
    description = "Kiwix offline content server";
    after = [ "network.target" ];
    wantedBy = [ "multi-user.target" ];

    serviceConfig = {
      User = "vincent";
      Type = "simple";
      # Started through bash -c so the *.zim glob is expanded by a shell
      ExecStart = "${bash} -c '${kiwixServe} --port=8080 /mnt/gaia/kiwix/*.zim'";
      Restart = "on-failure";
      RestartSec = "5s";
    };
  };
499
# Open firewall for services accessible from the network:
#   8000 - Paperless-ngx web interface
networking.firewall.allowedTCPPorts = [ 8000 ];
504}