Commit a44f34fc60a1

Vincent Demeester <vincent@sbr.pm>
2026-01-06 21:53:41
feat(monitoring): Add comprehensive Ollama monitoring with Prometheus
Implemented full monitoring stack for Ollama LLM infrastructure: Aomi (Ollama host): - ollama-exporter Docker service (ghcr.io/frcooper/ollama-exporter) - Exposes Prometheus metrics on port 8000 - Tracks requests, latency, token throughput per model - Systemd service with automatic restart and Docker image updates Sakhalin (Prometheus): - Added Ollama scrape job targeting aomi:8000 - Comprehensive alert rules for Ollama monitoring: * Service availability (OllamaDown) * High latency (P95 > 5s) * Low throughput (< 5 tokens/sec) * High error rate (> 5%) - Alerts route to existing ntfy integration Monitoring Capabilities: - Request rate by model - Inference latency (P50, P95, P99) - Token generation throughput - Input/output token counts - Model load times - Error rates Reference: Based on comprehensive Ollama monitoring research (ollama-exporter, Prometheus integration, production best practices) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com> Signed-off-by: Vincent Demeester <vincent@sbr.pm>
1 parent ff91b54
Changed files (3)
systems/aomi/extra.nix
@@ -1,6 +1,7 @@
 {
   globals,
   libx,
+  pkgs,
   ...
 }:
 {
@@ -126,4 +127,42 @@
     # };
   };
 
+  # Ollama Prometheus exporter: Docker container supervised by systemd.
+  systemd.services.ollama-exporter = {
+    description = "Ollama Prometheus Exporter";
+    after = [
+      "docker.service"
+      "ollama.service"
+    ];
+    requires = [ "docker.service" ];
+    wantedBy = [ "multi-user.target" ];
+
+    serviceConfig = {
+      Type = "simple";
+      Restart = "always";
+      RestartSec = "10s";
+
+      # A leading "-" makes systemd ignore that command's failure.
+      ExecStartPre = [
+        "-${pkgs.docker}/bin/docker stop ollama-exporter"
+        "-${pkgs.docker}/bin/docker rm ollama-exporter"
+        # Best-effort refresh: a failed pull must not block startup.
+        "-${pkgs.docker}/bin/docker pull ghcr.io/frcooper/ollama-exporter:latest"
+      ];
+
+      # Host networking shares the host's ports; Docker discards -p here.
+      ExecStart = ''
+        ${pkgs.docker}/bin/docker run --rm --name ollama-exporter \
+          --network host \
+          -e OLLAMA_HOST=http://localhost:11434 \
+          ghcr.io/frcooper/ollama-exporter:latest
+      '';
+
+      ExecStop = "${pkgs.docker}/bin/docker stop ollama-exporter";
+    };
+  };
+
+  # Allow inbound TCP 8000 so Prometheus can scrape the Ollama exporter
+  networking.firewall.allowedTCPPorts = [ 8000 ];
+
 }
systems/sakhalin/extra.nix
@@ -287,6 +287,14 @@ in
             }
           ];
         }
+        # Ollama exporter running on aomi, addressed by the first IP listed
+        # for that machine in globals.
+        {
+          job_name = "ollama";
+          static_configs = [
+            { targets = [ "${builtins.head globals.machines.aomi.net.ips}:8000" ]; }
+          ];
+        }
       ];
     };
 
systems/sakhalin/prometheus-alerts.nix
@@ -367,6 +367,68 @@
       ];
     }
 
+    {
+      name = "ollama_alerts";
+      interval = "30s";
+      rules = [
+        # Scrape target of the "ollama" job has been down for 2 minutes.
+        # The up series reflects whether Prometheus could scrape the target,
+        # i.e. the exporter endpoint on aomi.
+        {
+          alert = "OllamaDown";
+          for = "2m";
+          expr = "up{job=\"ollama\"} == 0";
+          labels.severity = "critical";
+          annotations = {
+            description = "Ollama LLM service has been unreachable for more than 2 minutes - check aomi ollama-exporter";
+            summary = "Ollama service down on {{ $labels.instance }}";
+          };
+        }
+
+        # Per-model P95 response time above 5 seconds, sustained for 5 minutes.
+        # The quantile is computed from the response-time histogram buckets,
+        # aggregated by (le, model) over a 5-minute rate window.
+        {
+          alert = "OllamaHighLatency";
+          for = "5m";
+          expr = "histogram_quantile(0.95, sum(rate(ollama_response_seconds_bucket[5m])) by (le, model)) > 5";
+          labels.severity = "warning";
+          annotations = {
+            description = "Model {{ $labels.model }} P95 latency is {{ $value | humanizeDuration }} (threshold: 5s) - CPU may be overloaded";
+            summary = "High Ollama inference latency";
+          };
+        }
+
+        # Low throughput: tokens are being generated, but at fewer than
+        # 5 tokens/sec, for 10+ minutes. The "> 0" filter drops idle series
+        # so the alert does not fire just because nobody is using Ollama.
+        {
+          alert = "OllamaLowThroughput";
+          expr = "(rate(ollama_tokens_generated_total[5m]) > 0) < 5";
+          for = "10m";
+          labels.severity = "warning";
+          annotations = {
+            summary = "Low Ollama token generation rate";
+            description = "Token generation rate is {{ $value | humanize }} tokens/sec (expected: 7-15 for CPU) - check aomi CPU usage";
+          };
+        }
+
+        # High error rate (> 5% of requests for 5+ minutes).
+        # The expression stays a 0-1 ratio because humanizePercentage scales
+        # by 100 itself; a pre-scaled value would render 6% as "600%".
+        {
+          alert = "OllamaHighErrorRate";
+          expr = "sum(rate(ollama_requests_total{status=~\"5..\"}[5m])) / sum(rate(ollama_requests_total[5m])) > 0.05";
+          for = "5m";
+          labels.severity = "critical";
+          annotations = {
+            summary = "High Ollama error rate";
+            description = "Error rate is {{ $value | humanizePercentage }} (threshold: 5%) - check ollama logs on aomi";
+          };
+        }
+      ];
+    }
+
     {
       name = "backup_alerts";
       interval = "1h";