Commit df1fa3fa5688

Vincent Demeester <vincent@sbr.pm>
2026-02-13 16:14:59
feat(okinawa): multi-model llama-server router
Switched llama.cpp from single-model to router mode with models-preset INI serving 6 models (3 coding, 3 general). Only one model loaded at a time with auto-swap on request. Added oneshot service for bulk model downloads and updated pi models.json for OpenAI-compatible model selection.
1 parent ff8bb87
Changed files (1)
systems
okinawa
systems/okinawa/extra.nix
@@ -51,14 +51,49 @@
   };
 
   # llama.cpp server for local LLM inference with Vulkan GPU (RX 6700S)
-  # Replaces ollama — better tool-call parsing, Anthropic API, faster Vulkan inference
-  # Benchmarks on RX 6700S: 7B ~40 tok/s, 3B ~85 tok/s (Vulkan, Q4_K_M)
+  # Router mode: serves multiple models with on-demand loading (--models-max 1)
+  # Only one model loaded in VRAM at a time; auto-swaps on request
+  # Benchmarks on RX 6700S: 7B dense ~40 tok/s, 3B active MoE ~20-40 tok/s (Vulkan, Q4_K_M)
   systemd.services.llama-cpp =
     let
       llama-cpp-vulkan = pkgs.llama-cpp.override { vulkanSupport = true; };
+
+      # Model preset INI for router mode, handed to llama-server through
+      # --models-preset in the service arguments.
+      # Section names become model IDs in the API (used in models.json).
+      # Each section name is a "<repo>:<quant>" string of the same shape the
+      # previous single-model setup passed to -hf; presumably the router
+      # resolves it the same way (Hugging Face repo + quantization) — confirm
+      # against the llama-server version in use.
+      # The [*] section carries settings shared by every model; n-gpu-layers
+      # here takes over from the former "-ngl 99" command-line flag.
+      # NOTE: this list is duplicated in the `models` array of the
+      # llama-download-models service; keep the two in sync when adding or
+      # removing a model.
+      modelsPreset = pkgs.writeText "llama-models.ini" ''
+        version = 1
+
+        ; Shared settings for all models
+        [*]
+        n-gpu-layers = 99
+        jinja = true
+
+        ; === Coding models ===
+
+        ; Qwen3 Coder Next 80B MoE (3B active) — best coding model, needs ~48GB RAM
+        [Qwen/Qwen3-Coder-Next-GGUF:Q3_K_M]
+
+        ; Qwen3 Coder 30B MoE (3B active) — sweet spot coding, ~19GB RAM
+        [Qwen/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q4_K_M]
+
+        ; Qwen 2.5 Coder 7B dense — lightweight coding, ~5GB (current default)
+        [Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:Q4_K_M]
+
+        ; === General purpose models ===
+
+        ; Qwen3 8B dense — best all-rounder (reasoning, tool use, multilingual)
+        [Qwen/Qwen3-8B-GGUF:Q4_K_M]
+
+        ; DeepSeek R1 Distill 7B — deep reasoning / chain-of-thought
+        [bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M]
+
+        ; Phi-4 mini 3.8B — ultra-fast utility model
+        [unsloth/Phi-4-mini-instruct-GGUF:Q4_K_M]
+      '';
+
     in
     {
-      description = "llama.cpp server (Vulkan)";
+      description = "llama.cpp server (Vulkan, router mode)";
       after = [ "network.target" ];
       wantedBy = [ "multi-user.target" ];
 
@@ -75,8 +110,8 @@
           "--log-disable"
           "--host 127.0.0.1"
           "--port 8090"
-          "-hf Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:Q4_K_M"
-          "-ngl 99"
+          "--models-preset ${modelsPreset}"
+          "--models-max 1"
           "-np 1"
           "--api-key local-test"
         ];
@@ -94,6 +129,45 @@
       };
     };
 
+  # Oneshot service to pre-download all LLM models.
+  # Run manually: sudo systemctl start llama-download-models
+  #
+  # It is not wanted by any target, so it only runs when started explicitly.
+  # Every model is attempted even if an earlier one fails; the unit ends in
+  # the "failed" state when at least one model could not be fetched, so
+  # `systemctl status` / `journalctl -u llama-download-models` show the truth.
+  #
+  # NOTE(review): downloads land in the cache of the user below. This assumes
+  # the llama-cpp service reads the same cache (same user / LLAMA_CACHE) —
+  # confirm against that service's environment.
+  systemd.services.llama-download-models =
+    let
+      llama-cpp-vulkan = pkgs.llama-cpp.override { vulkanSupport = true; };
+
+      # Upper bound for a single model (GNU timeout duration syntax).
+      # Multi-GB GGUF files need far longer than a couple of minutes; the
+      # limit only exists to stop a hung llama-cli from blocking forever.
+      perModelTimeout = "6h";
+    in
+    {
+      description = "Download LLM models for llama.cpp";
+      after = [ "network-online.target" ];
+      wants = [ "network-online.target" ];
+
+      serviceConfig = {
+        Type = "oneshot";
+        RemainAfterExit = false;
+        User = "vincent";
+        Group = "users";
+        ExecStart = pkgs.writeShellScript "llama-download-models" ''
+          set -euo pipefail
+          export PATH="${llama-cpp-vulkan}/bin:$PATH"
+
+          # Keep in sync with the section names of the llama-server models preset.
+          models=(
+            "Qwen/Qwen3-Coder-Next-GGUF:Q3_K_M"
+            "Qwen/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q4_K_M"
+            "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:Q4_K_M"
+            "Qwen/Qwen3-8B-GGUF:Q4_K_M"
+            "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M"
+            "unsloth/Phi-4-mini-instruct-GGUF:Q4_K_M"
+          )
+
+          failed=()
+          for model in "''${models[@]}"; do
+            echo "Checking/downloading $model..."
+            # llama-cli fetches the model as a side effect of -hf; generating a
+            # single token (-n 1) makes it exit right after the model is usable.
+            # Running it as the `if` condition keeps `set -e` from aborting the
+            # loop, so the remaining models are still attempted on failure.
+            if timeout ${perModelTimeout} llama-cli -hf "$model" -p "test" -n 1 --no-display-prompt 2>&1; then
+              echo "Done: $model"
+            else
+              status=$?
+              # Exit status 124 means the time limit was hit (GNU timeout).
+              echo "FAILED (exit $status): $model" >&2
+              failed+=("$model")
+            fi
+          done
+
+          if (( ''${#failed[@]} > 0 )); then
+            echo "''${#failed[@]} of ''${#models[@]} model(s) failed:" >&2
+            printf '  %s\n' "''${failed[@]}" >&2
+            exit 1
+          fi
+          echo "All ''${#models[@]} model(s) available."
+        '';
+      };
+    };
+
+
   # System packages for LLM and gaming
   environment.systemPackages = with pkgs; [
     # LLM tools (same package as the service, for CLI use)