Commit d39f5c51a6cd

Vincent Demeester <vincent@sbr.pm>
2026-02-13 15:32:39
feat(okinawa): replaced ollama with llama-cpp systemd service
Switched from ollama to llama-server with the Vulkan backend for local LLM inference. llama-server provides proper tool-call parsing via Jinja templates and Anthropic Messages API support. Removed the remaining ROCm support, as Vulkan is more stable on the RX 6700S and gives 2.6x faster prompt eval than ROCm on RDNA2.
1 parent b8ba302
Changed files (1)
systems
okinawa
systems/okinawa/extra.nix
@@ -50,52 +50,55 @@
     endpointPublicKey = "${globals.machines.kerkouane.net.vpn.pubkey}";
   };
 
-  # Ollama for local LLM inference with dGPU
-  services.ollama = {
-    enable = true;
-    # Use ollama-vulkan for GPU acceleration on RDNA2 (RX 6700S)
-    # ollama-rocm has GGML_ASSERT(max_blocks_per_sm > 0) failures on RDNA2
-    package = pkgs.ollama-vulkan;
-    host = "0.0.0.0";
-    port = 11434;
+  # llama.cpp server for local LLM inference with Vulkan GPU (RX 6700S)
+  # Replaces ollama — better tool-call parsing, Anthropic API, faster Vulkan inference
+  # Benchmarks on RX 6700S: 7B ~40 tok/s, 3B ~85 tok/s (Vulkan, Q4_K_M)
+  systemd.services.llama-cpp =
+    let
+      llama-cpp-vulkan = pkgs.llama-cpp.override { vulkanSupport = true; };
+    in
+    {
+      description = "llama.cpp server (Vulkan)";
+      after = [ "network.target" ];
+      wantedBy = [ "multi-user.target" ];
 
-    # Models optimized for 8GB VRAM (RX 6700S)
-    loadModels = [
-      # Coding models
-      "qwen2.5-coder:7b" # Best coding: 88.4% HumanEval (~20-30 tok/s)
-      "codestral" # Latest coding (Jan 2025): 86.6% HumanEval (~8-10 tok/s)
+      environment = {
+        GGML_VK_VISIBLE_DEVICES = "0"; # Use RX 6700S dGPU only
+        HIP_VISIBLE_DEVICES = ""; # Disable ROCm to avoid conflicts
+      };
 
-      # Reasoning models
-      "phi4-reasoning" # Best 14B reasoning (~6-10 tok/s, tight on 8GB)
+      serviceConfig = {
+        Type = "idle";
+        KillSignal = "SIGINT";
+        ExecStart = builtins.concatStringsSep " " [
+          "${llama-cpp-vulkan}/bin/llama-server"
+          "--log-disable"
+          "--host 127.0.0.1"
+          "--port 8090"
+          "-hf Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:Q4_K_M"
+          "-ngl 99"
+          "-np 1"
+          "--api-key local-test"
+        ];
+        Restart = "on-failure";
+        RestartSec = 30;
 
-      # Multimodal
-      "qwen2.5vl:7b" # Vision + text (~15-20 tok/s)
+        # Run as vincent to access HuggingFace model cache
+        User = "vincent";
+        Group = "users";
 
-      # General purpose
-      "llama3.1:8b" # Native tool support (~20-25 tok/s)
-      "phi3.5:3.8b" # Ultra-fast for quick tasks (~30-40 tok/s)
-    ];
-
-    environmentVariables = {
-      # Disable ROCm to avoid conflicts with Vulkan backend
-      # Vulkan is 2.6x faster prompt eval, 1.2x faster generation on RDNA2
-      HIP_VISIBLE_DEVICES = "";
-      OLLAMA_KEEP_ALIVE = "10m";
-      OLLAMA_NUM_PARALLEL = "1";
+        # GPU access and the model cache in vincent's home require relaxed sandboxing
+        PrivateDevices = false;
+        ProtectHome = false;
+        MemoryDenyWriteExecute = false;
+      };
     };
-  };
-
-  # GPU environment variables (needed for ROCm with RX 6700S / gfx1032)
-  environment.variables = {
-    HSA_OVERRIDE_GFX_VERSION = "10.3.0";
-  };
 
   # System packages for LLM and gaming
   environment.systemPackages = with pkgs; [
-    # LLM tools
+    # LLM tools (same package as the service, for CLI use)
     (llama-cpp.override {
       vulkanSupport = true;
-      rocmSupport = true;
     })
 
     # GPU monitoring and management