Commit d39f5c51a6cd
Changed files (1)
systems
okinawa
systems/okinawa/extra.nix
@@ -50,52 +50,55 @@
endpointPublicKey = "${globals.machines.kerkouane.net.vpn.pubkey}";
};
- # Ollama for local LLM inference with dGPU
- services.ollama = {
- enable = true;
- # Use ollama-vulkan for GPU acceleration on RDNA2 (RX 6700S)
- # ollama-rocm has GGML_ASSERT(max_blocks_per_sm > 0) failures on RDNA2
- package = pkgs.ollama-vulkan;
- host = "0.0.0.0";
- port = 11434;
+ # llama.cpp server for local LLM inference with Vulkan GPU (RX 6700S)
+ # Replaces ollama — better tool-call parsing, Anthropic API, faster Vulkan inference
+ # Benchmarks on RX 6700S: 7B ~40 tok/s, 3B ~85 tok/s (Vulkan, Q4_K_M)
+ systemd.services.llama-cpp =
+ let
+ llama-cpp-vulkan = pkgs.llama-cpp.override { vulkanSupport = true; };
+ in
+ {
+ description = "llama.cpp server (Vulkan)";
+ after = [ "network.target" ];
+ wantedBy = [ "multi-user.target" ];
- # Models optimized for 8GB VRAM (RX 6700S)
- loadModels = [
- # Coding models
- "qwen2.5-coder:7b" # Best coding: 88.4% HumanEval (~20-30 tok/s)
- "codestral" # Latest coding (Jan 2025): 86.6% HumanEval (~8-10 tok/s)
+ environment = {
+ GGML_VK_VISIBLE_DEVICES = "0"; # Use RX 6700S dGPU only
+ HIP_VISIBLE_DEVICES = ""; # Disable ROCm to avoid conflicts
+ };
- # Reasoning models
- "phi4-reasoning" # Best 14B reasoning (~6-10 tok/s, tight on 8GB)
+ serviceConfig = {
+ Type = "idle";
+ KillSignal = "SIGINT";
+ ExecStart = builtins.concatStringsSep " " [
+ "${llama-cpp-vulkan}/bin/llama-server"
+ "--log-disable"
+ "--host 127.0.0.1"
+ "--port 8090"
+ "-hf Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:Q4_K_M"
+ "-ngl 99"
+ "-np 1"
+ "--api-key local-test"
+ ];
+ Restart = "on-failure";
+ RestartSec = 30;
- # Multimodal
- "qwen2.5vl:7b" # Vision + text (~15-20 tok/s)
+ # Run as vincent to access HuggingFace model cache
+ User = "vincent";
+ Group = "users";
- # General purpose
- "llama3.1:8b" # Native tool support (~20-25 tok/s)
- "phi3.5:3.8b" # Ultra-fast for quick tasks (~30-40 tok/s)
- ];
-
- environmentVariables = {
- # Disable ROCm to avoid conflicts with Vulkan backend
- # Vulkan is 2.6x faster prompt eval, 1.2x faster generation on RDNA2
- HIP_VISIBLE_DEVICES = "";
- OLLAMA_KEEP_ALIVE = "10m";
- OLLAMA_NUM_PARALLEL = "1";
+ # GPU access requires relaxed sandboxing
+ PrivateDevices = false;
+ ProtectHome = false;
+ MemoryDenyWriteExecute = false;
+ };
};
- };
-
- # GPU environment variables (needed for ROCm with RX 6700S / gfx1032)
- environment.variables = {
- HSA_OVERRIDE_GFX_VERSION = "10.3.0";
- };
# System packages for LLM and gaming
environment.systemPackages = with pkgs; [
- # LLM tools
+ # LLM tools (same package as the service, for CLI use)
(llama-cpp.override {
vulkanSupport = true;
- rocmSupport = true;
})
# GPU monitoring and management