Commit df1fa3fa5688
Changed files (1)
systems
okinawa
systems/okinawa/extra.nix
@@ -51,14 +51,49 @@
};
# llama.cpp server for local LLM inference with Vulkan GPU (RX 6700S)
- # Replaces ollama — better tool-call parsing, Anthropic API, faster Vulkan inference
- # Benchmarks on RX 6700S: 7B ~40 tok/s, 3B ~85 tok/s (Vulkan, Q4_K_M)
+ # Router mode: serves multiple models with on-demand loading (--models-max 1)
+ # Only one model loaded in VRAM at a time; auto-swaps on request
+ # Benchmarks on RX 6700S: 7B dense ~40 tok/s, 3B active MoE ~20-40 tok/s (Vulkan, Q4_K_M)
systemd.services.llama-cpp =
let
llama-cpp-vulkan = pkgs.llama-cpp.override { vulkanSupport = true; };
+
+ # Model preset INI for router mode; handed to the llama.cpp server through
+ # the --models-preset flag in ExecStart below.
+ # Section names become model IDs in the API (used in models.json).
+ # The per-model sections declare no keys of their own, so the only settings
+ # written here are the shared ones under [*].
+ # NOTE: the same model identifiers are listed again in the `models` array of
+ # the llama-download-models service; keep both lists in sync when editing.
+ modelsPreset = pkgs.writeText "llama-models.ini" ''
+ version = 1
+
+ ; Shared settings for all models
+ [*]
+ n-gpu-layers = 99
+ jinja = true
+
+ ; === Coding models ===
+
+ ; Qwen3 Coder Next 80B MoE (3B active) — best coding model, needs ~48GB RAM
+ [Qwen/Qwen3-Coder-Next-GGUF:Q3_K_M]
+
+ ; Qwen3 Coder 30B MoE (3B active) — sweet spot coding, ~19GB RAM
+ [Qwen/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q4_K_M]
+
+ ; Qwen 2.5 Coder 7B dense — lightweight coding, ~5GB (current default)
+ [Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:Q4_K_M]
+
+ ; === General purpose models ===
+
+ ; Qwen3 8B dense — best all-rounder (reasoning, tool use, multilingual)
+ [Qwen/Qwen3-8B-GGUF:Q4_K_M]
+
+ ; DeepSeek R1 Distill 7B — deep reasoning / chain-of-thought
+ [bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M]
+
+ ; Phi-4 mini 3.8B — ultra-fast utility model
+ [unsloth/Phi-4-mini-instruct-GGUF:Q4_K_M]
+ '';
+
in
{
- description = "llama.cpp server (Vulkan)";
+ description = "llama.cpp server (Vulkan, router mode)";
after = [ "network.target" ];
wantedBy = [ "multi-user.target" ];
@@ -75,8 +110,8 @@
"--log-disable"
"--host 127.0.0.1"
"--port 8090"
- "-hf Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:Q4_K_M"
- "-ngl 99"
+ "--models-preset ${modelsPreset}"
+ "--models-max 1"
"-np 1"
"--api-key local-test"
];
@@ -94,6 +129,45 @@
};
};
+ # Oneshot service to pre-download all LLM models.
+ # It has no wantedBy, so it never runs at boot.
+ # Run manually: sudo systemctl start llama-download-models
+ #
+ # Every model in the list is attempted even if an earlier one fails; the unit
+ # finishes in the "failed" state when at least one model could not be
+ # fetched, so `systemctl status` / `journalctl -u llama-download-models`
+ # reflect the real outcome.
+ systemd.services.llama-download-models =
+ let
+ llama-cpp-vulkan = pkgs.llama-cpp.override { vulkanSupport = true; };
+
+ # Upper bound for fetching (and briefly loading) a single model.
+ # The models below are multi-GB files, so a short limit would kill the
+ # transfer mid-download; this is only a safety net against a hung process.
+ perModelTimeout = "6h";
+ in
+ {
+ description = "Download LLM models for llama.cpp";
+ after = [ "network-online.target" ];
+ wants = [ "network-online.target" ];
+
+ serviceConfig = {
+ Type = "oneshot";
+ RemainAfterExit = false;
+ # NOTE(review): downloads land in this user's llama.cpp cache; confirm the
+ # llama-cpp server unit reads the same cache directory.
+ User = "vincent";
+ Group = "users";
+ ExecStart = pkgs.writeShellScript "llama-download-models" ''
+ set -euo pipefail
+ export PATH="${llama-cpp-vulkan}/bin:$PATH"
+
+ # Keep in sync with the sections of the llama-models.ini preset.
+ models=(
+ "Qwen/Qwen3-Coder-Next-GGUF:Q3_K_M"
+ "Qwen/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q4_K_M"
+ "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF:Q4_K_M"
+ "Qwen/Qwen3-8B-GGUF:Q4_K_M"
+ "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M"
+ "unsloth/Phi-4-mini-instruct-GGUF:Q4_K_M"
+ )
+
+ failed=0
+
+ for model in "''${models[@]}"; do
+ echo "Checking/downloading $model..."
+ # Use llama-cli to trigger the download, generate one token, then exit.
+ # stdin comes from /dev/null so llama-cli cannot sit waiting for
+ # interactive input; `|| status=$?` keeps `set -e` from aborting the
+ # loop so the remaining models are still attempted.
+ status=0
+ timeout ${perModelTimeout} llama-cli -hf "$model" -p "test" -n 1 --no-display-prompt < /dev/null 2>&1 || status=$?
+
+ if [ "$status" -eq 0 ]; then
+ echo "Done: $model"
+ elif [ "$status" -eq 124 ]; then
+ # 124 is the exit status GNU timeout uses when the limit was hit.
+ echo "WARNING: $model timed out after ${perModelTimeout}; download may be incomplete" >&2
+ failed=$((failed + 1))
+ else
+ echo "WARNING: llama-cli exited with status $status for $model; download may be incomplete" >&2
+ failed=$((failed + 1))
+ fi
+ done
+
+ if [ "$failed" -gt 0 ]; then
+ echo "$failed model(s) could not be verified; see warnings above" >&2
+ exit 1
+ fi
+ '';
+ };
+ };
+
# System packages for LLM and gaming
environment.systemPackages = with pkgs; [
# LLM tools (same package as the service, for CLI use)