Commit 985b808cd3f7

Vincent Demeester <vincent@sbr.pm>
2026-02-12 16:50:23
fix(okinawa): fix ollama and llama-cli on RDNA2
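Switched from the ollama-rocm package to the regular ollama package (Vulkan backend) to fix the GGML_ASSERT(max_blocks_per_sm > 0) crash on the RX 6700S. Added llama-cpp environment defaults that disable flash attention and pin inference to the dGPU (device 0); the defaults for both are broken under ROCm on this GPU (gfx1032, run as gfx1030 via HSA_OVERRIDE_GFX_VERSION).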
1 parent ec88213
Changed files (2)
systems
systems/okinawa/extra.nix
@@ -53,7 +53,9 @@
   # Ollama for local LLM inference with dGPU
   services.ollama = {
     enable = true;
-    package = pkgs.ollama-rocm; # ROCm support for AMD GPU
+    # Use the regular ollama package (intended to run on Vulkan) instead of ollama-rocm, which aborts with
+    # GGML_ASSERT(max_blocks_per_sm > 0) on RDNA2. NOTE(review): confirm pkgs.ollama is actually built with Vulkan.
+    package = pkgs.ollama; # Vulkan support for AMD GPU (more stable than ROCm)
     host = "0.0.0.0";
     port = 11434;
 
@@ -75,14 +77,14 @@
     ];
 
     environmentVariables = {
-      # Critical: RX 6700S (gfx1032) needs this override
-      HSA_OVERRIDE_GFX_VERSION = "10.3.0";
+      # Vulkan is used automatically for AMD GPUs
+      # No HSA_OVERRIDE_GFX_VERSION needed with Vulkan backend
       OLLAMA_KEEP_ALIVE = "10m";
       OLLAMA_NUM_PARALLEL = "1";
     };
   };
 
-  # ROCm environment variables
+  # GPU environment variables (needed for ROCm with RX 6700S / gfx1032)
   environment.variables = {
     HSA_OVERRIDE_GFX_VERSION = "10.3.0";
   };
systems/okinawa/home.nix
@@ -23,6 +23,13 @@ in
     ../../home/common/shell/gh.nix
   ];
 
+  # llama-cpp defaults for RX 6700S (RDNA2): disable flash attention and use GPU 0 (the dGPU)
+  # Flash attention crashes under ROCm on gfx1032 (run as gfx1030): GGML_ASSERT(max_blocks_per_sm > 0)
+  home.sessionVariables = {
+    LLAMA_ARG_FLASH_ATTN = "off";
+    LLAMA_ARG_MAIN_GPU = "0";
+  };
+
   home.packages = with pkgs; [
     nautilus