Commit 09ad187fe24f

Vincent Demeester <vincent@sbr.pm>
2026-03-24 14:36:14
feat(flake-updater): add AI auto-fix for builds
Added AI-powered auto-fix to the nix-flake-updater module. When a flake.lock update breaks a host build, pi is invoked to diagnose and fix the error automatically with retries and regression checking across all hosts. - Switched weekly schedule to bi-weekly (1st/3rd Sunday) - Added AGENTS.md files for channel-awareness (unstable vs stable-25.11) so any agent session knows the constraints - New module options: autoFix.enable, command, extraArgs, maxAttempts, envFile, environment - Uses google-vertex-claude provider (pi extension) on okinawa with Application Default Credentials
1 parent bb387d2
Changed files (8)
home
modules
nix-flake-updater
systems
tools
home/common/AGENTS.md
@@ -0,0 +1,23 @@
+# Shared Home-Manager Configuration
+
+Code in this directory is shared across **all hosts** and **both nixpkgs channels**
+(unstable and stable/25.11). The same rules as `systems/common/AGENTS.md` apply.
+
+## Rules
+
+1. **Never assume a home-manager option or package exists on all channels.**
+   Use conditionals or host-specific overrides when needed.
+
+2. **Host-specific home-manager overrides** go in `systems/<hostname>/home.nix`,
+   not here.
+
+3. **Test both channels** after modifying shared code:
+   - Unstable: `make host/okinawa/build`
+   - Stable: `make host/rhea/build`
+
+## Directory Structure
+
+- `desktop/` — desktop environment programs (browser, terminal, media, etc.)
+- `dev/` — development tools (editors, languages, AI tools, etc.)
+- `services/` — user services (syncthing, gpg-agent, etc.)
+- `shell/` — shell configuration (zsh, starship, aliases, etc.)
modules/nix-flake-updater/default.nix
@@ -123,6 +123,50 @@ let
           default = 3600;
           description = "Random delay in seconds before starting (0-value)";
         };
+
+        # Settings for the optional coding agent that is run when a host build
+        # fails after the lock update. The wrapper script exports them to the
+        # update script as AUTO_FIX_* environment variables.
+        autoFix = {
+          enable = mkEnableOption "AI-powered auto-fix on build failure";
+
+          # Invoked by the update script as: <command> -p <extraArgs...> "<prompt>"
+          command = mkOption {
+            type = types.str;
+            default = "pi";
+            description = "Agent command to invoke (must support -p for non-interactive mode)";
+          };
+
+          # Joined with single spaces into AUTO_FIX_EXTRA_ARGS and word-split
+          # again by the script, so a single argument cannot contain whitespace.
+          extraArgs = mkOption {
+            type = types.listOf types.str;
+            default = [
+              "--provider"
+              "google-vertex-claude"
+              "--no-session"
+              "--no-themes"
+              "--no-skills"
+            ];
+            description = "Extra arguments passed to the agent command (note: do not use --no-extensions if the provider is an extension)";
+          };
+
+          # Upper bound on auto-fix rounds; each round runs the agent at most
+          # once for every host that is still failing.
+          maxAttempts = mkOption {
+            type = types.int;
+            default = 3;
+            description = "Maximum agent invocations per failing host before giving up";
+          };
+
+          # Exported as AUTO_FIX_ENV_FILE only when non-null; the update script
+          # sources it with bash if the file exists.
+          envFile = mkOption {
+            type = types.nullOr types.path;
+            default = null;
+            description = "Optional file to source before running the agent (for API keys, credentials)";
+          };
+
+          # Each entry becomes an `export NAME=value` line in the wrapper script.
+          environment = mkOption {
+            type = types.attrsOf types.str;
+            default = { };
+            example = {
+              GOOGLE_CLOUD_PROJECT = "my-project";
+              GOOGLE_CLOUD_LOCATION = "us-east5";
+            };
+            description = "Environment variables to set when running the agent";
+          };
+        };
       };
     };
 
@@ -144,6 +188,16 @@ let
       ${optionalString (
         instanceCfg.ntfyTokenFile != null
       ) ''export NTFY_TOKEN_FILE="${instanceCfg.ntfyTokenFile}"''}
+      export AUTO_FIX="${toString instanceCfg.autoFix.enable}"
+      export AUTO_FIX_COMMAND="${instanceCfg.autoFix.command}"
+      export AUTO_FIX_EXTRA_ARGS="${concatStringsSep " " instanceCfg.autoFix.extraArgs}"
+      export AUTO_FIX_MAX_ATTEMPTS="${toString instanceCfg.autoFix.maxAttempts}"
+      ${optionalString (
+        instanceCfg.autoFix.envFile != null
+      ) ''export AUTO_FIX_ENV_FILE="${instanceCfg.autoFix.envFile}"''}
+      ${concatStringsSep "\n" (
+        # Shell-quote every value: user-supplied strings may contain double
+        # quotes, dollar signs or backticks that bash would otherwise interpret.
+        # NOTE(review): assumes `lib` is bound in this file - confirm.
+        mapAttrsToList (k: v: "export ${k}=${lib.escapeShellArg v}") instanceCfg.autoFix.environment
+      )}
 
       # Execute the packaged update script (already has tools in PATH)
       exec ${pkgs.nix-flake-update}/bin/nix-flake-update
@@ -176,6 +230,8 @@ let
           "/home/${instanceCfg.user}/.cache/nix"
           # Org inbox for TODOs
           (dirOf instanceCfg.inboxOrg)
+          # Pi agent session/config directory (needed for auto-fix).
+          # The leading "-" asks systemd to skip the path when it does not exist,
+          # so instances without auto-fix still start for users that have no ~/.pi.
+          # NOTE(review): assumes this list feeds a systemd path setting such as
+          # ReadWritePaths - confirm against the attribute name above this hunk.
+          "-/home/${instanceCfg.user}/.pi"
         ];
         NoNewPrivileges = true;
 
modules/nix-flake-updater/README.md
@@ -1,6 +1,7 @@
 # Nix Flake Updater Module
 
-Automated NixOS module for updating `flake.lock` with build verification and notifications.
+Automated NixOS module for updating `flake.lock` with build verification, notifications,
+and optional AI-powered auto-fix.
 
 ## Overview
 
@@ -8,8 +9,10 @@ This module provides automated, unattended flake.lock updates that:
 
 - Run on a configurable schedule via systemd timers
 - Verify builds across multiple systems before committing
+- Optionally use a coding agent (pi) to auto-fix build failures
 - Create git branches for review workflow
 - Send notifications via ntfy
+- Support multiple named instances (e.g., daily, biweekly)
 - Support dry-run mode for testing
 
 ## Files
@@ -19,67 +22,126 @@ This module provides automated, unattended flake.lock updates that:
 
 ## Usage
 
-Import the module and configure:
+Import the module and configure instances:
 
 ```nix
 {
-  imports = [
-    ../../modules/nix-flake-updater
-  ];
+  imports = [ ../../modules/nix-flake-updater ];
 
   services.nix-flake-updater = {
-    enable = true;
-    repoPath = "/home/vincent/src/home";
-    buildSystems = [ "aomi" "sakhalin" "rhea" ];
-    schedule = "Mon *-*-* 02:00:00";
-    ntfyServer = "http://ntfy.sbr.pm";
-    user = "vincent";
+    # Bi-weekly full update with auto-fix
+    biweekly = {
+      enable = true;
+      repoPath = "/home/vincent/src/home";
+      buildSystems = [ "okinawa" "kyushu" "rhea" "athena" ];
+      schedule = "Sun *-*-1..7,15..21 02:00:00";
+      ntfyServer = "https://ntfy.sbr.pm";
+      user = "vincent";
+
+      autoFix = {
+        enable = true;
+        command = "pir";
+        extraArgs = [ "--model" "claude-opus-4-6" "--no-session" "--no-extensions" "--no-themes" ];
+        maxAttempts = 3;
+      };
+    };
+
+    # Daily update for specific inputs with auto-merge
+    daily = {
+      enable = true;
+      repoPath = "/home/vincent/src/home";
+      flakeInputs = [ "chick-group" "chapeau-rouge" ];
+      autoMerge = true;
+      buildSystems = [ "okinawa" "kyushu" ];
+      schedule = "*-*-* 04:00:00";
+      user = "vincent";
+    };
   };
 }
 ```
 
+## Auto-Fix
+
+When `autoFix.enable = true`, build failures trigger a coding agent to attempt fixes:
+
+1. Build error stderr is captured (last 200 lines)
+2. The agent is invoked in non-interactive mode (`-p`) with the error context
+3. The agent reads AGENTS.md files in the repo for channel-awareness rules
+4. If the fix works, it's committed separately from the flake.lock update
+5. A regression check rebuilds all hosts after fixes are applied
+6. Up to `maxAttempts` retries per failing host
+
+### Agent Authentication
+
+The module default is `pi`. The `pir` command uses `passage` for API key retrieval; for
+headless systemd execution, ensure the password store is accessible without interactive auth,
+or use `autoFix.envFile` to source credentials:
+
+```nix
+autoFix = {
+  enable = true;
+  envFile = config.age.secrets."vertex-ai-credentials".path;
+};
+```
+
+## Manual Trigger
+
+```bash
+# Run the bi-weekly update manually
+sudo systemctl start nix-flake-updater-biweekly
+
+# View logs
+journalctl -u nix-flake-updater-biweekly -f
+
+# Check timer schedule
+systemctl list-timers 'nix-flake-updater-*'
+```
+
+## Configuration Options
+
+### Core
+- `enable` - Enable this instance
+- `repoPath` - Git repository path
+- `buildSystems` - List of NixOS systems to build for verification
+- `schedule` - Systemd OnCalendar schedule
+- `flakeInputs` - Specific inputs to update (empty = all)
+- `user` - User to run as (needs git push access)
+
+### Git
+- `gitRemote` - Remote to push to (default: `origin`)
+- `mainBranch` - Main branch name (default: `main`)
+- `branchPrefix` - Prefix for update branches
+- `autoMerge` - Auto-merge to main on success (default: `false`)
+
+### Notifications
+- `ntfyServer` / `ntfyTopic` - ntfy notification settings
+- `ntfyTokenFile` - Authentication token file
+- `inboxOrg` - Org-mode inbox for TODO entries on failure
+
+### Auto-Fix
+- `autoFix.enable` - Enable AI-powered auto-fix
+- `autoFix.command` - Agent command (default: `pi`)
+- `autoFix.extraArgs` - Extra agent CLI arguments
+- `autoFix.maxAttempts` - Max retries per host (default: `3`)
+- `autoFix.envFile` / `autoFix.environment` - Credentials file to source / environment variables to set for the agent
+
+### Other
+- `dryRun` - Don't push to remote
+- `randomizedDelaySec` - Random delay before start
+
+## Architecture
+
+The update script:
+1. Creates an isolated git worktree from main
+2. Updates flake.lock (all or specific inputs)
+3. Builds all specified systems
+4. On failure with auto-fix: invokes coding agent → rebuilds → regression check
+5. Commits flake.lock update + any fixes (separate commits)
+6. Pushes branch (or auto-merges to main)
+7. Sends ntfy notification with results
+8. Cleans up worktree
+
 ## Documentation
 
 See:
 - `/docs/nix-flake-updater-guide.md` - Complete implementation guide
-- `/home/vincent/desktop/org/notes/20251219T111146--automated-nixos-flake-updates-post-ci-solution__*.org` - Design notes
-
-## Architecture
-
-The module creates a systemd timer that:
-1. Pulls latest main branch
-2. Creates update branch
-3. Runs `nix flake update`
-4. Builds specified systems for verification
-5. Commits and pushes if builds succeed
-6. Sends ntfy notification with results
-
-## Configuration Options
-
-- `enable` - Enable the service
-- `repoPath` - Git repository path
-- `buildSystems` - List of systems to build for verification
-- `schedule` - Systemd OnCalendar schedule
-- `ntfyServer` / `ntfyTopic` - Notification settings
-- `gitRemote` - Remote to push to
-- `user` - User to run as (needs git push access)
-- `dryRun` - Test mode (don't push)
-
-## Example Deployment
-
-```bash
-# Build configuration
-make host/aomi/build
-
-# Deploy
-make host/aomi/switch
-
-# Verify timer
-systemctl list-timers nix-flake-updater
-
-# Test manually
-sudo systemctl start nix-flake-updater
-
-# View logs
-journalctl -u nix-flake-updater -f
-```
systems/common/AGENTS.md
@@ -0,0 +1,40 @@
+# Shared NixOS Configuration
+
+Code in this directory is used by **all hosts** across **both nixpkgs channels**
+(unstable and stable/25.11). See `../AGENTS.md` for the full channel map.
+
+## Rules for Modifying Shared Code
+
+1. **Never assume an option exists on all channels.** Options added or renamed
+   in nixpkgs-unstable won't exist on nixpkgs-25.11 (and vice versa for
+   backports). When in doubt, check the option exists before using it.
+
+2. **Use conditional patterns** when a change is channel-dependent:
+   ```nix
+   # Check if an option exists before using it
+   lib.optionalAttrs (builtins.hasAttr "newOption" options.services.foo) {
+     services.foo.newOption = true;
+   }
+
+   # Version-gated setting
+   lib.mkIf (lib.versionAtLeast config.system.nixos.release "25.11") {
+     services.foo.bar = "new-value";
+   }
+   ```
+
+3. **Prefer host-specific overrides** over conditionals when the change only
+   affects one or two hosts. Edit `systems/<hostname>/extra.nix` instead of
+   modifying shared modules.
+
+4. **Test both channels** after modifying shared code:
+   - Unstable: `make host/okinawa/build`
+   - Stable: `make host/rhea/build`
+
+## Directory Structure
+
+- `base/` — core system settings (nix, boot, locale, networking, security)
+- `desktop/` — desktop environment modules (sway, niri, waybar, etc.)
+- `hardware/` — hardware support (audio, bluetooth, GPU, etc.)
+- `programs/` — system-wide program configurations
+- `services/` — system service configurations
+- `users/` — user account definitions
systems/okinawa/extra.nix
@@ -156,8 +156,9 @@
 
   # Automated flake.lock updates with build verification
   services.nix-flake-updater = {
-    # Weekly updates for all inputs
-    weekly = {
+    # Bi-weekly updates for all inputs with AI-powered auto-fix
+    # Manual trigger: sudo systemctl start nix-flake-updater-biweekly
+    biweekly = {
       enable = true;
       repoPath = "/home/vincent/src/home";
 
@@ -177,8 +178,8 @@
         "aix" # Raspberry Pi 4
       ];
 
-      # Run weekly on Sunday at 2 AM
-      schedule = "Sun *-*-* 02:00:00";
+      # Run bi-weekly: 1st and 3rd Sunday of each month at 2 AM
+      schedule = "Sun *-*-1..7,15..21 02:00:00";
 
       # Notifications via ntfy
       ntfyServer = "https://ntfy.sbr.pm";
@@ -194,6 +195,24 @@
 
       # Add randomized delay to avoid conflicts
       randomizedDelaySec = 1800; # 0-30 min delay
+
+      # AI-powered auto-fix on build failure
+      autoFix = {
+        enable = true;
+        command = "pi";
+        extraArgs = [
+          "--provider"
+          "google-vertex-claude"
+          "--no-session"
+          "--no-themes"
+          "--no-skills"
+        ];
+        maxAttempts = 3;
+        environment = {
+          GOOGLE_CLOUD_PROJECT = "itpc-gcp-pnd-pe-eng-claude";
+          GOOGLE_CLOUD_LOCATION = "us-east5";
+        };
+      };
     };
 
     # Daily automated updates for chick-group and chapeau-rouge with auto-merge
systems/AGENTS.md
@@ -0,0 +1,38 @@
+# Systems Architecture
+
+NixOS system configurations organized by hostname with shared modules in `common/`.
+
+## Channel Groups
+
+Hosts use different nixpkgs channels. Changes to shared code (`common/`)
+**must work across both channels**.
+
+### Unstable (`nixpkgs` — nixos-unstable)
+- **okinawa** — x86_64-linux, laptop / LLM build server (ASUS G14)
+- **kyushu** — x86_64-linux, work laptop
+- **aomi** — x86_64-linux, server
+- **sakhalin** — x86_64-linux, home server
+
+### Stable (`nixpkgs-25_11` — nixos-25.11)
+- **rhea** — aarch64-linux, media server (main)
+- **aion** — aarch64-linux, XMPP / podcast server
+- **athena** — aarch64-linux, Raspberry Pi 4
+- **demeter** — aarch64-linux, Raspberry Pi 4
+- **aix** — aarch64-linux, Raspberry Pi 4
+- **kerkouane** — x86_64-linux, VPS server
+- **carthage** — x86_64-linux, VPS server
+
+## Host File Layout
+
+Each host has a directory in `/systems/<hostname>/` containing:
+- `boot.nix` — bootloader, initrd, kernel modules
+- `hardware.nix` — hardware-specific settings, filesystem mounts
+- `extra.nix` (optional) — additional host-specific NixOS configuration
+- `home.nix` (optional) — host-specific home-manager configuration
+
+## Build Verification
+
+- Test current host: `make build`
+- Test a specific host: `make host/<hostname>/build`
+- Dry-build (eval only): `make host/<hostname>/dry-build`
+- Direct nix: `nix build .#nixosConfigurations.<hostname>.config.system.build.toplevel`
tools/nix-flake-update/default.nix
@@ -7,11 +7,12 @@
   jq,
   curl,
   openssh,
+  pi-coding-agent,
 }:
 
 stdenv.mkDerivation {
   pname = "nix-flake-update";
-  version = "0.1.0";
+  version = "0.2.0";
 
   src = ./.;
 
@@ -32,6 +33,7 @@ stdenv.mkDerivation {
           jq
           curl
           openssh
+          pi-coding-agent
         ]
       }
 
tools/nix-flake-update/nix-flake-update.sh
@@ -1,8 +1,9 @@
 #!/usr/bin/env bash
 set -euo pipefail
 
-# Automated NixOS flake.lock updater
-# This script updates flake.lock, builds verification systems, and pushes to remote
+# Automated NixOS flake.lock updater with optional AI-powered auto-fix
+# This script updates flake.lock, builds verification systems, optionally
+# uses a coding agent to fix build failures, and pushes to remote.
 
 # Configuration from environment or defaults
 REPO_PATH="${REPO_PATH:-/home/vincent/src/home}"
@@ -19,6 +20,13 @@ FLAKE_INPUTS="${FLAKE_INPUTS:-}"  # Space-separated list of inputs to update (em
 AUTO_MERGE="${AUTO_MERGE:-false}"  # If true, merge to main on success
 INBOX_ORG="${INBOX_ORG:-$HOME/desktop/org/inbox.org}"  # Path to org-mode inbox
 
+# Auto-fix configuration
+AUTO_FIX="${AUTO_FIX:-false}"
+AUTO_FIX_COMMAND="${AUTO_FIX_COMMAND:-pir}"
+AUTO_FIX_EXTRA_ARGS="${AUTO_FIX_EXTRA_ARGS:---model claude-opus-4-6 --no-session --no-extensions --no-themes}"
+AUTO_FIX_MAX_ATTEMPTS="${AUTO_FIX_MAX_ATTEMPTS:-3}"
+AUTO_FIX_ENV_FILE="${AUTO_FIX_ENV_FILE:-}"
+
 LOG_FILE="/var/log/nix-flake-updater/$(date +%Y%m%d-%H%M%S).log"
 mkdir -p "$(dirname "$LOG_FILE")"
 
@@ -26,6 +34,11 @@ mkdir -p "$(dirname "$LOG_FILE")"
 WORKTREE_DIR="$HOME/tmp/nix-flake-updater-$(date +%Y%m%d-%H%M%S)"
 mkdir -p "$HOME/tmp"
 
+# Track auto-fix state
+FIXES_APPLIED=0
+FIXED_HOSTS=()
+UNFIXED_HOSTS=()
+
 log() {
   echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "$LOG_FILE"
 }
@@ -37,7 +50,6 @@ notify() {
   local tags="$4"
 
   if [ -n "$NTFY_TOKEN_FILE" ] && [ -f "$NTFY_TOKEN_FILE" ]; then
-    # Use authentication token
     curl -s \
       -H "Authorization: Bearer $(tr -d '\n' < "$NTFY_TOKEN_FILE")" \
       -H "Title: $title" \
@@ -46,7 +58,6 @@ notify() {
       -d "$message" \
       "$NTFY_SERVER/$NTFY_TOPIC" || true
   else
-    # No authentication
     curl -s \
       -H "Title: $title" \
       -H "Priority: $priority" \
@@ -63,7 +74,6 @@ add_todo_to_inbox() {
   if [ -f "$INBOX_ORG" ]; then
     log "Adding TODO to $INBOX_ORG"
 
-    # Capture last 30 lines of the log file for quick diagnosis
     local log_tail=""
     if [ -f "$LOG_FILE" ]; then
       log_tail=$(tail -30 "$LOG_FILE")
@@ -109,18 +119,17 @@ cleanup() {
 
   if [ $exit_code -ne 0 ]; then
     log "ERROR: Update process failed with exit code $exit_code"
-    
-    # Add TODO to inbox on failure
+
     local input_desc="all inputs"
     if [ -n "$FLAKE_INPUTS" ]; then
       input_desc="inputs: $FLAKE_INPUTS"
     fi
-    
+
     add_todo_to_inbox "Fix flake update failure" \
       "Flake update failed for $input_desc.
 Build systems: $BUILD_SYSTEMS
 Auto-merge: $AUTO_MERGE"
-    
+
     notify "high" "❌ Flake Update Failed" \
       "Build failed for $input_desc. TODO added to inbox. See logs: $LOG_FILE" \
       "warning,flake"
@@ -129,6 +138,223 @@ Auto-merge: $AUTO_MERGE"
 
 trap cleanup EXIT
 
+# Build a single host, streaming its stderr to the terminal and the log file.
+# Arguments: $1 = host name (an attribute of nixosConfigurations)
+# Returns 0 on success, 1 on failure.
+# Sets BUILD_ERROR to the last 200 lines of stderr on failure.
+build_host() {
+  local system="$1"
+  local error_file
+  error_file=$(mktemp)
+
+  log "Building system: $system"
+  # stderr goes through a real pipeline instead of a process substitution:
+  # the shell waits for every pipeline member, so both tee processes have
+  # flushed before $error_file is read below. stdout is parked on fd 3 and
+  # restored so it bypasses the pipe. The script-wide `set -o pipefail` is
+  # what lets a failing `nix build` fail the whole pipeline.
+  if { nix build ".#nixosConfigurations.$system.config.system.build.toplevel" \
+         --no-link \
+         --print-build-logs 2>&1 1>&3 \
+       | tee "$error_file" | tee -a "$LOG_FILE" >&2; } 3>&1; then
+    log "✓ $system built successfully"
+    rm -f "$error_file"
+    return 0
+  else
+    log "✗ $system build failed"
+    BUILD_ERROR=$(tail -200 "$error_file")
+    rm -f "$error_file"
+    return 1
+  fi
+}
+
+# Build every host listed in BUILD_SYSTEMS and sort the results.
+# Outputs: FAILED_SYSTEMS and PASSED_SYSTEMS, each a space-separated host list
+# (an empty string when no host landed in that bucket).
+build_all_systems() {
+  local -a broken=() healthy=()
+
+  for system in $BUILD_SYSTEMS; do
+    if ! build_host "$system"; then
+      broken+=("$system")
+      continue
+    fi
+    healthy+=("$system")
+  done
+
+  # "${arr[*]}" joins the elements with a single space (first IFS character);
+  # ":-" keeps the expansion of an empty array safe under `set -u`.
+  FAILED_SYSTEMS="${broken[*]:-}"
+  PASSED_SYSTEMS="${healthy[*]:-}"
+}
+
+# Try to auto-fix a build failure using a coding agent.
+# Arguments: $1 = system name, $2 = build error text
+# Reads: AUTO_FIX, AUTO_FIX_ENV_FILE, AUTO_FIX_COMMAND, AUTO_FIX_EXTRA_ARGS
+# Returns 0 if the agent ran and exited successfully (rebuild to see whether it
+# actually fixed anything), 1 if auto-fix is disabled or the agent failed.
+try_auto_fix() {
+  local system="$1"
+  local error_text="$2"
+
+  if [ "$AUTO_FIX" != "true" ] && [ "$AUTO_FIX" != "1" ]; then
+    return 1
+  fi
+
+  # Source env file if provided (for API keys, credentials)
+  if [ -n "$AUTO_FIX_ENV_FILE" ] && [ -f "$AUTO_FIX_ENV_FILE" ]; then
+    log "Sourcing auto-fix environment: $AUTO_FIX_ENV_FILE"
+    # allexport: plain KEY=value lines must be exported, otherwise they stay
+    # shell-local and never reach the agent process.
+    set -a
+    # shellcheck disable=SC1090
+    source "$AUTO_FIX_ENV_FILE"
+    set +a
+  elif [ -n "$AUTO_FIX_ENV_FILE" ]; then
+    log "WARNING: auto-fix environment file not found: $AUTO_FIX_ENV_FILE"
+  fi
+
+  log "Running auto-fix agent for $system (command: $AUTO_FIX_COMMAND)"
+
+  # Build the prompt
+  local prompt
+  prompt="The NixOS build for host '${system}' failed after a flake.lock update.
+
+Working directory: $(pwd)
+
+Build command that failed:
+  nix build .#nixosConfigurations.${system}.config.system.build.toplevel
+
+Build error output (last 200 lines):
+\`\`\`
+${error_text}
+\`\`\`
+
+Fix the Nix configuration files to resolve this build error.
+
+Rules:
+- Do NOT modify flake.lock or flake.nix
+- Only edit .nix configuration files
+- Read the AGENTS.md files in the relevant directories for channel-awareness rules
+- If the fix is in a shared module (systems/common/ or home/common/), ensure it works across both nixpkgs channels
+- Prefer host-specific overrides (systems/${system}/extra.nix) over modifying shared code when possible
+- After making changes, verify with: nix build .#nixosConfigurations.${system}.config.system.build.toplevel --no-link"
+
+  # Run the agent. The command and extra args are intentionally unquoted so
+  # they word-split into separate argv entries; with the script-wide pipefail
+  # the pipeline status reflects the agent's exit code, not tee's.
+  # shellcheck disable=SC2086
+  if $AUTO_FIX_COMMAND -p $AUTO_FIX_EXTRA_ARGS "$prompt" 2>&1 | tee -a "$LOG_FILE"; then
+    log "Auto-fix agent completed for $system"
+    return 0
+  else
+    log "Auto-fix agent failed/crashed for $system"
+    return 1
+  fi
+}
+
+# Run the auto-fix loop for all failed systems.
+# Attempts to fix each failing host, with retries and regression checking.
+#
+# Arguments: $1 = space-separated list of hosts whose build failed
+# Reads:     BUILD_SYSTEMS, AUTO_FIX_MAX_ATTEMPTS, BUILD_ERROR (set by build_host)
+# Updates:   FIXES_APPLIED (agent runs that were followed by a successful build),
+#            FIXED_HOSTS (hosts that build again), UNFIXED_HOSTS (hosts still failing)
+auto_fix_loop() {
+  local remaining_failures="$1"
+  local all_systems_list="$BUILD_SYSTEMS"
+  local round=0
+
+  # Each round visits every still-failing host once, so a host sees at most
+  # AUTO_FIX_MAX_ATTEMPTS agent runs in this loop.
+  while [ -n "$remaining_failures" ] && [ $round -lt "$AUTO_FIX_MAX_ATTEMPTS" ]; do
+    round=$((round + 1))
+    log "=== Auto-fix round $round/$AUTO_FIX_MAX_ATTEMPTS ==="
+
+    local still_failing=""
+
+    for system in $remaining_failures; do
+      # Try to build first (a previous fix might have resolved this too).
+      # On failure this also refreshes BUILD_ERROR for this specific host.
+      if build_host "$system"; then
+        log "✓ $system now builds (fixed by previous change)"
+        # Recorded as fixed, but FIXES_APPLIED is not incremented on this path.
+        FIXED_HOSTS+=("$system")
+        continue
+      fi
+
+      # Run the agent
+      if try_auto_fix "$system" "$BUILD_ERROR"; then
+        # Check if the fix worked
+        if build_host "$system"; then
+          log "✓ $system fixed by auto-fix agent"
+          FIXES_APPLIED=$((FIXES_APPLIED + 1))
+          FIXED_HOSTS+=("$system")
+        else
+          log "✗ $system still failing after auto-fix attempt"
+          still_failing="$still_failing $system"
+        fi
+      else
+        log "✗ Auto-fix agent failed to run for $system"
+        still_failing="$still_failing $system"
+      fi
+    done
+
+    # Drop the single leading space left by the string accumulation above.
+    remaining_failures="${still_failing# }"
+
+    if [ -z "$remaining_failures" ]; then
+      log "All failures resolved after $round round(s)"
+      break
+    fi
+  done
+
+  # Record unfixed hosts
+  for system in $remaining_failures; do
+    UNFIXED_HOSTS+=("$system")
+  done
+
+  # Regression check: rebuild every system in BUILD_SYSTEMS, but only when at
+  # least one agent fix succeeded (otherwise nothing new can have broken).
+  if [ $FIXES_APPLIED -gt 0 ]; then
+    log "=== Regression check: rebuilding all systems ==="
+    local regression_failures=""
+
+    for system in $all_systems_list; do
+      if ! build_host "$system"; then
+        # Check if this was already a known failure
+        local was_fixed=false
+        # ":-" yields one empty string for an empty array, keeping the
+        # expansion safe under `set -u`.
+        for fixed in "${FIXED_HOSTS[@]:-}"; do
+          if [ "$system" = "$fixed" ]; then
+            was_fixed=true
+            break
+          fi
+        done
+
+        if [ "$was_fixed" = true ]; then
+          log "⚠ Regression: $system was fixed but now fails again"
+          regression_failures="$regression_failures $system"
+        else
+          # Check if this is a new regression (was passing before).
+          # Hosts already in UNFIXED_HOSTS are known failures, not regressions.
+          local is_new_regression=true
+          for unfixed in "${UNFIXED_HOSTS[@]:-}"; do
+            if [ "$system" = "$unfixed" ]; then
+              is_new_regression=false
+              break
+            fi
+          done
+
+          if [ "$is_new_regression" = true ]; then
+            log "⚠ Regression: $system was passing but now fails after auto-fix changes"
+            regression_failures="$regression_failures $system"
+          fi
+        fi
+      fi
+    done
+
+    regression_failures="${regression_failures# }"
+
+    if [ -n "$regression_failures" ]; then
+      log "Regressions detected: $regression_failures"
+      log "Attempting to fix regressions..."
+
+      # One more attempt with regression context (a single agent run per host;
+      # the other hosts are not rebuilt again afterwards).
+      # NOTE(review): a regressed host is never removed from FIXED_HOSTS, so it
+      # can end up listed in both FIXED_HOSTS and UNFIXED_HOSTS.
+      for system in $regression_failures; do
+        if ! build_host "$system"; then
+          local regression_prompt_extra="IMPORTANT: This is a REGRESSION. Host '$system' was building successfully before auto-fix changes were made to fix other hosts. Your previous fixes broke this host. Fix it WITHOUT breaking the other hosts."
+
+          # The regression notice is appended to the error text, which
+          # try_auto_fix embeds in the agent prompt.
+          if try_auto_fix "$system" "$BUILD_ERROR
+$regression_prompt_extra"; then
+            if build_host "$system"; then
+              log "✓ Regression fixed for $system"
+              FIXES_APPLIED=$((FIXES_APPLIED + 1))
+            else
+              log "✗ Could not fix regression for $system"
+              UNFIXED_HOSTS+=("$system")
+            fi
+          else
+            UNFIXED_HOSTS+=("$system")
+          fi
+        fi
+      done
+    fi
+  fi
+}
+
+# =============================================================================
+# Main script
+# =============================================================================
+
 log "Starting flake update process"
 cd "$REPO_PATH"
 
@@ -158,7 +384,7 @@ git checkout -b "$BRANCH_NAME"
 # Save old flake.lock for before/after comparison
 OLD_FLAKE_LOCK=$(cat flake.lock)
 
-# Update flake.lock (work in worktree, flake is at root)
+# Update flake.lock
 log "Updating flake.lock"
 if [ -n "$FLAKE_INPUTS" ]; then
   log "Updating specific inputs: $FLAKE_INPUTS"
@@ -179,32 +405,118 @@ if ! git diff --quiet flake.lock; then
   log "Flake input changes:"
   git diff flake.lock | grep -E '^\+.*"(narHash|rev)"' | head -20 | tee -a "$LOG_FILE"
 
-  # Build test systems (build from worktree)
-  BUILD_SUCCESS=true
-  for system in $BUILD_SYSTEMS; do
-    log "Building system: $system"
-    if nix build ".#nixosConfigurations.$system.config.system.build.toplevel" \
-       --no-link \
-       --print-build-logs 2>&1 | tee -a "$LOG_FILE"; then
-      log "✓ $system built successfully"
-    else
-      log "✗ $system build failed"
-      BUILD_SUCCESS=false
-      break
-    fi
-  done
+  # Build all systems
+  build_all_systems
 
-  if [ "$BUILD_SUCCESS" = true ]; then
-    # Commit changes (we're already in WORKTREE_DIR)
+  if [ -n "$FAILED_SYSTEMS" ]; then
+    log "Build failures detected: $FAILED_SYSTEMS"
+
+    if [ "$AUTO_FIX" = "true" ] || [ "$AUTO_FIX" = "1" ]; then
+      # Commit flake.lock first so the agent works on a clean tree
+      git add flake.lock
+
+      input_desc="all inputs"
+      if [ -n "$FLAKE_INPUTS" ]; then
+        input_desc="$FLAKE_INPUTS"
+      fi
+
+      CHANGES=$(jq -n --argjson old "$OLD_FLAKE_LOCK" --argjson new "$(cat flake.lock)" -r '
+        def rev_map:
+          .nodes | to_entries
+          | map(select(.key != "root" and .value.locked != null))
+          | map({(.key): (.value.locked.rev // .value.locked.narHash // "unknown")})
+          | add // {};
+        ($old | rev_map) as $o |
+        ($new | rev_map) as $n |
+        [$n | to_entries[] | select($o[.key] != null and $o[.key] != .value)] |
+        group_by({old: $o[.key], new: .value}) |
+        map({
+          names: (map(.key) | join(", ")),
+          old: $o[.[0].key][0:12],
+          new: .[0].value[0:12]
+        }) |
+        map("- \(.names): \(.old) → \(.new)") |
+        join("\n")
+      ' 2>/dev/null || echo "Updated flake inputs")
+
+      git -c user.signingkey=/home/vincent/.ssh/id_ed25519 commit -m "chore(flake): update $input_desc
+
+$CHANGES"
+
+      # Run auto-fix loop
+      auto_fix_loop "$FAILED_SYSTEMS"
+
+      # Commit any fixes the agent made, as a commit separate from the
+      # flake.lock update. `git add -A` stages every change in the worktree,
+      # including edits left behind by agent attempts that did not succeed.
+      if [ "$FIXES_APPLIED" -gt 0 ]; then
+        git add -A
+        if ! git diff --cached --quiet; then
+          local_fixed_list="${FIXED_HOSTS[*]:-}"
+          # FIXES_APPLIED counts successful fixes, not agent attempts, so the
+          # commit message labels it accordingly.
+          git -c user.signingkey=/home/vincent/.ssh/id_ed25519 commit -m "fix(nix): auto-fix build errors
+
+Fixed hosts: ${local_fixed_list}
+Agent: ${AUTO_FIX_COMMAND}
+Fixes applied: ${FIXES_APPLIED}"
+        fi
+      fi
+
+      # Determine overall result
+      if [ ${#UNFIXED_HOSTS[@]} -gt 0 ]; then
+        log "Auto-fix partially succeeded. Unfixed: ${UNFIXED_HOSTS[*]}"
+
+        add_todo_to_inbox "Flake update: ${#UNFIXED_HOSTS[@]} hosts still failing" \
+          "Auto-fix resolved ${#FIXED_HOSTS[@]} host(s) but could not fix: ${UNFIXED_HOSTS[*]}
+Build systems: $BUILD_SYSTEMS
+Agent: $AUTO_FIX_COMMAND"
+
+        # Still push the branch with partial fixes
+        if [ "$DRY_RUN" != "false" ] && [ "$DRY_RUN" != "" ] && [ "$DRY_RUN" != "0" ]; then
+          log "DRY RUN: Would push partial-fix branch"
+        else
+          git push "$GIT_REMOTE" "$BRANCH_NAME"
+        fi
+
+        notify "high" "⚠️ Flake Updated (${#UNFIXED_HOSTS[@]} hosts still failing)" \
+          "Auto-fixed: ${FIXED_HOSTS[*]:-none}. Still failing: ${UNFIXED_HOSTS[*]}. Branch: $BRANCH_NAME" \
+          "warning,flake,robot"
+
+        exit 1
+      else
+        log "All failures resolved by auto-fix"
+        # Fall through to normal push/merge logic below
+      fi
+    else
+      # No auto-fix — original behavior
+      log "Build failed, not committing changes"
+
+      input_desc="all inputs"
+      if [ -n "$FLAKE_INPUTS" ]; then
+        input_desc="$FLAKE_INPUTS"
+      fi
+
+      add_todo_to_inbox "Flake update build failure" \
+        "Build failed after updating $input_desc.
+Build systems tested: $BUILD_SYSTEMS
+Auto-merge: $AUTO_MERGE"
+
+      notify "high" "❌ Flake Update Build Failed" \
+        "Builds failed for updated $input_desc. TODO added to inbox. Check logs: $LOG_FILE" \
+        "x,flake,warning"
+
+      exit 1
+    fi
+  fi
+
+  # If we get here, all builds passed (either initially or after auto-fix)
+
+  # Commit flake.lock if not already committed (no auto-fix path)
+  if ! git diff --quiet flake.lock || ! git diff --cached --quiet flake.lock; then
     git add flake.lock
 
-    # Generate commit message with changed inputs
     input_desc="all inputs"
     if [ -n "$FLAKE_INPUTS" ]; then
       input_desc="$FLAKE_INPUTS"
     fi
-    
-    # Generate before/after changelog, deduplicated by rev transition
+
     CHANGES=$(jq -n --argjson old "$OLD_FLAKE_LOCK" --argjson new "$(cat flake.lock)" -r '
       def rev_map:
         .nodes | to_entries
@@ -231,77 +543,58 @@ $CHANGES
 Built systems: $BUILD_SYSTEMS"
 
     git -c user.signingkey=/home/vincent/.ssh/id_ed25519 commit -m "$COMMIT_MSG"
+  fi
 
-    if [ "$DRY_RUN" != "false" ] && [ "$DRY_RUN" != "" ] && [ "$DRY_RUN" != "0" ]; then
-      log "DRY RUN: Would push to $GIT_REMOTE/$BRANCH_NAME"
-      notify "low" "🧪 Flake Update (Dry Run)" \
-        "Branch $BRANCH_NAME created locally. All builds passed: $BUILD_SYSTEMS" \
-        "test_tube,flake"
-    elif [ "$AUTO_MERGE" = "true" ] || [ "$AUTO_MERGE" = "1" ]; then
-      # Auto-merge: rebase onto main and push directly
-      log "Auto-merge enabled: rebasing onto $GIT_REMOTE/$MAIN_BRANCH"
-      
-      # Fetch latest main
-      git fetch "$GIT_REMOTE" "$MAIN_BRANCH"
-      
-      # Rebase our commit onto main
-      if git rebase "$GIT_REMOTE/$MAIN_BRANCH"; then
-        log "Rebase successful, pushing to $GIT_REMOTE/$MAIN_BRANCH"
-        
-        # Push directly to main
-        git push "$GIT_REMOTE" "HEAD:$MAIN_BRANCH"
-        
-        # Notify success
-        notify "default" "✅ Flake Auto-Updated & Merged" \
-          "Updates for $input_desc merged to $MAIN_BRANCH. All builds passed: $BUILD_SYSTEMS" \
-          "white_check_mark,flake,merged"
-        
-        log "SUCCESS: Flake updated and merged to $MAIN_BRANCH"
-      else
-        log "ERROR: Rebase failed, main branch may have moved"
-        git rebase --abort || true
-        
-        add_todo_to_inbox "Flake update rebase conflict" \
-          "Auto-merge failed due to rebase conflict.
+  # Determine notification details
+  fix_note=""
+  if [ $FIXES_APPLIED -gt 0 ]; then
+    fix_note=" ($FIXES_APPLIED auto-fix(es) applied: ${FIXED_HOSTS[*]})"
+  fi
+
+  if [ "$DRY_RUN" != "false" ] && [ "$DRY_RUN" != "" ] && [ "$DRY_RUN" != "0" ]; then
+    log "DRY RUN: Would push to $GIT_REMOTE/$BRANCH_NAME"
+    notify "low" "🧪 Flake Update (Dry Run)" \
+      "Branch $BRANCH_NAME created locally. All builds passed: $BUILD_SYSTEMS${fix_note}" \
+      "test_tube,flake"
+  elif [ "$AUTO_MERGE" = "true" ] || [ "$AUTO_MERGE" = "1" ]; then
+    # Auto-merge: rebase onto main and push directly
+    log "Auto-merge enabled: rebasing onto $GIT_REMOTE/$MAIN_BRANCH"
+
+    git fetch "$GIT_REMOTE" "$MAIN_BRANCH"
+
+    if git rebase "$GIT_REMOTE/$MAIN_BRANCH"; then
+      log "Rebase successful, pushing to $GIT_REMOTE/$MAIN_BRANCH"
+      git push "$GIT_REMOTE" "HEAD:$MAIN_BRANCH"
+
+      notify "default" "✅ Flake Auto-Updated & Merged" \
+        "Updates merged to $MAIN_BRANCH. All builds passed: $BUILD_SYSTEMS${fix_note}" \
+        "white_check_mark,flake,merged"
+
+      log "SUCCESS: Flake updated and merged to $MAIN_BRANCH"
+    else
+      log "ERROR: Rebase failed, main branch may have moved"
+      git rebase --abort || true
+
+      add_todo_to_inbox "Flake update rebase conflict" \
+        "Auto-merge failed due to rebase conflict.
 Inputs: $input_desc
 Branch: $BRANCH_NAME (in worktree, needs manual rebase)"
-        
-        notify "high" "⚠️ Flake Update Rebase Failed" \
-          "Could not rebase $input_desc onto $MAIN_BRANCH. TODO added to inbox." \
-          "warning,flake,conflict"
-        exit 1
-      fi
-    else
-      # Branch mode: push to feature branch
-      log "Pushing to $GIT_REMOTE/$BRANCH_NAME"
-      git push "$GIT_REMOTE" "$BRANCH_NAME"
 
-      # Notify success
-      notify "default" "✅ Flake Updated Successfully" \
-        "Branch $BRANCH_NAME created and pushed. All builds passed: $BUILD_SYSTEMS" \
-        "white_check_mark,flake"
-
-      log "SUCCESS: Flake updated and pushed to $BRANCH_NAME"
+      notify "high" "⚠️ Flake Update Rebase Failed" \
+        "Could not rebase onto $MAIN_BRANCH. TODO added to inbox." \
+        "warning,flake,conflict"
+      exit 1
     fi
-
   else
-    log "Build failed, not committing changes"
-    
-    input_desc="all inputs"
-    if [ -n "$FLAKE_INPUTS" ]; then
-      input_desc="$FLAKE_INPUTS"
-    fi
-    
-    add_todo_to_inbox "Flake update build failure" \
-      "Build failed after updating $input_desc.
-Build systems tested: $BUILD_SYSTEMS
-Auto-merge: $AUTO_MERGE"
-    
-    notify "high" "❌ Flake Update Build Failed" \
-      "Builds failed for updated $input_desc. TODO added to inbox. Check logs: $LOG_FILE" \
-      "x,flake,warning"
+    # Branch mode: push to feature branch
+    log "Pushing to $GIT_REMOTE/$BRANCH_NAME"
+    git push "$GIT_REMOTE" "$BRANCH_NAME"
 
-    exit 1
+    notify "default" "✅ Flake Updated Successfully" \
+      "Branch $BRANCH_NAME pushed. All builds passed: $BUILD_SYSTEMS${fix_note}" \
+      "white_check_mark,flake"
+
+    log "SUCCESS: Flake updated and pushed to $BRANCH_NAME"
   fi
 
 else