flake-update-20260505
  1#!/usr/bin/env bash
  2set -euo pipefail
  3
  4# Automated NixOS flake.lock updater with optional AI-powered auto-fix
  5# This script updates flake.lock, builds verification systems, optionally
  6# uses a coding agent to fix build failures, and pushes to remote.
  7
  8# Configuration from environment or defaults
  9REPO_PATH="${REPO_PATH:-/home/vincent/src/home}"
 10FLAKE_PATH="${FLAKE_PATH:-$REPO_PATH}"
 11GIT_REMOTE="${GIT_REMOTE:-origin}"
 12MAIN_BRANCH="${MAIN_BRANCH:-main}"
 13BRANCH_PREFIX="${BRANCH_PREFIX:-flake-update-}"
 14NTFY_TOPIC="${NTFY_TOPIC:-nix-updates}"
 15NTFY_SERVER="${NTFY_SERVER:-https://ntfy.sh}"
 16NTFY_TOKEN_FILE="${NTFY_TOKEN_FILE:-}"
 17BUILD_SYSTEMS="${BUILD_SYSTEMS:-}"
 18DRY_RUN="${DRY_RUN:-false}"
 19FLAKE_INPUTS="${FLAKE_INPUTS:-}"  # Space-separated list of inputs to update (empty = all)
 20AUTO_MERGE="${AUTO_MERGE:-false}"  # If true, merge to main on success
 21INBOX_ORG="${INBOX_ORG:-$HOME/desktop/org/inbox.org}"  # Path to org-mode inbox
 22
 23# Auto-fix configuration
 24AUTO_FIX="${AUTO_FIX:-false}"
 25AUTO_FIX_COMMAND="${AUTO_FIX_COMMAND:-pir}"
 26AUTO_FIX_EXTRA_ARGS="${AUTO_FIX_EXTRA_ARGS:---model claude-opus-4-6 --no-session --no-extensions --no-themes}"
 27AUTO_FIX_MAX_ATTEMPTS="${AUTO_FIX_MAX_ATTEMPTS:-3}"
 28AUTO_FIX_ENV_FILE="${AUTO_FIX_ENV_FILE:-}"
 29
 30LOG_FILE="/var/log/nix-flake-updater/$(date +%Y%m%d-%H%M%S).log"
 31mkdir -p "$(dirname "$LOG_FILE")"
 32
 33# Worktree directory for isolated work (use ~/tmp to avoid tmpfs/RAM)
 34WORKTREE_DIR="$HOME/tmp/nix-flake-updater-$(date +%Y%m%d-%H%M%S)"
 35mkdir -p "$HOME/tmp"
 36
 37# Track auto-fix state
 38FIXES_APPLIED=0
 39FIXED_HOSTS=()
 40UNFIXED_HOSTS=()
 41
 42log() {
 43  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "$LOG_FILE"
 44}
 45
 46notify() {
 47  local priority="$1"
 48  local title="$2"
 49  local message="$3"
 50  local tags="$4"
 51
 52  if [ -n "$NTFY_TOKEN_FILE" ] && [ -f "$NTFY_TOKEN_FILE" ]; then
 53    curl -s \
 54      -H "Authorization: Bearer $(tr -d '\n' < "$NTFY_TOKEN_FILE")" \
 55      -H "Title: $title" \
 56      -H "Priority: $priority" \
 57      -H "Tags: $tags" \
 58      -d "$message" \
 59      "$NTFY_SERVER/$NTFY_TOPIC" || true
 60  else
 61    curl -s \
 62      -H "Title: $title" \
 63      -H "Priority: $priority" \
 64      -H "Tags: $tags" \
 65      -d "$message" \
 66      "$NTFY_SERVER/$NTFY_TOPIC" || true
 67  fi
 68}
 69
 70add_todo_to_inbox() {
 71  local title="$1"
 72  local details="$2"
 73
 74  if [ -f "$INBOX_ORG" ]; then
 75    log "Adding TODO to $INBOX_ORG"
 76
 77    local log_tail=""
 78    if [ -f "$LOG_FILE" ]; then
 79      log_tail=$(tail -30 "$LOG_FILE")
 80    fi
 81
 82    cat >> "$INBOX_ORG" <<EOF
 83* TODO $title
 84  SCHEDULED: <$(date '+%Y-%m-%d %a')>
 85  :PROPERTIES:
 86  :CREATED: [$(date '+%Y-%m-%d %a %H:%M')]
 87  :END:
 88
 89$details
 90
 91Log file: $LOG_FILE
 92
 93#+begin_src text
 94$log_tail
 95#+end_src
 96EOF
 97  else
 98    log "WARNING: Inbox file not found: $INBOX_ORG"
 99  fi
100}
101
# EXIT trap handler: remove the temporary worktree and the local update
# branch, and report a failure when the script exits with non-zero status.
# Globals: WORKTREE_DIR, REPO_PATH, LOG_FILE, FLAKE_INPUTS, BUILD_SYSTEMS,
#          AUTO_MERGE (read); BRANCH_NAME (read, may be unset)
# Every step is best-effort: with `set -e` active, one failing step would
# otherwise abort the handler before the failure notification is sent.
cleanup() {
  local exit_code=$?  # must stay first: status the script is exiting with

  # Clean up worktree if it exists
  if [ -d "$WORKTREE_DIR" ]; then
    log "Cleaning up worktree: $WORKTREE_DIR" || true
    # Step out of the directory that is about to be deleted.
    cd "$REPO_PATH" || true
    git -C "$REPO_PATH" worktree remove --force "$WORKTREE_DIR" 2>&1 \
      | tee -a "$LOG_FILE" || true
    rm -rf -- "$WORKTREE_DIR" || true
  fi

  # Clean up the update branch (must happen after worktree removal).
  # git -C makes the lookup independent of the current directory.
  if [ -n "${BRANCH_NAME:-}" ] \
    && git -C "$REPO_PATH" show-ref --verify --quiet "refs/heads/$BRANCH_NAME" 2>/dev/null; then
    log "Cleaning up branch: $BRANCH_NAME" || true
    git -C "$REPO_PATH" branch -D "$BRANCH_NAME" 2>&1 | tee -a "$LOG_FILE" || true
  fi

  if [ "$exit_code" -ne 0 ]; then
    log "ERROR: Update process failed with exit code $exit_code" || true

    local input_desc="all inputs"
    if [ -n "$FLAKE_INPUTS" ]; then
      input_desc="inputs: $FLAKE_INPUTS"
    fi

    # A failed inbox write must not suppress the notification below.
    add_todo_to_inbox "Fix flake update failure" \
      "Flake update failed for $input_desc.
Build systems: $BUILD_SYSTEMS
Auto-merge: $AUTO_MERGE" || true

    notify "high" "❌ Flake Update Failed" \
      "Build failed for $input_desc. TODO added to inbox. See logs: $LOG_FILE" \
      "warning,flake" || true
  fi
}

trap cleanup EXIT

# Build a single host, capturing stderr. Returns 0 on success, 1 on failure.
# Sets BUILD_ERROR with captured stderr (last 200 lines) on failure.
# Arguments: $1 - name of the nixosConfigurations entry to build
# Outputs:   the build's stderr is streamed to stderr and appended to
#            $LOG_FILE; its stdout is passed through unchanged.
build_host() {
  local system="$1"
  local error_file

  if ! error_file=$(mktemp); then
    log "$system build failed"
    BUILD_ERROR="could not create a temporary file to capture the build output"
    return 1
  fi

  log "Building system: $system"
  # The tee stages run in an ordinary pipeline, so they have finished
  # writing $error_file before it is read below (a process substitution is
  # not waited for and could leave the file incomplete).
  # fd 3 carries the original stdout around the pipe; only stderr is piped.
  # The subshell exits with the status of nix itself, not of the tees.
  if (
    nix build ".#nixosConfigurations.$system.config.system.build.toplevel" \
      --no-link \
      --print-build-logs 2>&1 >&3 3>&- \
      | tee "$error_file" \
      | tee -a "$LOG_FILE" >&2
    exit "${PIPESTATUS[0]}"
  ) 3>&1; then
    log "$system built successfully"
    rm -f "$error_file"
    return 0
  else
    log "$system build failed"
    BUILD_ERROR=$(tail -n 200 "$error_file")
    rm -f "$error_file"
    return 1
  fi
}

# Build all systems, returning list of failures.
# Sets FAILED_SYSTEMS and PASSED_SYSTEMS (both space-separated, in the
# order the hosts appear in BUILD_SYSTEMS).
build_all_systems() {
  # Keep the loop variable local so it does not overwrite a caller's $system.
  local system

  FAILED_SYSTEMS=""
  PASSED_SYSTEMS=""

  # BUILD_SYSTEMS is a whitespace-separated list; splitting is intended.
  # shellcheck disable=SC2086
  for system in $BUILD_SYSTEMS; do
    if build_host "$system"; then
      PASSED_SYSTEMS="${PASSED_SYSTEMS:+$PASSED_SYSTEMS }$system"
    else
      FAILED_SYSTEMS="${FAILED_SYSTEMS:+$FAILED_SYSTEMS }$system"
    fi
  done
}

# Try to auto-fix a build failure using a coding agent.
# Arguments: $1 = system name, $2 = build error text
# Globals:   AUTO_FIX, AUTO_FIX_ENV_FILE, AUTO_FIX_COMMAND,
#            AUTO_FIX_EXTRA_ARGS, LOG_FILE (read)
# Returns 0 if the agent ran (check rebuild to see if it actually fixed it),
# 1 if auto-fix is disabled or the agent exited with an error.
try_auto_fix() {
  local system="$1"
  local error_text="$2"

  if [ "$AUTO_FIX" != "true" ] && [ "$AUTO_FIX" != "1" ]; then
    return 1
  fi

  # Source env file if provided (for API keys, credentials)
  if [ -n "$AUTO_FIX_ENV_FILE" ]; then
    if [ -f "$AUTO_FIX_ENV_FILE" ]; then
      log "Sourcing auto-fix environment: $AUTO_FIX_ENV_FILE"
      # shellcheck disable=SC1090
      source "$AUTO_FIX_ENV_FILE"
    else
      # Make a misconfigured path visible instead of skipping it silently.
      log "WARNING: auto-fix environment file not found: $AUTO_FIX_ENV_FILE"
    fi
  fi

  log "Running auto-fix agent for $system (command: $AUTO_FIX_COMMAND)"

  # Build the prompt
  local prompt
  prompt="The NixOS build for host '${system}' failed after a flake.lock update.

Working directory: $(pwd)

Build command that failed:
  nix build .#nixosConfigurations.${system}.config.system.build.toplevel

Build error output (last 200 lines):
\`\`\`
${error_text}
\`\`\`

Fix the Nix configuration files to resolve this build error.

Rules:
- Do NOT modify flake.lock or flake.nix
- Only edit .nix configuration files
- Read the AGENTS.md files in the relevant directories for channel-awareness rules
- If the fix is in a shared module (systems/common/ or home/common/), ensure it works across both nixpkgs channels
- Prefer host-specific overrides (systems/${system}/extra.nix) over modifying shared code when possible
- After making changes, verify with: nix build .#nixosConfigurations.${system}.config.system.build.toplevel --no-link"

  # Split the command and its extra arguments on whitespace WITHOUT
  # pathname expansion, so an argument such as "*" reaches the agent
  # literally instead of being expanded against the worktree.
  # read returns non-zero at end of input, hence the "|| true".
  local -a agent_cmd=() extra_args=()
  IFS=$' \t\n' read -r -d '' -a agent_cmd <<< "$AUTO_FIX_COMMAND" || true
  IFS=$' \t\n' read -r -d '' -a extra_args <<< "$AUTO_FIX_EXTRA_ARGS" || true
  agent_cmd+=(-p ${extra_args[@]+"${extra_args[@]}"} "$prompt")

  # Run the agent
  if "${agent_cmd[@]}" 2>&1 | tee -a "$LOG_FILE"; then
    log "Auto-fix agent completed for $system"
    return 0
  else
    log "Auto-fix agent failed/crashed for $system"
    return 1
  fi
}

# Return 0 when $1 equals one of the remaining arguments, 1 otherwise.
_auto_fix_list_contains() {
  local needle="$1" item
  shift
  for item in "$@"; do
    if [ "$item" = "$needle" ]; then
      return 0
    fi
  done
  return 1
}

# Remove host $1 from the global FIXED_HOSTS array.
_auto_fix_forget_fixed() {
  local host="$1" item
  local -a kept=()
  for item in "${FIXED_HOSTS[@]:-}"; do
    if [ -n "$item" ] && [ "$item" != "$host" ]; then
      kept+=("$item")
    fi
  done
  FIXED_HOSTS=(${kept[@]+"${kept[@]}"})
}

# Run the auto-fix loop for all failed systems.
# Attempts to fix each failing host, with retries and regression checking.
# Arguments: $1 - space-separated list of hosts whose build failed
# Globals:   BUILD_SYSTEMS, AUTO_FIX_MAX_ATTEMPTS, BUILD_ERROR (read);
#            FIXES_APPLIED, FIXED_HOSTS, UNFIXED_HOSTS (updated)
auto_fix_loop() {
  local remaining_failures="$1"
  local all_systems_list="$BUILD_SYSTEMS"
  local round=0
  local system still_failing

  while [ -n "$remaining_failures" ] && [ "$round" -lt "$AUTO_FIX_MAX_ATTEMPTS" ]; do
    round=$((round + 1))
    log "=== Auto-fix round $round/$AUTO_FIX_MAX_ATTEMPTS ==="

    still_failing=""

    for system in $remaining_failures; do
      # Try to build first (a previous fix might have resolved this too)
      if build_host "$system"; then
        log "$system now builds (fixed by previous change)"
        # Count this as an applied fix: the host may only build thanks to
        # edits left behind by an agent run that itself exited with an
        # error. Counting it makes the caller commit those edits and makes
        # the regression check below run.
        FIXES_APPLIED=$((FIXES_APPLIED + 1))
        FIXED_HOSTS+=("$system")
        continue
      fi

      # Run the agent
      if try_auto_fix "$system" "$BUILD_ERROR"; then
        # Check if the fix worked
        if build_host "$system"; then
          log "$system fixed by auto-fix agent"
          FIXES_APPLIED=$((FIXES_APPLIED + 1))
          FIXED_HOSTS+=("$system")
        else
          log "$system still failing after auto-fix attempt"
          still_failing="$still_failing $system"
        fi
      else
        log "✗ Auto-fix agent failed to run for $system"
        still_failing="$still_failing $system"
      fi
    done

    remaining_failures="${still_failing# }"

    if [ -z "$remaining_failures" ]; then
      log "All failures resolved after $round round(s)"
      break
    fi
  done

  # Record unfixed hosts
  for system in $remaining_failures; do
    UNFIXED_HOSTS+=("$system")
  done

  # Without any applied fix nothing changed, so nothing can have regressed.
  if [ "$FIXES_APPLIED" -eq 0 ]; then
    return 0
  fi

  # Regression check: rebuild all previously-passing systems
  log "=== Regression check: rebuilding all systems ==="
  local regression_failures=""

  for system in $all_systems_list; do
    if build_host "$system"; then
      continue
    fi

    if _auto_fix_list_contains "$system" "${FIXED_HOSTS[@]:-}"; then
      log "⚠ Regression: $system was fixed but now fails again"
      regression_failures="$regression_failures $system"
    elif ! _auto_fix_list_contains "$system" "${UNFIXED_HOSTS[@]:-}"; then
      # Not a known failure, so it was passing before the auto-fix changes.
      log "⚠ Regression: $system was passing but now fails after auto-fix changes"
      regression_failures="$regression_failures $system"
    fi
  done

  regression_failures="${regression_failures# }"

  if [ -z "$regression_failures" ]; then
    return 0
  fi

  log "Regressions detected: $regression_failures"
  log "Attempting to fix regressions..."

  # One more attempt with regression context
  local regression_prompt_extra
  for system in $regression_failures; do
    if build_host "$system"; then
      continue
    fi

    regression_prompt_extra="IMPORTANT: This is a REGRESSION. Host '$system' was building successfully before auto-fix changes were made to fix other hosts. Your previous fixes broke this host. Fix it WITHOUT breaking the other hosts."

    if try_auto_fix "$system" "$BUILD_ERROR
$regression_prompt_extra"; then
      if build_host "$system"; then
        log "✓ Regression fixed for $system"
        FIXES_APPLIED=$((FIXES_APPLIED + 1))
        continue
      fi
      log "✗ Could not fix regression for $system"
    fi

    # The host is broken again: report it as unfixed only, never as both
    # fixed and unfixed.
    UNFIXED_HOSTS+=("$system")
    _auto_fix_forget_fixed "$system"
  done
}

# =============================================================================
# Main script
# =============================================================================

log "Starting flake update process"
cd "$REPO_PATH"

# Fetch latest changes
log "Fetching latest changes from $GIT_REMOTE"
git fetch "$GIT_REMOTE"

# Create update branch name; fall back to a timestamped name when today's
# branch already exists locally.
BRANCH_NAME="$BRANCH_PREFIX$(date +%Y%m%d)"
if git show-ref --verify --quiet "refs/heads/$BRANCH_NAME"; then
  log "Branch $BRANCH_NAME already exists, using unique name"
  BRANCH_NAME="$BRANCH_PREFIX$(date +%Y%m%d-%H%M%S)"
fi

# Create worktree from the configured main branch (skip LFS to avoid hook
# failures). Uses $MAIN_BRANCH so the worktree starts from the same branch
# the auto-merge step later rebases onto and pushes to.
log "Creating worktree at $WORKTREE_DIR from $GIT_REMOTE/$MAIN_BRANCH"
GIT_LFS_SKIP_SMUDGE=1 git worktree add "$WORKTREE_DIR" "$GIT_REMOTE/$MAIN_BRANCH"

# Switch to worktree
cd "$WORKTREE_DIR"
log "Working in isolated worktree: $WORKTREE_DIR"

# Create update branch in the worktree
log "Creating update branch: $BRANCH_NAME"
git checkout -b "$BRANCH_NAME"

# Save old flake.lock for before/after comparison
OLD_FLAKE_LOCK=$(cat flake.lock)

# Update flake.lock
log "Updating flake.lock"
if [ -n "$FLAKE_INPUTS" ]; then
  log "Updating specific inputs: $FLAKE_INPUTS"
  # FLAKE_INPUTS is a space-separated list; splitting is intended.
  # shellcheck disable=SC2086
  for input in $FLAKE_INPUTS; do
    log "Updating input: $input"
    nix flake lock --update-input "$input" 2>&1 | tee -a "$LOG_FILE"
  done
else
  log "Updating all inputs"
  nix flake update 2>&1 | tee -a "$LOG_FILE"
fi

# Summarise which flake inputs changed between $OLD_FLAKE_LOCK and the
# current flake.lock: one "- names: old → new" line per group of inputs that
# moved between the same pair of revisions (values cut to 12 characters).
# Prints a generic message when jq fails.
describe_lock_changes() {
  jq -n --argjson old "$OLD_FLAKE_LOCK" --argjson new "$(cat flake.lock)" -r '
    def rev_map:
      .nodes | to_entries
      | map(select(.key != "root" and .value.locked != null))
      | map({(.key): (.value.locked.rev // .value.locked.narHash // "unknown")})
      | add // {};
    ($old | rev_map) as $o |
    ($new | rev_map) as $n |
    [$n | to_entries[] | select($o[.key] != null and $o[.key] != .value)] |
    group_by({old: $o[.key], new: .value}) |
    map({
      names: (map(.key) | join(", ")),
      old: $o[.[0].key][0:12],
      new: .[0].value[0:12]
    }) |
    map("- \(.names): \(.old) → \(.new)") |
    join("\n")
  ' 2>/dev/null || echo "Updated flake inputs"
}

# Succeeds when DRY_RUN is set to anything other than "false", "" or "0".
is_dry_run() {
  [ "$DRY_RUN" != "false" ] && [ "$DRY_RUN" != "" ] && [ "$DRY_RUN" != "0" ]
}

# Exit with status 1 after a failure that was already reported with its own
# TODO entry and notification. The EXIT trap is disarmed and cleanup is
# called directly: cleanup reads $? on entry, which is 0 right after the
# successful `trap`, so it removes the worktree and branch but does not
# file a second, generic failure report.
exit_after_reported_failure() {
  trap - EXIT
  cleanup || true
  exit 1
}

# Check if there are changes
if ! git diff --quiet flake.lock; then
  log "Changes detected in flake.lock"

  # Human-readable description of what was updated, used in commit
  # messages, TODO entries and notifications below.
  input_desc="all inputs"
  if [ -n "$FLAKE_INPUTS" ]; then
    input_desc="$FLAKE_INPUTS"
  fi

  # Show what changed. This listing is informational only: grep exits
  # non-zero when no added rev/narHash line exists (e.g. an input was only
  # removed), which must not abort the run under `set -e -o pipefail`.
  log "Flake input changes:"
  git diff flake.lock | grep -E '^\+.*"(narHash|rev)"' | head -n 20 | tee -a "$LOG_FILE" || true

  # Build all systems
  build_all_systems

  if [ -n "$FAILED_SYSTEMS" ]; then
    log "Build failures detected: $FAILED_SYSTEMS"

    if [ "$AUTO_FIX" = "true" ] || [ "$AUTO_FIX" = "1" ]; then
      # Commit flake.lock first so the agent works on a clean tree
      git add flake.lock

      CHANGES=$(describe_lock_changes)

      git -c user.signingkey=/home/vincent/.ssh/id_ed25519 commit -m "chore(flake): update $input_desc

$CHANGES"

      # Run auto-fix loop
      auto_fix_loop "$FAILED_SYSTEMS"

      # Commit any fixes the agent made
      if [ "$FIXES_APPLIED" -gt 0 ]; then
        git add -A
        if ! git diff --cached --quiet; then
          local_fixed_list="${FIXED_HOSTS[*]:-}"
          git -c user.signingkey=/home/vincent/.ssh/id_ed25519 commit -m "fix(nix): auto-fix build errors

Fixed hosts: ${local_fixed_list}
Agent: ${AUTO_FIX_COMMAND}
Attempts used: ${FIXES_APPLIED}"
        fi
      fi

      # Determine overall result
      if [ ${#UNFIXED_HOSTS[@]} -gt 0 ]; then
        log "Auto-fix partially succeeded. Unfixed: ${UNFIXED_HOSTS[*]}"

        add_todo_to_inbox "Flake update: ${#UNFIXED_HOSTS[@]} hosts still failing" \
          "Auto-fix resolved ${#FIXED_HOSTS[@]} host(s) but could not fix: ${UNFIXED_HOSTS[*]}
Build systems: $BUILD_SYSTEMS
Agent: $AUTO_FIX_COMMAND"

        # Still push the branch with partial fixes
        if is_dry_run; then
          log "DRY RUN: Would push partial-fix branch"
        else
          git push "$GIT_REMOTE" "$BRANCH_NAME"
        fi

        notify "high" "⚠️ Flake Updated (${#UNFIXED_HOSTS[@]} hosts still failing)" \
          "Auto-fixed: ${FIXED_HOSTS[*]:-none}. Still failing: ${UNFIXED_HOSTS[*]}. Branch: $BRANCH_NAME" \
          "warning,flake,robot"

        exit_after_reported_failure
      else
        log "All failures resolved by auto-fix"
        # Fall through to normal push/merge logic below
      fi
    else
      # No auto-fix — original behavior
      log "Build failed, not committing changes"

      add_todo_to_inbox "Flake update build failure" \
        "Build failed after updating $input_desc.
Build systems tested: $BUILD_SYSTEMS
Auto-merge: $AUTO_MERGE"

      notify "high" "❌ Flake Update Build Failed" \
        "Builds failed for updated $input_desc. TODO added to inbox. Check logs: $LOG_FILE" \
        "x,flake,warning"

      exit_after_reported_failure
    fi
  fi

  # If we get here, all builds passed (either initially or after auto-fix)

  # Commit flake.lock if not already committed (no auto-fix path)
  if ! git diff --quiet flake.lock || ! git diff --cached --quiet flake.lock; then
    git add flake.lock

    CHANGES=$(describe_lock_changes)

    COMMIT_MSG="chore(flake): update $input_desc

$CHANGES

Built systems: $BUILD_SYSTEMS"

    git -c user.signingkey=/home/vincent/.ssh/id_ed25519 commit -m "$COMMIT_MSG"
  fi

  # Determine notification details
  fix_note=""
  if [ "$FIXES_APPLIED" -gt 0 ]; then
    fix_note=" ($FIXES_APPLIED auto-fix(es) applied: ${FIXED_HOSTS[*]:-})"
  fi

  if is_dry_run; then
    log "DRY RUN: Would push to $GIT_REMOTE/$BRANCH_NAME"
    notify "low" "🧪 Flake Update (Dry Run)" \
      "Branch $BRANCH_NAME created locally. All builds passed: $BUILD_SYSTEMS${fix_note}" \
      "test_tube,flake"
  elif [ "$AUTO_MERGE" = "true" ] || [ "$AUTO_MERGE" = "1" ]; then
    # Auto-merge: rebase onto main and push directly
    log "Auto-merge enabled: rebasing onto $GIT_REMOTE/$MAIN_BRANCH"

    git fetch "$GIT_REMOTE" "$MAIN_BRANCH"

    if git rebase "$GIT_REMOTE/$MAIN_BRANCH"; then
      log "Rebase successful, pushing to $GIT_REMOTE/$MAIN_BRANCH"
      git push "$GIT_REMOTE" "HEAD:$MAIN_BRANCH"

      notify "default" "✅ Flake Auto-Updated & Merged" \
        "Updates merged to $MAIN_BRANCH. All builds passed: $BUILD_SYSTEMS${fix_note}" \
        "white_check_mark,flake,merged"

      log "SUCCESS: Flake updated and merged to $MAIN_BRANCH"
    else
      log "ERROR: Rebase failed, main branch may have moved"
      git rebase --abort || true

      # NOTE(review): cleanup removes the worktree and deletes the local
      # branch, so the branch named in this TODO no longer exists after
      # the script exits — confirm whether it should be pushed first.
      add_todo_to_inbox "Flake update rebase conflict" \
        "Auto-merge failed due to rebase conflict.
Inputs: $input_desc
Branch: $BRANCH_NAME (in worktree, needs manual rebase)"

      notify "high" "⚠️ Flake Update Rebase Failed" \
        "Could not rebase onto $MAIN_BRANCH. TODO added to inbox." \
        "warning,flake,conflict"
      exit_after_reported_failure
    fi
  else
    # Branch mode: push to feature branch
    log "Pushing to $GIT_REMOTE/$BRANCH_NAME"
    git push "$GIT_REMOTE" "$BRANCH_NAME"

    notify "default" "✅ Flake Updated Successfully" \
      "Branch $BRANCH_NAME pushed. All builds passed: $BUILD_SYSTEMS${fix_note}" \
      "white_check_mark,flake"

    log "SUCCESS: Flake updated and pushed to $BRANCH_NAME"
  fi

else
  log "No changes in flake.lock, nothing to do"
  notify "low" "ℹ️ No Flake Updates" \
    "flake.lock is already up to date" \
    "information_source,flake"
fi

log "Flake update process complete"