flake-update-20260505
1#!/usr/bin/env bash
2set -euo pipefail
3
4# Automated NixOS flake.lock updater with optional AI-powered auto-fix
5# This script updates flake.lock, builds verification systems, optionally
6# uses a coding agent to fix build failures, and pushes to remote.
7
8# Configuration from environment or defaults
# Configuration: every setting may be supplied through the environment.
# ":=" assigns the default only when the variable is unset or empty.
: "${REPO_PATH:=/home/vincent/src/home}"
: "${FLAKE_PATH:=$REPO_PATH}"
: "${GIT_REMOTE:=origin}"
: "${MAIN_BRANCH:=main}"
: "${BRANCH_PREFIX:=flake-update-}"
: "${NTFY_TOPIC:=nix-updates}"
: "${NTFY_SERVER:=https://ntfy.sh}"
: "${NTFY_TOKEN_FILE:=}"
: "${BUILD_SYSTEMS:=}"
: "${DRY_RUN:=false}"
: "${FLAKE_INPUTS:=}" # Space-separated list of inputs to update (empty = all)
: "${AUTO_MERGE:=false}" # If true, merge to main on success
: "${INBOX_ORG:=$HOME/desktop/org/inbox.org}" # Path to org-mode inbox

# Auto-fix configuration (coding agent invoked on build failures)
: "${AUTO_FIX:=false}"
: "${AUTO_FIX_COMMAND:=pir}"
: "${AUTO_FIX_EXTRA_ARGS:=--model claude-opus-4-6 --no-session --no-extensions --no-themes}"
: "${AUTO_FIX_MAX_ATTEMPTS:=3}"
: "${AUTO_FIX_ENV_FILE:=}"
29
# Per-run log file; its directory is created up front so log() can append to it.
LOG_FILE="/var/log/nix-flake-updater/$(date +%Y%m%d-%H%M%S).log"
mkdir -p "${LOG_FILE%/*}"

# Worktree directory for isolated work (use ~/tmp to avoid tmpfs/RAM)
WORKTREE_DIR="$HOME/tmp/nix-flake-updater-$(date +%Y%m%d-%H%M%S)"
mkdir -p "${WORKTREE_DIR%/*}"

# Auto-fix bookkeeping, updated by auto_fix_loop
declare -a FIXED_HOSTS=()
declare -a UNFIXED_HOSTS=()
FIXES_APPLIED=0
41
# log MESSAGE...
# Print the arguments as one timestamped line on stdout and append the same
# line to $LOG_FILE.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$*" | tee -a "$LOG_FILE"
}
45
# notify PRIORITY TITLE MESSAGE TAGS
# POST MESSAGE to $NTFY_SERVER/$NTFY_TOPIC with curl, passing the title,
# priority and tags as HTTP headers. When $NTFY_TOKEN_FILE names an existing
# file, its content (newlines removed) is sent as a bearer token.
# Best-effort: a failing curl never fails the caller.
notify() {
  local priority="$1"
  local title="$2"
  local message="$3"
  local tags="$4"
  local token
  local -a request=(-s)

  if [[ -n "$NTFY_TOKEN_FILE" && -f "$NTFY_TOKEN_FILE" ]]; then
    # "|| true": an unreadable token file must not abort the script under
    # errexit; the request is then sent with an empty token.
    token=$(tr -d '\n' < "$NTFY_TOKEN_FILE") || true
    request+=(-H "Authorization: Bearer $token")
  fi

  request+=(
    -H "Title: $title"
    -H "Priority: $priority"
    -H "Tags: $tags"
    -d "$message"
    "$NTFY_SERVER/$NTFY_TOPIC"
  )

  curl "${request[@]}" || true
}
69
# add_todo_to_inbox TITLE DETAILS
# Append an org-mode TODO entry to $INBOX_ORG: a headline scheduled for today,
# a CREATED property, DETAILS, the path of the current log file and the last
# 30 lines of that log. If $INBOX_ORG does not exist, only a warning is logged.
add_todo_to_inbox() {
  local title="$1"
  local details="$2"
  local log_tail=""

  if [[ ! -f "$INBOX_ORG" ]]; then
    log "WARNING: Inbox file not found: $INBOX_ORG"
    return
  fi

  # Logged before the tail is taken, so this line is part of the excerpt.
  log "Adding TODO to $INBOX_ORG"

  if [[ -f "$LOG_FILE" ]]; then
    log_tail=$(tail -n 30 "$LOG_FILE")
  fi

  cat >> "$INBOX_ORG" <<EOF
* TODO $title
 SCHEDULED: <$(date '+%Y-%m-%d %a')>
 :PROPERTIES:
 :CREATED: [$(date '+%Y-%m-%d %a %H:%M')]
 :END:

$details

Log file: $LOG_FILE

#+begin_src text
$log_tail
#+end_src
EOF
}
101
# cleanup
# EXIT trap handler. Removes the temporary worktree and the local update
# branch, then, when the script is exiting with a non-zero status, files a TODO
# in the inbox and sends a high-priority notification.
#
# Reads: WORKTREE_DIR, REPO_PATH, BRANCH_NAME (optional), FLAKE_INPUTS,
#        BUILD_SYSTEMS, AUTO_MERGE, LOG_FILE.
cleanup() {
  # Must stay the first statement: $? still holds the status the script is
  # exiting with.
  local exit_code=$?

  # The script runs with errexit, which is still active inside an EXIT trap.
  # Without this, a single failing step below (a failing log/tee, cd, ...)
  # would abort the handler before the failure report is sent. The shell is
  # exiting anyway, so relaxing it here cannot affect any other code.
  set +e

  # Clean up worktree if it exists
  if [ -d "$WORKTREE_DIR" ]; then
    log "Cleaning up worktree: $WORKTREE_DIR"
    # Step out of the directory that is about to be deleted.
    cd "$REPO_PATH" || log "WARNING: could not cd to $REPO_PATH"
    git -C "$REPO_PATH" worktree remove --force "$WORKTREE_DIR" 2>&1 | tee -a "$LOG_FILE"
    # Fallback in case git could not remove it (the -d test above guarantees
    # the path is non-empty).
    rm -rf -- "$WORKTREE_DIR"
  fi

  # Clean up the update branch (must happen after worktree removal).
  # "git -C" keeps the ref lookup independent of the current directory.
  if [ -n "${BRANCH_NAME:-}" ] &&
    git -C "$REPO_PATH" show-ref --verify --quiet "refs/heads/$BRANCH_NAME" 2>/dev/null; then
    log "Cleaning up branch: $BRANCH_NAME"
    git -C "$REPO_PATH" branch -D "$BRANCH_NAME" 2>&1 | tee -a "$LOG_FILE"
  fi

  if [ "$exit_code" -ne 0 ]; then
    log "ERROR: Update process failed with exit code $exit_code"

    local input_desc="all inputs"
    if [ -n "$FLAKE_INPUTS" ]; then
      input_desc="inputs: $FLAKE_INPUTS"
    fi

    add_todo_to_inbox "Fix flake update failure" \
      "Flake update failed for $input_desc.
Build systems: $BUILD_SYSTEMS
Auto-merge: $AUTO_MERGE"

    notify "high" "❌ Flake Update Failed" \
      "Build failed for $input_desc. TODO added to inbox. See logs: $LOG_FILE" \
      "warning,flake"
  fi
}

trap cleanup EXIT
140
# build_host SYSTEM
# Build the NixOS toplevel of one host with `nix build --no-link`.
# The build's stderr is streamed to this script's stderr, appended to
# $LOG_FILE and captured in a temporary file.
# Returns 0 on success, 1 on failure.
# On failure, sets BUILD_ERROR to the last 200 lines of the captured stderr.
build_host() {
  local system="$1"
  local error_file
  local build_status=0
  local capture_pid
  error_file=$(mktemp)

  log "Building system: $system"
  nix build ".#nixosConfigurations.$system.config.system.build.toplevel" \
    --no-link \
    --print-build-logs 2> >(tee "$error_file" | tee -a "$LOG_FILE" >&2) ||
    build_status=$?

  # bash does not wait for a process substitution on its own, so the tee
  # pipeline may still be writing when nix returns. Wait for it (bash >= 4.4
  # exposes its PID as $!) so the error text read below is complete.
  capture_pid=${!:-}
  if [[ -n "$capture_pid" ]]; then
    wait "$capture_pid" 2>/dev/null || true
  fi

  if [[ "$build_status" -eq 0 ]]; then
    log "✓ $system built successfully"
    rm -f "$error_file"
    return 0
  fi

  log "✗ $system build failed"
  BUILD_ERROR=$(tail -n 200 "$error_file")
  rm -f "$error_file"
  return 1
}
162
# build_all_systems
# Run build_host for every name in $BUILD_SYSTEMS (whitespace-separated).
# Sets FAILED_SYSTEMS and PASSED_SYSTEMS to space-separated host lists.
build_all_systems() {
  local host

  FAILED_SYSTEMS=""
  PASSED_SYSTEMS=""

  for host in $BUILD_SYSTEMS; do
    # ${VAR:+$VAR } adds the separator only once the list is non-empty, so no
    # leading space ever needs trimming.
    if build_host "$host"; then
      PASSED_SYSTEMS="${PASSED_SYSTEMS:+$PASSED_SYSTEMS }$host"
    else
      FAILED_SYSTEMS="${FAILED_SYSTEMS:+$FAILED_SYSTEMS }$host"
    fi
  done
}
181
# try_auto_fix SYSTEM ERROR_TEXT
# Ask the configured coding agent to repair a failed host build.
#   SYSTEM      host name (nixosConfigurations attribute)
#   ERROR_TEXT  build error output embedded in the prompt
# Returns 1 when AUTO_FIX is not "true"/"1" or the agent pipeline reports
# failure, 0 when the agent ran to completion. A 0 only means the agent
# finished; callers rebuild to learn whether the host is actually fixed.
try_auto_fix() {
  local system="$1"
  local error_text="$2"

  case "$AUTO_FIX" in
    true | 1) ;;
    *) return 1 ;;
  esac

  # Source env file if provided (for API keys, credentials)
  if [[ -n "$AUTO_FIX_ENV_FILE" && -f "$AUTO_FIX_ENV_FILE" ]]; then
    log "Sourcing auto-fix environment: $AUTO_FIX_ENV_FILE"
    # shellcheck disable=SC1090
    source "$AUTO_FIX_ENV_FILE"
  fi

  log "Running auto-fix agent for $system (command: $AUTO_FIX_COMMAND)"

  # Flake attribute of the failing build, quoted twice in the prompt.
  local build_target=".#nixosConfigurations.${system}.config.system.build.toplevel"

  local prompt
  prompt="The NixOS build for host '${system}' failed after a flake.lock update.

Working directory: $(pwd)

Build command that failed:
 nix build ${build_target}

Build error output (last 200 lines):
\`\`\`
${error_text}
\`\`\`

Fix the Nix configuration files to resolve this build error.

Rules:
- Do NOT modify flake.lock or flake.nix
- Only edit .nix configuration files
- Read the AGENTS.md files in the relevant directories for channel-awareness rules
- If the fix is in a shared module (systems/common/ or home/common/), ensure it works across both nixpkgs channels
- Prefer host-specific overrides (systems/${system}/extra.nix) over modifying shared code when possible
- After making changes, verify with: nix build ${build_target} --no-link"

  # AUTO_FIX_COMMAND and AUTO_FIX_EXTRA_ARGS are deliberately unquoted so they
  # split into a command word plus separate options.
  local agent_status=0
  # shellcheck disable=SC2086
  $AUTO_FIX_COMMAND -p $AUTO_FIX_EXTRA_ARGS "$prompt" 2>&1 | tee -a "$LOG_FILE" ||
    agent_status=$?

  if [[ "$agent_status" -ne 0 ]]; then
    log "Auto-fix agent failed/crashed for $system"
    return 1
  fi

  log "Auto-fix agent completed for $system"
  return 0
}
236
# _host_in_list NEEDLE [ITEM...]
# Succeeds when NEEDLE equals one of the ITEMs.
_host_in_list() {
  local needle="$1"
  local item
  shift
  for item in "$@"; do
    [[ "$item" == "$needle" ]] && return 0
  done
  return 1
}

# _forget_fixed_host HOST
# Drop HOST from FIXED_HOSTS.
_forget_fixed_host() {
  local target="$1"
  local host
  local -a kept=()
  # ${arr[@]+"${arr[@]}"} expands to nothing for an empty array without
  # tripping `set -u` on older bash versions.
  for host in ${FIXED_HOSTS[@]+"${FIXED_HOSTS[@]}"}; do
    [[ "$host" == "$target" ]] || kept+=("$host")
  done
  FIXED_HOSTS=(${kept[@]+"${kept[@]}"})
}

# auto_fix_loop FAILED_HOSTS
# Run the auto-fix loop for all failed systems.
#   FAILED_HOSTS  space-separated hosts that failed the initial build
#
# Phase 1: up to AUTO_FIX_MAX_ATTEMPTS rounds; each still-failing host is
#          rebuilt, handed to try_auto_fix if it still fails, and rebuilt again.
# Phase 2: if any fix was applied, every host in BUILD_SYSTEMS that is not
#          already known to be failing is rebuilt to detect regressions.
# Phase 3: one extra agent attempt per regressed host.
#
# Updates FIXES_APPLIED, FIXED_HOSTS and UNFIXED_HOSTS. A host is never left in
# both FIXED_HOSTS and UNFIXED_HOSTS.
auto_fix_loop() {
  local remaining_failures="$1"
  local round=0
  local system still_failing regression_failures regression_note

  while [ -n "$remaining_failures" ] && [ "$round" -lt "$AUTO_FIX_MAX_ATTEMPTS" ]; do
    round=$((round + 1))
    log "=== Auto-fix round $round/$AUTO_FIX_MAX_ATTEMPTS ==="

    still_failing=""

    for system in $remaining_failures; do
      # Try to build first (a previous fix might have resolved this too)
      if build_host "$system"; then
        log "✓ $system now builds (fixed by previous change)"
        FIXED_HOSTS+=("$system")
        continue
      fi

      # Run the agent, then rebuild to see whether the fix worked
      if ! try_auto_fix "$system" "$BUILD_ERROR"; then
        log "✗ Auto-fix agent failed to run for $system"
        still_failing="$still_failing $system"
      elif build_host "$system"; then
        log "✓ $system fixed by auto-fix agent"
        FIXES_APPLIED=$((FIXES_APPLIED + 1))
        FIXED_HOSTS+=("$system")
      else
        log "✗ $system still failing after auto-fix attempt"
        still_failing="$still_failing $system"
      fi
    done

    remaining_failures="${still_failing# }"

    if [ -z "$remaining_failures" ]; then
      log "All failures resolved after $round round(s)"
      break
    fi
  done

  # Record unfixed hosts
  for system in $remaining_failures; do
    UNFIXED_HOSTS+=("$system")
  done

  # Nothing was changed, so nothing can have regressed.
  if [ "$FIXES_APPLIED" -le 0 ]; then
    return 0
  fi

  # Regression check: rebuild the hosts that are expected to build now
  log "=== Regression check: rebuilding all systems ==="
  regression_failures=""

  for system in $BUILD_SYSTEMS; do
    # A host already recorded as unfixed cannot be a regression; rebuilding it
    # would only repeat a build that is known to fail.
    if _host_in_list "$system" ${UNFIXED_HOSTS[@]+"${UNFIXED_HOSTS[@]}"}; then
      log "Skipping $system in regression check (already known to be failing)"
      continue
    fi

    if build_host "$system"; then
      continue
    fi

    if _host_in_list "$system" ${FIXED_HOSTS[@]+"${FIXED_HOSTS[@]}"}; then
      log "⚠ Regression: $system was fixed but now fails again"
    else
      log "⚠ Regression: $system was passing but now fails after auto-fix changes"
    fi
    regression_failures="$regression_failures $system"
  done

  regression_failures="${regression_failures# }"

  if [ -z "$regression_failures" ]; then
    return 0
  fi

  log "Regressions detected: $regression_failures"
  log "Attempting to fix regressions..."

  # One more attempt with regression context
  for system in $regression_failures; do
    # Rebuild to refresh BUILD_ERROR for this host; it may also build again
    # thanks to a regression fix made for an earlier host.
    if build_host "$system"; then
      continue
    fi

    regression_note="IMPORTANT: This is a REGRESSION. Host '$system' was building successfully before auto-fix changes were made to fix other hosts. Your previous fixes broke this host. Fix it WITHOUT breaking the other hosts."

    if try_auto_fix "$system" "$BUILD_ERROR
$regression_note" && build_host "$system"; then
      log "✓ Regression fixed for $system"
      FIXES_APPLIED=$((FIXES_APPLIED + 1))
    else
      log "✗ Could not fix regression for $system"
      UNFIXED_HOSTS+=("$system")
      # The host is failing again: stop reporting it as fixed.
      _forget_fixed_host "$system"
    fi
  done

  return 0
}
353
354# =============================================================================
355# Main script
356# =============================================================================
357
log "Starting flake update process"
cd "$REPO_PATH"

# Fetch latest changes
log "Fetching latest changes from $GIT_REMOTE"
git fetch "$GIT_REMOTE"

# Pick the update branch name: one per day, with a time suffix when today's
# branch already exists locally.
BRANCH_NAME="$BRANCH_PREFIX$(date +%Y%m%d)"
if git show-ref --verify --quiet "refs/heads/$BRANCH_NAME"; then
  log "Branch $BRANCH_NAME already exists, using unique name"
  BRANCH_NAME="$BRANCH_PREFIX$(date +%Y%m%d-%H%M%S)"
fi

# Create the worktree from the configured main branch (skip LFS to avoid hook
# failures). This honours MAIN_BRANCH, the same branch the auto-merge step
# later rebases onto and pushes to, instead of a hard-coded "main".
log "Creating worktree at $WORKTREE_DIR from $GIT_REMOTE/$MAIN_BRANCH"
GIT_LFS_SKIP_SMUDGE=1 git worktree add "$WORKTREE_DIR" "$GIT_REMOTE/$MAIN_BRANCH"

# Switch to worktree
cd "$WORKTREE_DIR"
log "Working in isolated worktree: $WORKTREE_DIR"

# Create update branch in the worktree
log "Creating update branch: $BRANCH_NAME"
git checkout -b "$BRANCH_NAME"

# Save old flake.lock for before/after comparison
OLD_FLAKE_LOCK=$(cat flake.lock)

# Update flake.lock
log "Updating flake.lock"
if [ -n "$FLAKE_INPUTS" ]; then
  log "Updating specific inputs: $FLAKE_INPUTS"
  # FLAKE_INPUTS is a space-separated list, so word splitting is intended.
  for input in $FLAKE_INPUTS; do
    log "Updating input: $input"
    # NOTE(review): newer Nix releases document `nix flake update <input>` as
    # the replacement for `--update-input`; confirm against the installed Nix.
    nix flake lock --update-input "$input" 2>&1 | tee -a "$LOG_FILE"
  done
else
  log "Updating all inputs"
  nix flake update 2>&1 | tee -a "$LOG_FILE"
fi
399
# -----------------------------------------------------------------------------
# Helpers for the build / commit / publish flow below
# -----------------------------------------------------------------------------

# Key passed to git as user.signingkey for the commits created by this script.
# Overridable through the environment; the default is the previous fixed path.
GIT_SIGNING_KEY="${GIT_SIGNING_KEY:-/home/vincent/.ssh/id_ed25519}"

# is_dry_run: succeeds unless DRY_RUN is "false", "0" or empty.
is_dry_run() {
  case "$DRY_RUN" in
    false | "" | 0) return 1 ;;
    *) return 0 ;;
  esac
}

# summarize_lock_changes: print one "- names: old → new" line per group of
# inputs whose locked rev (or narHash) differs between OLD_FLAKE_LOCK and the
# current flake.lock. Prints "Updated flake inputs" when jq fails.
summarize_lock_changes() {
  jq -n --argjson old "$OLD_FLAKE_LOCK" --argjson new "$(cat flake.lock)" -r '
    def rev_map:
      .nodes | to_entries
      | map(select(.key != "root" and .value.locked != null))
      | map({(.key): (.value.locked.rev // .value.locked.narHash // "unknown")})
      | add // {};
    ($old | rev_map) as $o |
    ($new | rev_map) as $n |
    [$n | to_entries[] | select($o[.key] != null and $o[.key] != .value)] |
    group_by({old: $o[.key], new: .value}) |
    map({
      names: (map(.key) | join(", ")),
      old: $o[.[0].key][0:12],
      new: .[0].value[0:12]
    }) |
    map("- \(.names): \(.old) → \(.new)") |
    join("\n")
  ' 2>/dev/null || echo "Updated flake inputs"
}

# commit_signed MESSAGE: commit the staged changes with the configured key.
commit_signed() {
  git -c user.signingkey="$GIT_SIGNING_KEY" commit -m "$1"
}

# exit_after_report: leave with status 1 after a failure that has ALREADY been
# written to the inbox and notified. The EXIT trap would otherwise run
# cleanup() with a non-zero status and report the same failure a second time
# (second TODO, second notification). cleanup() reads $? on entry, so it is
# called here directly after a successful command and only tidies up.
exit_after_report() {
  trap - EXIT
  true # make $? zero for cleanup()
  cleanup || true
  exit 1
}

# -----------------------------------------------------------------------------
# Check if there are changes
# -----------------------------------------------------------------------------
if ! git diff --quiet flake.lock; then
  log "Changes detected in flake.lock"

  # Human-readable description of what was updated, used in commit messages,
  # TODO entries and notifications below.
  input_desc="${FLAKE_INPUTS:-all inputs}"

  # Show what changed. Purely informational: under `set -euo pipefail` a grep
  # without matches (or an early-closing head) would otherwise abort the run.
  log "Flake input changes:"
  git diff flake.lock | grep -E '^\+.*"(narHash|rev)"' | head -20 | tee -a "$LOG_FILE" || true

  # Build all systems
  build_all_systems

  if [ -n "$FAILED_SYSTEMS" ]; then
    log "Build failures detected: $FAILED_SYSTEMS"

    if [ "$AUTO_FIX" != "true" ] && [ "$AUTO_FIX" != "1" ]; then
      # No auto-fix: report and stop without committing anything.
      log "Build failed, not committing changes"

      add_todo_to_inbox "Flake update build failure" \
        "Build failed after updating $input_desc.
Build systems tested: $BUILD_SYSTEMS
Auto-merge: $AUTO_MERGE"

      notify "high" "❌ Flake Update Build Failed" \
        "Builds failed for updated $input_desc. TODO added to inbox. Check logs: $LOG_FILE" \
        "x,flake,warning"

      exit_after_report
    fi

    # Commit flake.lock first so the agent works on a clean tree
    git add flake.lock
    CHANGES=$(summarize_lock_changes)
    commit_signed "chore(flake): update $input_desc

$CHANGES"

    # Run auto-fix loop
    auto_fix_loop "$FAILED_SYSTEMS"

    # Commit any fixes the agent made
    if [ "$FIXES_APPLIED" -gt 0 ]; then
      git add -A
      if ! git diff --cached --quiet; then
        local_fixed_list="${FIXED_HOSTS[*]:-}"
        commit_signed "fix(nix): auto-fix build errors

Fixed hosts: ${local_fixed_list}
Agent: ${AUTO_FIX_COMMAND}
Attempts used: ${FIXES_APPLIED}"
      fi
    fi

    # Determine overall result
    if [ "${#UNFIXED_HOSTS[@]}" -gt 0 ]; then
      log "Auto-fix partially succeeded. Unfixed: ${UNFIXED_HOSTS[*]}"

      add_todo_to_inbox "Flake update: ${#UNFIXED_HOSTS[@]} hosts still failing" \
        "Auto-fix resolved ${#FIXED_HOSTS[@]} host(s) but could not fix: ${UNFIXED_HOSTS[*]}
Build systems: $BUILD_SYSTEMS
Agent: $AUTO_FIX_COMMAND"

      # Still push the branch with partial fixes
      if is_dry_run; then
        log "DRY RUN: Would push partial-fix branch"
      else
        git push "$GIT_REMOTE" "$BRANCH_NAME"
      fi

      notify "high" "⚠️ Flake Updated (${#UNFIXED_HOSTS[@]} hosts still failing)" \
        "Auto-fixed: ${FIXED_HOSTS[*]:-none}. Still failing: ${UNFIXED_HOSTS[*]}. Branch: $BRANCH_NAME" \
        "warning,flake,robot"

      exit_after_report
    fi

    log "All failures resolved by auto-fix"
    # Fall through to normal push/merge logic below
  fi

  # If we get here, all builds passed (either initially or after auto-fix)

  # Commit flake.lock if not already committed (no auto-fix path)
  if ! git diff --quiet flake.lock || ! git diff --cached --quiet flake.lock; then
    git add flake.lock

    CHANGES=$(summarize_lock_changes)

    COMMIT_MSG="chore(flake): update $input_desc

$CHANGES

Built systems: $BUILD_SYSTEMS"

    commit_signed "$COMMIT_MSG"
  fi

  # Determine notification details
  fix_note=""
  if [ "$FIXES_APPLIED" -gt 0 ]; then
    fix_note=" ($FIXES_APPLIED auto-fix(es) applied: ${FIXED_HOSTS[*]:-})"
  fi

  if is_dry_run; then
    log "DRY RUN: Would push to $GIT_REMOTE/$BRANCH_NAME"
    notify "low" "🧪 Flake Update (Dry Run)" \
      "Branch $BRANCH_NAME created locally. All builds passed: $BUILD_SYSTEMS${fix_note}" \
      "test_tube,flake"
  elif [ "$AUTO_MERGE" = "true" ] || [ "$AUTO_MERGE" = "1" ]; then
    # Auto-merge: rebase onto main and push directly
    log "Auto-merge enabled: rebasing onto $GIT_REMOTE/$MAIN_BRANCH"

    git fetch "$GIT_REMOTE" "$MAIN_BRANCH"

    if git rebase "$GIT_REMOTE/$MAIN_BRANCH"; then
      log "Rebase successful, pushing to $GIT_REMOTE/$MAIN_BRANCH"
      git push "$GIT_REMOTE" "HEAD:$MAIN_BRANCH"

      notify "default" "✅ Flake Auto-Updated & Merged" \
        "Updates merged to $MAIN_BRANCH. All builds passed: $BUILD_SYSTEMS${fix_note}" \
        "white_check_mark,flake,merged"

      log "SUCCESS: Flake updated and merged to $MAIN_BRANCH"
    else
      log "ERROR: Rebase failed, main branch may have moved"
      git rebase --abort || true

      # NOTE(review): cleanup() removes the worktree and deletes the local
      # branch on exit, and this path never pushes it, so the branch named in
      # the TODO will no longer exist afterwards. Confirm whether it should be
      # pushed or kept.
      add_todo_to_inbox "Flake update rebase conflict" \
        "Auto-merge failed due to rebase conflict.
Inputs: $input_desc
Branch: $BRANCH_NAME (in worktree, needs manual rebase)"

      notify "high" "⚠️ Flake Update Rebase Failed" \
        "Could not rebase onto $MAIN_BRANCH. TODO added to inbox." \
        "warning,flake,conflict"

      exit_after_report
    fi
  else
    # Branch mode: push to feature branch
    log "Pushing to $GIT_REMOTE/$BRANCH_NAME"
    git push "$GIT_REMOTE" "$BRANCH_NAME"

    notify "default" "✅ Flake Updated Successfully" \
      "Branch $BRANCH_NAME pushed. All builds passed: $BUILD_SYSTEMS${fix_note}" \
      "white_check_mark,flake"

    log "SUCCESS: Flake updated and pushed to $BRANCH_NAME"
  fi

else
  log "No changes in flake.lock, nothing to do"
  notify "low" "ℹ️ No Flake Updates" \
    "flake.lock is already up to date" \
    "information_source,flake"
fi

log "Flake update process complete"