Commit fe96f9dc6f02

Vincent Demeester <vincent@sbr.pm>
2026-04-13 15:20:09
fix(ai-storage): add stale lock detection
Added PID-based and age-based stale lock detection to prevent crashed or killed summarizer processes from permanently blocking session recovery. Locks from dead PIDs or older than 5 minutes are automatically cleaned up on session start and during individual summarizer runs. Orphaned log files are also pruned.
1 parent c0709ce
Changed files (2)
dots
pi
agent
extensions
dots/pi/agent/extensions/ai-storage/index.ts
@@ -28,7 +28,7 @@ import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
 import { getMarkdownTheme } from "@mariozechner/pi-coding-agent";
 import { Box, Markdown, matchesKey, visibleWidth, truncateToWidth } from "@mariozechner/pi-tui";
 import { Type } from "@sinclair/typebox";
-import { writeFile, mkdir, appendFile, readFile, readdir, unlink } from "node:fs/promises";
+import { writeFile, mkdir, appendFile, readFile, readdir, unlink, stat } from "node:fs/promises";
 import { existsSync, openSync, readdirSync } from "node:fs";
 import { join, dirname } from "node:path";
 import { homedir, hostname } from "node:os";
@@ -766,6 +766,59 @@ export default function (pi: ExtensionAPI) {
 		}
 	});
 
+	// Check if a PID is still alive.
+	//
+	// Returns true when a process with the given PID exists, false otherwise.
+	// Values that are not positive integers are never considered alive.
+	function isPidAlive(pid: number): boolean {
+		// 0 and negative values address process groups rather than a single
+		// process, so they can never identify a lock owner.
+		if (!Number.isInteger(pid) || pid <= 0) return false;
+		try {
+			process.kill(pid, 0); // Signal 0 = existence check, nothing is delivered
+			return true;
+		} catch (err: unknown) {
+			// EPERM means the process exists but we are not allowed to signal it,
+			// so it is still alive. Anything else (e.g. ESRCH) means it is gone.
+			return typeof err === "object" && err !== null && (err as { code?: unknown }).code === "EPERM";
+		}
+	}
+
+	// Clean up stale lock and log files in the pending directory.
+	//
+	// A lock is stale when the PID it records is dead, or when the lock file is
+	// older than LOCK_MAX_AGE_MS (the owner may be hung). Removing a stale lock
+	// also removes its sibling .log file. Afterwards, .log files with neither a
+	// matching pending file nor a matching .lock are pruned.
+	// Best-effort: every failure is swallowed so session start is never blocked.
+	const LOCK_MAX_AGE_MS = 5 * 60 * 1000; // 5 minutes
+
+	async function cleanupStaleLocks() {
+		try {
+			const files = await readdir(PENDING_DIR);
+			const lockFiles = files.filter((f) => f.endsWith(".lock"));
+
+			for (const lockFile of lockFiles) {
+				const lockPath = join(PENDING_DIR, lockFile);
+				try {
+					const content = await readFile(lockPath, "utf-8");
+					const pid = parseInt(content.trim(), 10);
+
+					// Only a parsed PID that is no longer running counts as dead.
+					// Unparseable content falls through to the age check, matching
+					// isStaleLock() in summarizer.ts.
+					// NOTE(review): assumes a lock can be observed before its PID has
+					// been written — confirm against the lock-creation code.
+					const pidDead = !isNaN(pid) && !isPidAlive(pid);
+
+					if (!pidDead) {
+						// Owner alive or unknown — keep the lock unless it is too old
+						const lockStat = await stat(lockPath);
+						if (Date.now() - lockStat.mtimeMs <= LOCK_MAX_AGE_MS) continue;
+					}
+
+					// Stale lock — remove it and its log file
+					await unlink(lockPath).catch(() => {});
+					const logPath = lockPath.replace(/\.lock$/, ".log");
+					await unlink(logPath).catch(() => {});
+				} catch {
+					// Lock could not be read or stat'ed — treat it as stale
+					await unlink(lockPath).catch(() => {});
+				}
+			}
+
+			// Clean up orphaned .log files (no matching .json or .lock).
+			// Re-read the directory so locks removed above are taken into account.
+			const remaining = await readdir(PENDING_DIR);
+			const present = new Set(remaining);
+			for (const f of remaining) {
+				if (!f.endsWith(".log")) continue;
+				const base = f.replace(/\.log$/, "");
+				if (!present.has(base) && !present.has(`${base}.lock`)) {
+					await unlink(join(PENDING_DIR, f)).catch(() => {});
+				}
+			}
+		} catch {
+			// Silent failure
+		}
+	}
+
 	// Recover pending transcripts by spawning background summarizer
 	async function recoverPendingTranscripts(ctx: any) {
 		try {
@@ -773,6 +826,9 @@ export default function (pi: ExtensionAPI) {
 				return;
 			}
 
+			// Clean up stale locks from crashed/killed summarizers
+			await cleanupStaleLocks();
+
 			const files = await readdir(PENDING_DIR);
 			const pendingFiles = files.filter((f) => f.endsWith(".json"));
 
dots/pi/agent/extensions/ai-storage/summarizer.ts
@@ -13,7 +13,7 @@
  * Uses pi's own configuration for model/provider selection via `pi -p` (non-interactive mode).
  */
 
-import { readFile, writeFile, mkdir, unlink } from "node:fs/promises";
+import { readFile, writeFile, mkdir, unlink, stat } from "node:fs/promises";
 import { existsSync } from "node:fs";
 import { join } from "node:path";
 import { homedir } from "node:os";
@@ -227,15 +227,59 @@ function stripControlCodes(text: string): string {
 		.trim();
 }
 
+// Check if a PID is still alive.
+//
+// Returns true when a process with the given PID exists, false otherwise.
+// Values that are not positive integers are never considered alive.
+function isPidAlive(pid: number): boolean {
+	// 0 and negative values address process groups rather than a single
+	// process, so they can never identify a lock owner.
+	if (!Number.isInteger(pid) || pid <= 0) return false;
+	try {
+		process.kill(pid, 0); // Signal 0 = just check existence
+		return true;
+	} catch (err: unknown) {
+		// EPERM means the process exists but we are not allowed to signal it,
+		// so it is still alive. Anything else (e.g. ESRCH) means it is gone.
+		return typeof err === "object" && err !== null && (err as { code?: unknown }).code === "EPERM";
+	}
+}
+
+// Decide whether an existing lock file may be discarded: either the PID it
+// records is no longer running, or the file is older than LOCK_MAX_AGE_MS.
+const LOCK_MAX_AGE_MS = 5 * 60 * 1000; // 5 minutes
+
+async function isStaleLock(lockFile: string): Promise<boolean> {
+	// A lock that cannot be read or stat'ed is reported as stale.
+	let stale = true;
+	try {
+		const raw = await readFile(lockFile, "utf-8");
+		const ownerPid = parseInt(raw.trim(), 10);
+		const ownerGone = !isNaN(ownerPid) && !isPidAlive(ownerPid);
+
+		if (ownerGone) {
+			// Recorded owner has exited
+			console.log(`Stale lock detected: PID ${ownerPid} is dead`);
+		} else {
+			// Owner alive (or PID unparseable): fall back to the lock's age,
+			// since the owning process may be hung
+			const lockStat = await stat(lockFile);
+			const ageMs = Date.now() - lockStat.mtimeMs;
+			stale = ageMs > LOCK_MAX_AGE_MS;
+			if (stale) {
+				console.log(`Stale lock detected: lock is ${Math.round(ageMs / 1000)}s old (max ${LOCK_MAX_AGE_MS / 1000}s)`);
+			}
+		}
+	} catch {
+		stale = true;
+	}
+	return stale;
+}
+
 // Main
 async function main() {
 	const lockFile = `${transcriptPath}.lock`;
 	
 	try {
-		// Check if already being processed (simple lock)
+		// Check if already being processed
 		if (existsSync(lockFile)) {
-			console.log(`Already being processed (lock exists): ${transcriptPath}`);
-			process.exit(0);
+			if (await isStaleLock(lockFile)) {
+				console.log(`Removing stale lock: ${lockFile}`);
+				await unlink(lockFile).catch(() => {});
+			} else {
+				console.log(`Already being processed (lock exists, PID alive): ${transcriptPath}`);
+				process.exit(0);
+			}
 		}
 		
 		// Check if pending file still exists