From 116e61d8cbf214d15c22f867ff6b69c585a86098 Mon Sep 17 00:00:00 2001 From: Jamkris Date: Tue, 19 May 2026 11:01:10 +0900 Subject: [PATCH] fix(lib): retry rename on Windows EPERM/EACCES/EBUSY in writeBridgeAtomic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #1983 round 1 introduced unique-suffix tmp paths so two concurrent writers no longer share a single `.tmp` file. That fix is correct under POSIX semantics — `rename(2)` is atomic between source and destination, so each writer renames onto the same target without conflict. Windows `MoveFileExW` is not the same. It fails with EPERM / EACCES / EBUSY when the target is currently being renamed by *another* process — a short race window that fires reliably under this hook's PostToolUse + statusline concurrency. Round 1's CI run made this visible: Test (windows-latest, Node 18.x, npm) — FAILURE Error: EPERM: operation not permitted, rename 'C:\…\ecc-metrics-test-bridge-race-….json.9504.4aef575a.tmp' -> 'C:\…\ecc-metrics-test-bridge-race-….json' at writeBridgeAtomic (scripts/lib/session-bridge.js:79:8) All nine Windows matrix cells (Node 18 / 20 / 22 × npm / pnpm / yarn) hit the same path. POSIX matrices (Linux + macOS) passed unchanged. Fix: extract a `renameWithRetry(tmp, target)` helper that retries `fs.renameSync` up to 5 times on EPERM / EACCES / EBUSY with exponential backoff (20 ms → 320 ms total). Other error codes (ENOENT, ENOSPC, EROFS, …) re-throw on the first attempt — they are not transient. POSIX runs hit the first try and exit immediately. The backoff uses `Atomics.wait` on a throwaway `SharedArrayBuffer` so the retry path does not busy-spin the CPU; verified on Node ≥ 17 that this works on the main thread. There is a `try/catch` fallback to a brief busy-wait for older runtimes where `Atomics.wait` is restricted to workers. `writeBridgeAtomic` calls the helper instead of `fs.renameSync` and keeps its existing best-effort tmp cleanup on terminal failure. `renameWithRetry` is added to `module.exports` so the companion `writeWarnState` in `scripts/hooks/ecc-context-monitor.js` can adopt the same retry policy without duplicating the helper. That adoption lands in the next commit. Local: `node tests/lib/session-bridge.test.js` 14/14, `yarn test` green, `yarn lint` clean. The round-1 test (two concurrent child writers, 200 iterations each) now passes on macOS without retrying at all (POSIX path) and is expected to pass on Windows via the new retry loop. --- scripts/lib/session-bridge.js | 48 ++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/scripts/lib/session-bridge.js b/scripts/lib/session-bridge.js index b50216fc..19033d63 100644 --- a/scripts/lib/session-bridge.js +++ b/scripts/lib/session-bridge.js @@ -76,13 +76,58 @@ function writeBridgeAtomic(sessionId, data) { const tmp = `${target}.${process.pid}.${crypto.randomBytes(4).toString('hex')}.tmp`; fs.writeFileSync(tmp, JSON.stringify(data), 'utf8'); try { - fs.renameSync(tmp, target); + renameWithRetry(tmp, target); } catch (err) { try { fs.unlinkSync(tmp); } catch { /* ignore */ } throw err; } } +/** + * Replace a file via rename, retrying briefly on transient OS-level errors. + * + * POSIX `rename(2)` is atomic between source and destination, so concurrent + * writers each rename onto the same target without conflict. Windows + * `MoveFileExW` is different: it fails with EPERM/EACCES/EBUSY if the + * target is currently being renamed by *another* process — a short race + * window that fires reliably under our PostToolUse + statusline concurrency. + * + * To stay portable, retry up to 5 times with exponential backoff (20 ms, + * 40, 80, 160, 320) on the Windows-only transient codes. POSIX runs hit + * the first try and exit immediately. Other error codes (ENOENT, ENOSPC, + * EROFS, …) re-throw without retry — they are not transient. + * + * Sleep uses `Atomics.wait` on a throwaway SharedArrayBuffer so the + * retry path does not busy-spin the CPU. This works on the main thread + * in Node ≥ 17 (and on workers in earlier versions). + * + * @param {string} tmp + * @param {string} target + */ +function renameWithRetry(tmp, target) { + const RETRY_CODES = new Set(['EPERM', 'EACCES', 'EBUSY']); + const MAX_ATTEMPTS = 5; + for (let attempt = 0; ; attempt++) { + try { + fs.renameSync(tmp, target); + return; + } catch (err) { + if (attempt + 1 >= MAX_ATTEMPTS || !RETRY_CODES.has(err.code)) { + throw err; + } + const delayMs = 20 << attempt; + try { + Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, delayMs); + } catch { + // Atomics.wait throws on the main thread in some older runtimes; + // fall back to a brief busy-wait so the retry path still has a delay. + const until = Date.now() + delayMs; + while (Date.now() < until) { /* spin */ } + } + } + } +} + /** * Resolve session ID from environment variables. * @returns {string|null} Sanitized session ID or null @@ -97,6 +142,7 @@ module.exports = { getBridgePath, readBridge, writeBridgeAtomic, + renameWithRetry, resolveSessionId, MAX_SESSION_ID_LENGTH };