From f9e8287346be04577d3dcce6b80c48bbfe233352 Mon Sep 17 00:00:00 2001 From: Affaan Mustafa Date: Mon, 16 Mar 2026 13:47:25 -0700 Subject: [PATCH] fix: observer memory explosion with throttling, re-entrancy guard, and tail sampling (#536) Three fixes for the positive feedback loop causing runaway memory usage: 1. SIGUSR1 throttling in observe.sh: Signal observer only every 20 observations (configurable via ECC_OBSERVER_SIGNAL_EVERY_N) instead of on every tool call. Uses a counter file to track invocations. 2. Re-entrancy guard in observer-loop.sh on_usr1(): ANALYZING flag prevents parallel Claude analysis processes from spawning when signals arrive while analysis is already running. 3. Cooldown + tail-based sampling in observer-loop.sh: - 60s cooldown between analyses (ECC_OBSERVER_ANALYSIS_COOLDOWN) - Only last 500 lines sent to LLM (ECC_OBSERVER_MAX_ANALYSIS_LINES) instead of the entire observations file Closes #521 --- .../agents/observer-loop.sh | 38 +- .../continuous-learning-v2/hooks/observe.sh | 57 ++- tests/hooks/observer-memory.test.js | 360 ++++++++++++++++++ 3 files changed, 435 insertions(+), 20 deletions(-) create mode 100644 tests/hooks/observer-memory.test.js diff --git a/skills/continuous-learning-v2/agents/observer-loop.sh b/skills/continuous-learning-v2/agents/observer-loop.sh index b5db7264..2be2049a 100755 --- a/skills/continuous-learning-v2/agents/observer-loop.sh +++ b/skills/continuous-learning-v2/agents/observer-loop.sh @@ -1,11 +1,19 @@ #!/usr/bin/env bash # Continuous Learning v2 - Observer background loop +# +# Fix for #521: Added re-entrancy guard, cooldown throttle, and +# tail-based sampling to prevent memory explosion from runaway +# parallel Claude analysis processes. set +e unset CLAUDECODE SLEEP_PID="" USR1_FIRED=0 +ANALYZING=0 +LAST_ANALYSIS_EPOCH=0 +# Minimum seconds between analyses (prevents rapid re-triggering) +ANALYSIS_COOLDOWN="${ECC_OBSERVER_ANALYSIS_COOLDOWN:-60}" cleanup() { [ -n "$SLEEP_PID" ] && kill "$SLEEP_PID" 2>/dev/null @@ -44,9 +52,17 @@ analyze_observations() { return fi + # Sample recent observations instead of loading the entire file (#521). + # This prevents multi-MB payloads from being passed to the LLM. + MAX_ANALYSIS_LINES="${ECC_OBSERVER_MAX_ANALYSIS_LINES:-500}" + analysis_file="$(mktemp "${TMPDIR:-/tmp}/ecc-observer-analysis.XXXXXX.jsonl")" + tail -n "$MAX_ANALYSIS_LINES" "$OBSERVATIONS_FILE" > "$analysis_file" + analysis_count=$(wc -l < "$analysis_file" 2>/dev/null || echo 0) + echo "[$(date)] Using last $analysis_count of $obs_count observations for analysis" >> "$LOG_FILE" + prompt_file="$(mktemp "${TMPDIR:-/tmp}/ecc-observer-prompt.XXXXXX")" cat > "$prompt_file" <.md. CRITICAL: Every instinct file MUST use this exact format: @@ -113,7 +129,7 @@ PROMPT wait "$claude_pid" exit_code=$? kill "$watchdog_pid" 2>/dev/null || true - rm -f "$prompt_file" + rm -f "$prompt_file" "$analysis_file" if [ "$exit_code" -ne 0 ]; then echo "[$(date)] Claude analysis failed (exit $exit_code)" >> "$LOG_FILE" @@ -130,7 +146,25 @@ on_usr1() { [ -n "$SLEEP_PID" ] && kill "$SLEEP_PID" 2>/dev/null SLEEP_PID="" USR1_FIRED=1 + + # Re-entrancy guard: skip if analysis is already running (#521) + if [ "$ANALYZING" -eq 1 ]; then + echo "[$(date)] Analysis already in progress, skipping signal" >> "$LOG_FILE" + return + fi + + # Cooldown: skip if last analysis was too recent (#521) + now_epoch=$(date +%s) + elapsed=$(( now_epoch - LAST_ANALYSIS_EPOCH )) + if [ "$elapsed" -lt "$ANALYSIS_COOLDOWN" ]; then + echo "[$(date)] Analysis cooldown active (${elapsed}s < ${ANALYSIS_COOLDOWN}s), skipping" >> "$LOG_FILE" + return + fi + + ANALYZING=1 analyze_observations + LAST_ANALYSIS_EPOCH=$(date +%s) + ANALYZING=0 } trap on_usr1 USR1 diff --git a/skills/continuous-learning-v2/hooks/observe.sh b/skills/continuous-learning-v2/hooks/observe.sh index 14a862be..ce6f563e 100755 --- a/skills/continuous-learning-v2/hooks/observe.sh +++ b/skills/continuous-learning-v2/hooks/observe.sh @@ -365,24 +365,45 @@ if [ "$OBSERVER_ENABLED" = "true" ]; then fi fi -# Signal observer if running (check both project-scoped and global observer, deduplicate) -signaled_pids=" " -for pid_file in "${PROJECT_DIR}/.observer.pid" "${CONFIG_DIR}/.observer.pid"; do - if [ -f "$pid_file" ]; then - observer_pid=$(cat "$pid_file" 2>/dev/null || true) - # Validate PID is a positive integer (>1) - case "$observer_pid" in - ''|*[!0-9]*|0|1) rm -f "$pid_file" 2>/dev/null || true; continue ;; - esac - # Deduplicate: skip if already signaled this pass - case "$signaled_pids" in - *" $observer_pid "*) continue ;; - esac - if kill -0 "$observer_pid" 2>/dev/null; then - kill -USR1 "$observer_pid" 2>/dev/null || true - signaled_pids="${signaled_pids}${observer_pid} " - fi +# Throttle SIGUSR1: only signal observer every N observations (#521) +# This prevents rapid signaling when tool calls fire every second, +# which caused runaway parallel Claude analysis processes. +SIGNAL_EVERY_N="${ECC_OBSERVER_SIGNAL_EVERY_N:-20}" +SIGNAL_COUNTER_FILE="${PROJECT_DIR}/.observer-signal-counter" + +should_signal=0 +if [ -f "$SIGNAL_COUNTER_FILE" ]; then + counter=$(cat "$SIGNAL_COUNTER_FILE" 2>/dev/null || echo 0) + counter=$((counter + 1)) + if [ "$counter" -ge "$SIGNAL_EVERY_N" ]; then + should_signal=1 + counter=0 fi -done + echo "$counter" > "$SIGNAL_COUNTER_FILE" +else + echo "1" > "$SIGNAL_COUNTER_FILE" +fi + +# Signal observer if running and throttle allows (check both project-scoped and global observer, deduplicate) +if [ "$should_signal" -eq 1 ]; then + signaled_pids=" " + for pid_file in "${PROJECT_DIR}/.observer.pid" "${CONFIG_DIR}/.observer.pid"; do + if [ -f "$pid_file" ]; then + observer_pid=$(cat "$pid_file" 2>/dev/null || true) + # Validate PID is a positive integer (>1) + case "$observer_pid" in + ''|*[!0-9]*|0|1) rm -f "$pid_file" 2>/dev/null || true; continue ;; + esac + # Deduplicate: skip if already signaled this pass + case "$signaled_pids" in + *" $observer_pid "*) continue ;; + esac + if kill -0 "$observer_pid" 2>/dev/null; then + kill -USR1 "$observer_pid" 2>/dev/null || true + signaled_pids="${signaled_pids}${observer_pid} " + fi + fi + done +fi exit 0 diff --git a/tests/hooks/observer-memory.test.js b/tests/hooks/observer-memory.test.js new file mode 100644 index 00000000..b380386c --- /dev/null +++ b/tests/hooks/observer-memory.test.js @@ -0,0 +1,360 @@ +/** + * Tests for observer memory explosion fix (#521) + * + * Validates three fixes: + * 1. SIGUSR1 throttling in observe.sh (signal counter) + * 2. Tail-based sampling in observer-loop.sh (not loading entire file) + * 3. Re-entrancy guard + cooldown in observer-loop.sh on_usr1() + * + * Run with: node tests/hooks/observer-memory.test.js + */ + +const assert = require('assert'); +const path = require('path'); +const fs = require('fs'); +const os = require('os'); +const { spawnSync } = require('child_process'); + +let passed = 0; +let failed = 0; + +function test(name, fn) { + try { + fn(); + console.log(` \u2713 ${name}`); + passed++; + } catch (err) { + console.log(` \u2717 ${name}`); + console.log(` Error: ${err.message}`); + failed++; + } +} + +function createTempDir() { + return fs.mkdtempSync(path.join(os.tmpdir(), 'ecc-observer-test-')); +} + +function cleanupDir(dir) { + try { + fs.rmSync(dir, { recursive: true, force: true }); + } catch { + // ignore cleanup errors + } +} + +const repoRoot = path.resolve(__dirname, '..', '..'); +const observeShPath = path.join(repoRoot, 'skills', 'continuous-learning-v2', 'hooks', 'observe.sh'); +const observerLoopPath = path.join(repoRoot, 'skills', 'continuous-learning-v2', 'agents', 'observer-loop.sh'); + +console.log('\n=== Observer Memory Fix Tests (#521) ===\n'); + +// ────────────────────────────────────────────────────── +// Test group 1: observe.sh SIGUSR1 throttling +// ────────────────────────────────────────────────────── + +console.log('--- observe.sh signal throttling ---'); + +test('observe.sh contains SIGNAL_EVERY_N throttle variable', () => { + const content = fs.readFileSync(observeShPath, 'utf8'); + assert.ok( + content.includes('SIGNAL_EVERY_N'), + 'observe.sh should define SIGNAL_EVERY_N for throttling' + ); +}); + +test('observe.sh uses a counter file instead of signaling every call', () => { + const content = fs.readFileSync(observeShPath, 'utf8'); + assert.ok( + content.includes('.observer-signal-counter'), + 'observe.sh should use a signal counter file' + ); +}); + +test('observe.sh only signals when counter reaches threshold', () => { + const content = fs.readFileSync(observeShPath, 'utf8'); + assert.ok( + content.includes('should_signal=0'), + 'observe.sh should default should_signal to 0' + ); + assert.ok( + content.includes('should_signal=1'), + 'observe.sh should set should_signal=1 when threshold reached' + ); + assert.ok( + content.includes('if [ "$should_signal" -eq 1 ]'), + 'observe.sh should gate kill -USR1 behind should_signal check' + ); +}); + +test('observe.sh default throttle is 20 observations per signal', () => { + const content = fs.readFileSync(observeShPath, 'utf8'); + assert.ok( + content.includes('ECC_OBSERVER_SIGNAL_EVERY_N:-20'), + 'Default signal frequency should be every 20 observations' + ); +}); + +// ────────────────────────────────────────────────────── +// Test group 2: observer-loop.sh re-entrancy guard +// ────────────────────────────────────────────────────── + +console.log('\n--- observer-loop.sh re-entrancy guard ---'); + +test('observer-loop.sh defines ANALYZING guard variable', () => { + const content = fs.readFileSync(observerLoopPath, 'utf8'); + assert.ok( + content.includes('ANALYZING=0'), + 'observer-loop.sh should initialize ANALYZING=0' + ); +}); + +test('on_usr1 checks ANALYZING before starting analysis', () => { + const content = fs.readFileSync(observerLoopPath, 'utf8'); + assert.ok( + content.includes('if [ "$ANALYZING" -eq 1 ]'), + 'on_usr1 should check ANALYZING flag' + ); + assert.ok( + content.includes('Analysis already in progress, skipping signal'), + 'on_usr1 should log when skipping due to re-entrancy' + ); +}); + +test('on_usr1 sets ANALYZING=1 before and ANALYZING=0 after analysis', () => { + const content = fs.readFileSync(observerLoopPath, 'utf8'); + // Check that ANALYZING=1 is set before analyze_observations + const analyzeCall = content.indexOf('ANALYZING=1'); + const analyzeObsCall = content.indexOf('analyze_observations', analyzeCall); + const analyzeReset = content.indexOf('ANALYZING=0', analyzeObsCall); + assert.ok(analyzeCall > 0, 'ANALYZING=1 should be set'); + assert.ok(analyzeObsCall > analyzeCall, 'analyze_observations should be called after ANALYZING=1'); + assert.ok(analyzeReset > analyzeObsCall, 'ANALYZING=0 should follow analyze_observations'); +}); + +// ────────────────────────────────────────────────────── +// Test group 3: observer-loop.sh cooldown throttle +// ────────────────────────────────────────────────────── + +console.log('\n--- observer-loop.sh cooldown throttle ---'); + +test('observer-loop.sh defines ANALYSIS_COOLDOWN', () => { + const content = fs.readFileSync(observerLoopPath, 'utf8'); + assert.ok( + content.includes('ANALYSIS_COOLDOWN'), + 'observer-loop.sh should define ANALYSIS_COOLDOWN' + ); +}); + +test('on_usr1 enforces cooldown between analyses', () => { + const content = fs.readFileSync(observerLoopPath, 'utf8'); + assert.ok( + content.includes('LAST_ANALYSIS_EPOCH'), + 'Should track last analysis time' + ); + assert.ok( + content.includes('Analysis cooldown active'), + 'Should log when cooldown prevents analysis' + ); +}); + +test('default cooldown is 60 seconds', () => { + const content = fs.readFileSync(observerLoopPath, 'utf8'); + assert.ok( + content.includes('ECC_OBSERVER_ANALYSIS_COOLDOWN:-60'), + 'Default cooldown should be 60 seconds' + ); +}); + +// ────────────────────────────────────────────────────── +// Test group 4: Tail-based sampling (no full file load) +// ────────────────────────────────────────────────────── + +console.log('\n--- observer-loop.sh tail-based sampling ---'); + +test('analyze_observations uses tail to sample recent observations', () => { + const content = fs.readFileSync(observerLoopPath, 'utf8'); + assert.ok( + content.includes('tail -n "$MAX_ANALYSIS_LINES"'), + 'Should use tail to limit observations sent to LLM' + ); +}); + +test('default max analysis lines is 500', () => { + const content = fs.readFileSync(observerLoopPath, 'utf8'); + assert.ok( + content.includes('ECC_OBSERVER_MAX_ANALYSIS_LINES:-500'), + 'Default should sample last 500 lines' + ); +}); + +test('analysis temp file is created and cleaned up', () => { + const content = fs.readFileSync(observerLoopPath, 'utf8'); + assert.ok( + content.includes('ecc-observer-analysis'), + 'Should create a temp analysis file' + ); + assert.ok( + content.includes('rm -f "$prompt_file" "$analysis_file"'), + 'Should clean up both prompt and analysis temp files' + ); +}); + +test('prompt references analysis_file not full OBSERVATIONS_FILE', () => { + const content = fs.readFileSync(observerLoopPath, 'utf8'); + // The prompt heredoc should reference analysis_file for the Read instruction. + // Find the section between the heredoc open and close markers. + const heredocStart = content.indexOf('cat > "$prompt_file" < 0, 'Should find prompt heredoc start'); + assert.ok(heredocEnd > heredocStart, 'Should find prompt heredoc end'); + const promptSection = content.substring(heredocStart, heredocEnd); + assert.ok( + promptSection.includes('${analysis_file}'), + 'Prompt should point Claude at the sampled analysis file, not the full observations file' + ); +}); + +// ────────────────────────────────────────────────────── +// Test group 5: Signal counter file simulation +// ────────────────────────────────────────────────────── + +console.log('\n--- Signal counter file behavior ---'); + +test('counter file increments and resets correctly', () => { + const testDir = createTempDir(); + const counterFile = path.join(testDir, '.observer-signal-counter'); + + // Simulate 20 calls - first 19 should not signal, 20th should + const signalEveryN = 20; + let signalCount = 0; + + for (let i = 0; i < 40; i++) { + let shouldSignal = false; + if (fs.existsSync(counterFile)) { + let counter = parseInt(fs.readFileSync(counterFile, 'utf8').trim(), 10) || 0; + counter++; + if (counter >= signalEveryN) { + shouldSignal = true; + counter = 0; + } + fs.writeFileSync(counterFile, String(counter)); + } else { + fs.writeFileSync(counterFile, '1'); + } + if (shouldSignal) signalCount++; + } + + // 40 calls with threshold 20 should signal exactly 2 times + // (at call 20 and call 40) + assert.strictEqual(signalCount, 2, `Expected 2 signals over 40 calls, got ${signalCount}`); + + cleanupDir(testDir); +}); + +test('counter file handles missing/corrupt file gracefully', () => { + const testDir = createTempDir(); + const counterFile = path.join(testDir, '.observer-signal-counter'); + + // Write corrupt content + fs.writeFileSync(counterFile, 'not-a-number'); + const counter = parseInt(fs.readFileSync(counterFile, 'utf8').trim(), 10) || 0; + assert.strictEqual(counter, 0, 'Corrupt counter should default to 0'); + + cleanupDir(testDir); +}); + +// ────────────────────────────────────────────────────── +// Test group 6: End-to-end observe.sh signal throttle (shell) +// ────────────────────────────────────────────────────── + +console.log('\n--- observe.sh end-to-end throttle (shell execution) ---'); + +test('observe.sh creates counter file and increments on each call', () => { + // This test runs observe.sh with minimal input to verify counter behavior. + // We need python3, bash, and a valid project dir to test the full flow. + // We use ECC_SKIP_OBSERVE=0 and minimal JSON so observe.sh processes but + // exits before signaling (no observer PID running). + + const testDir = createTempDir(); + const projectDir = path.join(testDir, 'project'); + fs.mkdirSync(projectDir, { recursive: true }); + + // Create a minimal detect-project.sh that sets required vars + const skillRoot = path.join(testDir, 'skill'); + const scriptsDir = path.join(skillRoot, 'scripts'); + const hooksDir = path.join(skillRoot, 'hooks'); + fs.mkdirSync(scriptsDir, { recursive: true }); + fs.mkdirSync(hooksDir, { recursive: true }); + + // Minimal detect-project.sh stub + fs.writeFileSync(path.join(scriptsDir, 'detect-project.sh'), [ + '#!/bin/bash', + `PROJECT_ID="test-project"`, + `PROJECT_NAME="test-project"`, + `PROJECT_ROOT="${projectDir}"`, + `PROJECT_DIR="${projectDir}"`, + `CLV2_PYTHON_CMD="${process.platform === 'win32' ? 'python' : 'python3'}"`, + '' + ].join('\n')); + + // Copy observe.sh but patch SKILL_ROOT to our test dir + let observeContent = fs.readFileSync(observeShPath, 'utf8'); + observeContent = observeContent.replace( + 'SKILL_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"', + `SKILL_ROOT="${skillRoot}"` + ); + const testObserve = path.join(hooksDir, 'observe.sh'); + fs.writeFileSync(testObserve, observeContent, { mode: 0o755 }); + + const hookInput = JSON.stringify({ + tool_name: 'Read', + tool_input: { file_path: '/tmp/test.txt' }, + session_id: 'test-session', + cwd: projectDir + }); + + // Run observe.sh twice + for (let i = 0; i < 2; i++) { + spawnSync('bash', [testObserve, 'post'], { + input: hookInput, + env: { + ...process.env, + HOME: testDir, + CLAUDE_CODE_ENTRYPOINT: 'cli', + ECC_HOOK_PROFILE: 'standard', + ECC_SKIP_OBSERVE: '0', + CLAUDE_PROJECT_DIR: projectDir + }, + timeout: 5000 + }); + } + + const counterFile = path.join(projectDir, '.observer-signal-counter'); + if (fs.existsSync(counterFile)) { + const val = fs.readFileSync(counterFile, 'utf8').trim(); + const counterVal = parseInt(val, 10); + assert.ok( + counterVal >= 1 && counterVal <= 2, + `Counter should be 1 or 2 after 2 calls, got ${counterVal}` + ); + } else { + // If python3 is not available the hook exits early - that is acceptable + const hasPython = spawnSync('python3', ['--version']).status === 0; + if (hasPython) { + assert.fail('Counter file should exist after running observe.sh'); + } + } + + cleanupDir(testDir); +}); + +// ────────────────────────────────────────────────────── +// Summary +// ────────────────────────────────────────────────────── + +console.log('\n=== Test Results ==='); +console.log(`Passed: ${passed}`); +console.log(`Failed: ${failed}`); +console.log(`Total: ${passed + failed}\n`); + +process.exit(failed > 0 ? 1 : 0);