mirror of
https://github.com/affaan-m/everything-claude-code.git
synced 2026-04-05 08:43:29 +08:00
fix: observer memory explosion with throttling, re-entrancy guard, and tail sampling (#536)
Three fixes for the positive feedback loop causing runaway memory usage:
1. SIGUSR1 throttling in observe.sh: Signal observer only every 20
observations (configurable via ECC_OBSERVER_SIGNAL_EVERY_N) instead
of on every tool call. Uses a counter file to track invocations.
2. Re-entrancy guard in observer-loop.sh on_usr1(): ANALYZING flag
prevents parallel Claude analysis processes from spawning when
signals arrive while analysis is already running.
3. Cooldown + tail-based sampling in observer-loop.sh:
- 60s cooldown between analyses (ECC_OBSERVER_ANALYSIS_COOLDOWN)
- Only last 500 lines sent to LLM (ECC_OBSERVER_MAX_ANALYSIS_LINES)
instead of the entire observations file
Closes #521
This commit is contained in:
@@ -1,11 +1,19 @@
|
||||
#!/usr/bin/env bash
|
||||
# Continuous Learning v2 - Observer background loop
|
||||
#
|
||||
# Fix for #521: Added re-entrancy guard, cooldown throttle, and
|
||||
# tail-based sampling to prevent memory explosion from runaway
|
||||
# parallel Claude analysis processes.
|
||||
|
||||
set +e
|
||||
unset CLAUDECODE
|
||||
|
||||
SLEEP_PID=""
|
||||
USR1_FIRED=0
|
||||
ANALYZING=0
|
||||
LAST_ANALYSIS_EPOCH=0
|
||||
# Minimum seconds between analyses (prevents rapid re-triggering)
|
||||
ANALYSIS_COOLDOWN="${ECC_OBSERVER_ANALYSIS_COOLDOWN:-60}"
|
||||
|
||||
cleanup() {
|
||||
[ -n "$SLEEP_PID" ] && kill "$SLEEP_PID" 2>/dev/null
|
||||
@@ -44,9 +52,17 @@ analyze_observations() {
|
||||
return
|
||||
fi
|
||||
|
||||
# Sample recent observations instead of loading the entire file (#521).
|
||||
# This prevents multi-MB payloads from being passed to the LLM.
|
||||
MAX_ANALYSIS_LINES="${ECC_OBSERVER_MAX_ANALYSIS_LINES:-500}"
|
||||
analysis_file="$(mktemp "${TMPDIR:-/tmp}/ecc-observer-analysis.XXXXXX.jsonl")"
|
||||
tail -n "$MAX_ANALYSIS_LINES" "$OBSERVATIONS_FILE" > "$analysis_file"
|
||||
analysis_count=$(wc -l < "$analysis_file" 2>/dev/null || echo 0)
|
||||
echo "[$(date)] Using last $analysis_count of $obs_count observations for analysis" >> "$LOG_FILE"
|
||||
|
||||
prompt_file="$(mktemp "${TMPDIR:-/tmp}/ecc-observer-prompt.XXXXXX")"
|
||||
cat > "$prompt_file" <<PROMPT
|
||||
Read ${OBSERVATIONS_FILE} and identify patterns for the project ${PROJECT_NAME} (user corrections, error resolutions, repeated workflows, tool preferences).
|
||||
Read ${analysis_file} and identify patterns for the project ${PROJECT_NAME} (user corrections, error resolutions, repeated workflows, tool preferences).
|
||||
If you find 3+ occurrences of the same pattern, create an instinct file in ${INSTINCTS_DIR}/<id>.md.
|
||||
|
||||
CRITICAL: Every instinct file MUST use this exact format:
|
||||
@@ -113,7 +129,7 @@ PROMPT
|
||||
wait "$claude_pid"
|
||||
exit_code=$?
|
||||
kill "$watchdog_pid" 2>/dev/null || true
|
||||
rm -f "$prompt_file"
|
||||
rm -f "$prompt_file" "$analysis_file"
|
||||
|
||||
if [ "$exit_code" -ne 0 ]; then
|
||||
echo "[$(date)] Claude analysis failed (exit $exit_code)" >> "$LOG_FILE"
|
||||
@@ -130,7 +146,25 @@ on_usr1() {
|
||||
[ -n "$SLEEP_PID" ] && kill "$SLEEP_PID" 2>/dev/null
|
||||
SLEEP_PID=""
|
||||
USR1_FIRED=1
|
||||
|
||||
# Re-entrancy guard: skip if analysis is already running (#521)
|
||||
if [ "$ANALYZING" -eq 1 ]; then
|
||||
echo "[$(date)] Analysis already in progress, skipping signal" >> "$LOG_FILE"
|
||||
return
|
||||
fi
|
||||
|
||||
# Cooldown: skip if last analysis was too recent (#521)
|
||||
now_epoch=$(date +%s)
|
||||
elapsed=$(( now_epoch - LAST_ANALYSIS_EPOCH ))
|
||||
if [ "$elapsed" -lt "$ANALYSIS_COOLDOWN" ]; then
|
||||
echo "[$(date)] Analysis cooldown active (${elapsed}s < ${ANALYSIS_COOLDOWN}s), skipping" >> "$LOG_FILE"
|
||||
return
|
||||
fi
|
||||
|
||||
ANALYZING=1
|
||||
analyze_observations
|
||||
LAST_ANALYSIS_EPOCH=$(date +%s)
|
||||
ANALYZING=0
|
||||
}
|
||||
trap on_usr1 USR1
|
||||
|
||||
|
||||
Reference in New Issue
Block a user