fix: observer memory explosion with throttling, re-entrancy guard, and tail sampling (#536)

Three fixes for the positive feedback loop causing runaway memory usage:

1. SIGUSR1 throttling in observe.sh: signal the observer only every 20 observations (configurable via ECC_OBSERVER_SIGNAL_EVERY_N) instead of on every tool call. A counter file tracks invocations.
2. Re-entrancy guard in observer-loop.sh on_usr1(): an ANALYZING flag prevents parallel Claude analysis processes from spawning when signals arrive while an analysis is already running.
3. Cooldown + tail-based sampling in observer-loop.sh:
   - 60s cooldown between analyses (ECC_OBSERVER_ANALYSIS_COOLDOWN)
   - Only the last 500 lines are sent to the LLM (ECC_OBSERVER_MAX_ANALYSIS_LINES) instead of the entire observations file

Closes #521
2026-06-13 19:51:24 +08:00 · 2026-03-16 13:47:25 -07:00
parent bb27dde116
commit f9e8287346
3 changed files with 435 additions and 20 deletions
@@ -1,11 +1,19 @@
 #!/usr/bin/env bash
 # Continuous Learning v2 - Observer background loop
+#
+# Fix for #521: Added re-entrancy guard, cooldown throttle, and
+# tail-based sampling to prevent memory explosion from runaway
+# parallel Claude analysis processes.

 set +e
 unset CLAUDECODE

 SLEEP_PID=""
 USR1_FIRED=0
+ANALYZING=0
+LAST_ANALYSIS_EPOCH=0
+# Minimum seconds between analyses (prevents rapid re-triggering)
+ANALYSIS_COOLDOWN="${ECC_OBSERVER_ANALYSIS_COOLDOWN:-60}"

 cleanup() {
  [ -n "$SLEEP_PID" ] && kill "$SLEEP_PID" 2>/dev/null
@@ -44,9 +52,17 @@ analyze_observations() {
    return
  fi

+  # Sample recent observations instead of loading the entire file (#521).
+  # This prevents multi-MB payloads from being passed to the LLM.
+  MAX_ANALYSIS_LINES="${ECC_OBSERVER_MAX_ANALYSIS_LINES:-500}"
+  analysis_file="$(mktemp "${TMPDIR:-/tmp}/ecc-observer-analysis.XXXXXX.jsonl")"
+  tail -n "$MAX_ANALYSIS_LINES" "$OBSERVATIONS_FILE" > "$analysis_file"
+  analysis_count=$(wc -l < "$analysis_file" 2>/dev/null || echo 0)
+  echo "[$(date)] Using last $analysis_count of $obs_count observations for analysis" >> "$LOG_FILE"
+
  prompt_file="$(mktemp "${TMPDIR:-/tmp}/ecc-observer-prompt.XXXXXX")"
  cat > "$prompt_file" <<PROMPT
-Read ${OBSERVATIONS_FILE} and identify patterns for the project ${PROJECT_NAME} (user corrections, error resolutions, repeated workflows, tool preferences).
+Read ${analysis_file} and identify patterns for the project ${PROJECT_NAME} (user corrections, error resolutions, repeated workflows, tool preferences).
 If you find 3+ occurrences of the same pattern, create an instinct file in ${INSTINCTS_DIR}/<id>.md.

 CRITICAL: Every instinct file MUST use this exact format:
@@ -113,7 +129,7 @@ PROMPT
  wait "$claude_pid"
  exit_code=$?
  kill "$watchdog_pid" 2>/dev/null || true
-  rm -f "$prompt_file"
+  rm -f "$prompt_file" "$analysis_file"

  if [ "$exit_code" -ne 0 ]; then
    echo "[$(date)] Claude analysis failed (exit $exit_code)" >> "$LOG_FILE"
@@ -130,7 +146,25 @@ on_usr1() {
  [ -n "$SLEEP_PID" ] && kill "$SLEEP_PID" 2>/dev/null
  SLEEP_PID=""
  USR1_FIRED=1
+
+  # Re-entrancy guard: skip if analysis is already running (#521)
+  if [ "$ANALYZING" -eq 1 ]; then
+    echo "[$(date)] Analysis already in progress, skipping signal" >> "$LOG_FILE"
+    return
+  fi
+
+  # Cooldown: skip if last analysis was too recent (#521)
+  now_epoch=$(date +%s)
+  elapsed=$(( now_epoch - LAST_ANALYSIS_EPOCH ))
+  if [ "$elapsed" -lt "$ANALYSIS_COOLDOWN" ]; then
+    echo "[$(date)] Analysis cooldown active (${elapsed}s < ${ANALYSIS_COOLDOWN}s), skipping" >> "$LOG_FILE"
+    return
+  fi
+
+  ANALYZING=1
  analyze_observations
+  LAST_ANALYSIS_EPOCH=$(date +%s)
+  ANALYZING=0
 }
 trap on_usr1 USR1

@@ -365,24 +365,45 @@ if [ "$OBSERVER_ENABLED" = "true" ]; then
  fi
 fi

-# Signal observer if running (check both project-scoped and global observer, deduplicate)
-signaled_pids=" "
-for pid_file in "${PROJECT_DIR}/.observer.pid" "${CONFIG_DIR}/.observer.pid"; do
-  if [ -f "$pid_file" ]; then
-    observer_pid=$(cat "$pid_file" 2>/dev/null || true)
-    # Validate PID is a positive integer (>1)
-    case "$observer_pid" in
-      ''|*[!0-9]*|0|1) rm -f "$pid_file" 2>/dev/null || true; continue ;;
-    esac
-    # Deduplicate: skip if already signaled this pass
-    case "$signaled_pids" in
-      *" $observer_pid "*) continue ;;
-    esac
-    if kill -0 "$observer_pid" 2>/dev/null; then
-      kill -USR1 "$observer_pid" 2>/dev/null || true
-      signaled_pids="${signaled_pids}${observer_pid} "
-    fi
+# Throttle SIGUSR1: only signal observer every N observations (#521)
+# This prevents rapid signaling when tool calls fire every second,
+# which caused runaway parallel Claude analysis processes.
+SIGNAL_EVERY_N="${ECC_OBSERVER_SIGNAL_EVERY_N:-20}"
+SIGNAL_COUNTER_FILE="${PROJECT_DIR}/.observer-signal-counter"
+
+should_signal=0
+if [ -f "$SIGNAL_COUNTER_FILE" ]; then
+  counter=$(cat "$SIGNAL_COUNTER_FILE" 2>/dev/null || echo 0)
+  counter=$((counter + 1))
+  if [ "$counter" -ge "$SIGNAL_EVERY_N" ]; then
+    should_signal=1
+    counter=0
  fi
-done
+  echo "$counter" > "$SIGNAL_COUNTER_FILE"
+else
+  echo "1" > "$SIGNAL_COUNTER_FILE"
+fi
+
+# Signal observer if running and throttle allows (check both project-scoped and global observer, deduplicate)
+if [ "$should_signal" -eq 1 ]; then
+  signaled_pids=" "
+  for pid_file in "${PROJECT_DIR}/.observer.pid" "${CONFIG_DIR}/.observer.pid"; do
+    if [ -f "$pid_file" ]; then
+      observer_pid=$(cat "$pid_file" 2>/dev/null || true)
+      # Validate PID is a positive integer (>1)
+      case "$observer_pid" in
+        ''|*[!0-9]*|0|1) rm -f "$pid_file" 2>/dev/null || true; continue ;;
+      esac
+      # Deduplicate: skip if already signaled this pass
+      case "$signaled_pids" in
+        *" $observer_pid "*) continue ;;
+      esac
+      if kill -0 "$observer_pid" 2>/dev/null; then
+        kill -USR1 "$observer_pid" 2>/dev/null || true
+        signaled_pids="${signaled_pids}${observer_pid} "
+      fi
+    fi
+  done
+fi

 exit 0