From adc0f670081311319c621f762a0559dddd62b3ea Mon Sep 17 00:00:00 2001 From: zzzhizhi <77013105+zzzhizhia@users.noreply.github.com> Date: Tue, 3 Mar 2026 14:23:01 +0800 Subject: [PATCH] fix(continuous-learning-v2): observer background process crashes immediately (#312) * fix(continuous-learning-v2): observer background process crashes immediately Three bugs prevent the observer from running: 1. Nested session detection: When launched from a Claude Code session, the child process inherits CLAUDECODE env var, causing `claude` CLI to refuse with "cannot be launched inside another session". Fix: unset CLAUDECODE in the background process. 2. set -e kills the loop: The parent script's `set -e` is inherited by the subshell. When `claude` exits non-zero (e.g. max turns reached), the entire observer loop dies. Fix: `set +e` in the background process. 3. Subshell dies when parent exits: `( ... ) & disown` loses IO handles when the parent shell exits, killing the background process. Fix: use `nohup /bin/bash -c '...'` for full detachment, and `sleep & wait` to allow SIGUSR1 to interrupt sleep without killing the process. Additionally, the prompt for Haiku now includes the exact instinct file format inline (YAML frontmatter with id/trigger/confidence/domain/source fields), since the previous prompt referenced "the observer agent spec" which Haiku could not actually read, resulting in instinct files that the CLI parser could not parse. 
* fix: address review feedback on observer process management - Use `env` to pass variables to child process instead of quote-splicing, avoiding shell injection risk from special chars in paths - Add USR1_FIRED flag to prevent double analysis when SIGUSR1 interrupts the sleep/wait cycle - Track SLEEP_PID and kill it in both TERM trap and USR1 handler to prevent orphaned sleep processes from accumulating - Consolidate cleanup logic into a dedicated cleanup() function * fix: guard PID file cleanup against race condition on restart Only remove PID file in cleanup trap if it still belongs to the current process, preventing a restarted observer from losing its PID file when the old process exits. --- .../agents/start-observer.sh | 118 +++++++++++++----- 1 file changed, 86 insertions(+), 32 deletions(-) diff --git a/skills/continuous-learning-v2/agents/start-observer.sh b/skills/continuous-learning-v2/agents/start-observer.sh index 99b25099..3197b8bc 100755 --- a/skills/continuous-learning-v2/agents/start-observer.sh +++ b/skills/continuous-learning-v2/agents/start-observer.sh @@ -14,6 +14,9 @@ set -e +# NOTE: set -e is disabled inside the background subshell below +# to prevent claude CLI failures from killing the observer loop. + # ───────────────────────────────────────────── # Project detection # ───────────────────────────────────────────── @@ -132,12 +135,37 @@ case "${1:-start}" in echo "Starting observer agent for ${PROJECT_NAME}..." - # The observer loop - ( - trap 'rm -f "$PID_FILE"; exit 0' TERM INT + # The observer loop — fully detached with nohup, IO redirected to log. + # Variables passed safely via env to avoid shell injection from special chars in paths. 
+ nohup env \ + CONFIG_DIR="$CONFIG_DIR" \ + PID_FILE="$PID_FILE" \ + LOG_FILE="$LOG_FILE" \ + OBSERVATIONS_FILE="$OBSERVATIONS_FILE" \ + INSTINCTS_DIR="$INSTINCTS_DIR" \ + PROJECT_DIR="$PROJECT_DIR" \ + PROJECT_NAME="$PROJECT_NAME" \ + PROJECT_ID="$PROJECT_ID" \ + MIN_OBSERVATIONS="$MIN_OBSERVATIONS" \ + OBSERVER_INTERVAL_SECONDS="$OBSERVER_INTERVAL_SECONDS" \ + /bin/bash -c ' + set +e + unset CLAUDECODE + + SLEEP_PID="" + USR1_FIRED=0 + + cleanup() { + [ -n "$SLEEP_PID" ] && kill "$SLEEP_PID" 2>/dev/null + # Only remove PID file if it still belongs to this process + if [ -f "$PID_FILE" ] && [ "$(cat "$PID_FILE" 2>/dev/null)" = "$$" ]; then + rm -f "$PID_FILE" + fi + exit 0 + } + trap cleanup TERM INT analyze_observations() { - # Only analyze if observations file exists and has enough entries if [ ! -f "$OBSERVATIONS_FILE" ]; then return fi @@ -149,36 +177,45 @@ case "${1:-start}" in echo "[$(date)] Analyzing $obs_count observations for project ${PROJECT_NAME}..." >> "$LOG_FILE" # Use Claude Code with Haiku to analyze observations - # The prompt now specifies project-scoped instinct creation + # The prompt specifies project-scoped instinct creation; every literal single quote in it is escaped (close-quote, backslash-quote, reopen-quote) because this whole script is the single-quoted argument of bash -c if command -v claude &> /dev/null; then exit_code=0 - claude --model haiku --print \ - "Read $OBSERVATIONS_FILE and identify patterns for the project '${PROJECT_NAME}'. -If you find 3+ occurrences of the same pattern, create an instinct file in $INSTINCTS_DIR/ following this format: + claude --model haiku --max-turns 3 --print \ + "Read $OBSERVATIONS_FILE and identify patterns for the project '\''${PROJECT_NAME}'\'' (user corrections, error resolutions, repeated workflows, tool preferences). +If you find 3+ occurrences of the same pattern, create an instinct file in $INSTINCTS_DIR/.md. 
+ +CRITICAL: Every instinct file MUST use this exact format: --- -id: -trigger: \"\" -confidence: <0.3-0.9> -domain: +id: kebab-case-name +trigger: \"when \" +confidence: <0.3-0.85 based on frequency: 3-5 times=0.5, 6-10=0.7, 11+=0.85> +domain: source: session-observation scope: project project_id: ${PROJECT_ID} project_name: ${PROJECT_NAME} --- -# +# Title ## Action -<What to do> +<what to do, one clear sentence> ## Evidence -<What observations led to this> +- Observed N times in session <id> +- Pattern: <description> +- Last observed: <date> -Be conservative - only create instincts for clear patterns. -If a pattern seems universal (not project-specific), set scope to 'global' instead of 'project'. -Examples of global patterns: 'always validate user input', 'prefer explicit error handling'. -Examples of project patterns: 'use React functional components', 'follow Django REST framework conventions'." \ +Rules: +- Be conservative, only clear patterns with 3+ observations +- Use narrow, specific triggers +- Never include actual code snippets, only describe patterns +- If a similar instinct already exists in $INSTINCTS_DIR/, update it instead of creating a duplicate +- The YAML frontmatter (between --- markers) with id field is MANDATORY +- If a pattern seems universal (not project-specific), set scope to '\''global'\'' instead of '\''project'\'' +- Examples of global patterns: '\''always validate user input'\'', '\''prefer explicit error handling'\'' +- Examples of project patterns: '\''use React functional components'\'', '\''follow Django REST framework conventions'\''" \ >> "$LOG_FILE" 2>&1 || exit_code=$? 
if [ "$exit_code" -ne 0 ]; then echo "[$(date)] Claude analysis failed (exit $exit_code)" >> "$LOG_FILE" @@ -187,7 +224,6 @@ Examples of project patterns: 'use React functional components', 'follow Django echo "[$(date)] claude CLI not found, skipping analysis" >> "$LOG_FILE" fi - # Archive processed observations if [ -f "$OBSERVATIONS_FILE" ]; then archive_dir="${PROJECT_DIR}/observations.archive" mkdir -p "$archive_dir" @@ -195,28 +231,46 @@ Examples of project patterns: 'use React functional components', 'follow Django fi } - # Handle SIGUSR1 for on-demand analysis - trap 'analyze_observations' USR1 + on_usr1() { + # Kill pending sleep to avoid leak, then analyze + [ -n "$SLEEP_PID" ] && kill "$SLEEP_PID" 2>/dev/null + SLEEP_PID="" + USR1_FIRED=1 + analyze_observations + } + trap on_usr1 USR1 echo "$$" > "$PID_FILE" echo "[$(date)] Observer started for ${PROJECT_NAME} (PID: $$)" >> "$LOG_FILE" while true; do - # Check at configured interval (default: 5 minutes) - sleep "$OBSERVER_INTERVAL_SECONDS" + # Interruptible sleep — allows USR1 trap to fire immediately + sleep "$OBSERVER_INTERVAL_SECONDS" & + SLEEP_PID=$! + wait $SLEEP_PID 2>/dev/null + SLEEP_PID="" - analyze_observations + # Skip scheduled analysis if USR1 already ran it + if [ "$USR1_FIRED" -eq 1 ]; then + USR1_FIRED=0 + else + analyze_observations + fi done - ) & + ' >> "$LOG_FILE" 2>&1 & - disown - - # Wait a moment for PID file - sleep 1 + # Wait for PID file + sleep 2 if [ -f "$PID_FILE" ]; then - echo "Observer started (PID: $(cat "$PID_FILE"))" - echo "Log: $LOG_FILE" + pid=$(cat "$PID_FILE") + if kill -0 "$pid" 2>/dev/null; then + echo "Observer started (PID: $pid)" + echo "Log: $LOG_FILE" + else + echo "Failed to start observer (process died immediately, check $LOG_FILE)" + exit 1 + fi else echo "Failed to start observer" exit 1