mirror of
https://github.com/affaan-m/everything-claude-code.git
synced 2026-04-01 06:33:27 +08:00
fix(continuous-learning-v2): observer background process crashes immediately (#312)
* fix(continuous-learning-v2): observer background process crashes immediately

  Three bugs prevent the observer from running:

  1. Nested session detection: When launched from a Claude Code session, the child process inherits CLAUDECODE env var, causing `claude` CLI to refuse with "cannot be launched inside another session". Fix: unset CLAUDECODE in the background process.
  2. set -e kills the loop: The parent script's `set -e` is inherited by the subshell. When `claude` exits non-zero (e.g. max turns reached), the entire observer loop dies. Fix: `set +e` in the background process.
  3. Subshell dies when parent exits: `( ... ) & disown` loses IO handles when the parent shell exits, killing the background process. Fix: use `nohup /bin/bash -c '...'` for full detachment, and `sleep & wait` to allow SIGUSR1 to interrupt sleep without killing the process.

  Additionally, the prompt for Haiku now includes the exact instinct file format inline (YAML frontmatter with id/trigger/confidence/domain/source fields), since the previous prompt referenced "the observer agent spec" which Haiku could not actually read, resulting in instinct files that the CLI parser could not parse.

* fix: address review feedback on observer process management

  - Use `env` to pass variables to child process instead of quote-splicing, avoiding shell injection risk from special chars in paths
  - Add USR1_FIRED flag to prevent double analysis when SIGUSR1 interrupts the sleep/wait cycle
  - Track SLEEP_PID and kill it in both TERM trap and USR1 handler to prevent orphaned sleep processes from accumulating
  - Consolidate cleanup logic into a dedicated cleanup() function

* fix: guard PID file cleanup against race condition on restart

  Only remove PID file in cleanup trap if it still belongs to the current process, preventing a restarted observer from losing its PID file when the old process exits.
This commit is contained in:
@@ -14,6 +14,9 @@
|
||||
|
||||
set -e
|
||||
|
||||
# NOTE: set -e is disabled (set +e) inside the detached background process below
|
||||
# to prevent claude CLI failures from killing the observer loop.
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# Project detection
|
||||
# ─────────────────────────────────────────────
|
||||
@@ -132,12 +135,37 @@ case "${1:-start}" in
|
||||
|
||||
echo "Starting observer agent for ${PROJECT_NAME}..."
|
||||
|
||||
# The observer loop
|
||||
(
|
||||
trap 'rm -f "$PID_FILE"; exit 0' TERM INT
|
||||
# The observer loop — fully detached with nohup, IO redirected to log.
|
||||
# Variables passed safely via env to avoid shell injection from special chars in paths.
|
||||
nohup env \
|
||||
CONFIG_DIR="$CONFIG_DIR" \
|
||||
PID_FILE="$PID_FILE" \
|
||||
LOG_FILE="$LOG_FILE" \
|
||||
OBSERVATIONS_FILE="$OBSERVATIONS_FILE" \
|
||||
INSTINCTS_DIR="$INSTINCTS_DIR" \
|
||||
PROJECT_DIR="$PROJECT_DIR" \
|
||||
PROJECT_NAME="$PROJECT_NAME" \
|
||||
PROJECT_ID="$PROJECT_ID" \
|
||||
MIN_OBSERVATIONS="$MIN_OBSERVATIONS" \
|
||||
OBSERVER_INTERVAL_SECONDS="$OBSERVER_INTERVAL_SECONDS" \
|
||||
/bin/bash -c '
|
||||
set +e
|
||||
unset CLAUDECODE
|
||||
|
||||
SLEEP_PID=""
|
||||
USR1_FIRED=0
|
||||
|
||||
cleanup() {
|
||||
[ -n "$SLEEP_PID" ] && kill "$SLEEP_PID" 2>/dev/null
|
||||
# Only remove PID file if it still belongs to this process
|
||||
if [ -f "$PID_FILE" ] && [ "$(cat "$PID_FILE" 2>/dev/null)" = "$$" ]; then
|
||||
rm -f "$PID_FILE"
|
||||
fi
|
||||
exit 0
|
||||
}
|
||||
trap cleanup TERM INT
|
||||
|
||||
analyze_observations() {
|
||||
# Only analyze if observations file exists and has enough entries
|
||||
if [ ! -f "$OBSERVATIONS_FILE" ]; then
|
||||
return
|
||||
fi
|
||||
@@ -149,36 +177,45 @@ case "${1:-start}" in
|
||||
echo "[$(date)] Analyzing $obs_count observations for project ${PROJECT_NAME}..." >> "$LOG_FILE"
|
||||
|
||||
# Use Claude Code with Haiku to analyze observations
|
||||
# The prompt now specifies project-scoped instinct creation
|
||||
# The prompt specifies project-scoped instinct creation
|
||||
if command -v claude &> /dev/null; then
|
||||
exit_code=0
|
||||
claude --model haiku --print \
|
||||
"Read $OBSERVATIONS_FILE and identify patterns for the project '${PROJECT_NAME}'.
|
||||
If you find 3+ occurrences of the same pattern, create an instinct file in $INSTINCTS_DIR/ following this format:
|
||||
claude --model haiku --max-turns 3 --print \
|
||||
"Read $OBSERVATIONS_FILE and identify patterns for the project '${PROJECT_NAME}' (user corrections, error resolutions, repeated workflows, tool preferences).
|
||||
If you find 3+ occurrences of the same pattern, create an instinct file in $INSTINCTS_DIR/<id>.md.
|
||||
|
||||
CRITICAL: Every instinct file MUST use this exact format:
|
||||
|
||||
---
|
||||
id: <kebab-case-id>
|
||||
trigger: \"<when this happens>\"
|
||||
confidence: <0.3-0.9>
|
||||
domain: <code-style|testing|git|debugging|workflow|etc>
|
||||
id: kebab-case-name
|
||||
trigger: \"when <specific condition>\"
|
||||
confidence: <0.3-0.85 based on frequency: 3-5 times=0.5, 6-10=0.7, 11+=0.85>
|
||||
domain: <one of: code-style, testing, git, debugging, workflow, file-patterns>
|
||||
source: session-observation
|
||||
scope: project
|
||||
project_id: ${PROJECT_ID}
|
||||
project_name: ${PROJECT_NAME}
|
||||
---
|
||||
|
||||
# <Title>
|
||||
# Title
|
||||
|
||||
## Action
|
||||
<What to do>
|
||||
<what to do, one clear sentence>
|
||||
|
||||
## Evidence
|
||||
<What observations led to this>
|
||||
- Observed N times in session <id>
|
||||
- Pattern: <description>
|
||||
- Last observed: <date>
|
||||
|
||||
Be conservative - only create instincts for clear patterns.
|
||||
If a pattern seems universal (not project-specific), set scope to 'global' instead of 'project'.
|
||||
Examples of global patterns: 'always validate user input', 'prefer explicit error handling'.
|
||||
Examples of project patterns: 'use React functional components', 'follow Django REST framework conventions'." \
|
||||
Rules:
|
||||
- Be conservative, only clear patterns with 3+ observations
|
||||
- Use narrow, specific triggers
|
||||
- Never include actual code snippets, only describe patterns
|
||||
- If a similar instinct already exists in $INSTINCTS_DIR/, update it instead of creating a duplicate
|
||||
- The YAML frontmatter (between --- markers) with id field is MANDATORY
|
||||
- If a pattern seems universal (not project-specific), set scope to 'global' instead of 'project'
|
||||
- Examples of global patterns: 'always validate user input', 'prefer explicit error handling'
|
||||
- Examples of project patterns: 'use React functional components', 'follow Django REST framework conventions'" \
|
||||
>> "$LOG_FILE" 2>&1 || exit_code=$?
|
||||
if [ "$exit_code" -ne 0 ]; then
|
||||
echo "[$(date)] Claude analysis failed (exit $exit_code)" >> "$LOG_FILE"
|
||||
@@ -187,7 +224,6 @@ Examples of project patterns: 'use React functional components', 'follow Django
|
||||
echo "[$(date)] claude CLI not found, skipping analysis" >> "$LOG_FILE"
|
||||
fi
|
||||
|
||||
# Archive processed observations
|
||||
if [ -f "$OBSERVATIONS_FILE" ]; then
|
||||
archive_dir="${PROJECT_DIR}/observations.archive"
|
||||
mkdir -p "$archive_dir"
|
||||
@@ -195,28 +231,46 @@ Examples of project patterns: 'use React functional components', 'follow Django
|
||||
fi
|
||||
}
|
||||
|
||||
# Handle SIGUSR1 for on-demand analysis
|
||||
trap 'analyze_observations' USR1
|
||||
on_usr1() {
|
||||
# Kill pending sleep to avoid leak, then analyze
|
||||
[ -n "$SLEEP_PID" ] && kill "$SLEEP_PID" 2>/dev/null
|
||||
SLEEP_PID=""
|
||||
USR1_FIRED=1
|
||||
analyze_observations
|
||||
}
|
||||
trap on_usr1 USR1
|
||||
|
||||
echo "$$" > "$PID_FILE"
|
||||
echo "[$(date)] Observer started for ${PROJECT_NAME} (PID: $$)" >> "$LOG_FILE"
|
||||
|
||||
while true; do
|
||||
# Check at configured interval (default: 5 minutes)
|
||||
sleep "$OBSERVER_INTERVAL_SECONDS"
|
||||
# Interruptible sleep — allows USR1 trap to fire immediately
|
||||
sleep "$OBSERVER_INTERVAL_SECONDS" &
|
||||
SLEEP_PID=$!
|
||||
wait $SLEEP_PID 2>/dev/null
|
||||
SLEEP_PID=""
|
||||
|
||||
analyze_observations
|
||||
# Skip scheduled analysis if USR1 already ran it
|
||||
if [ "$USR1_FIRED" -eq 1 ]; then
|
||||
USR1_FIRED=0
|
||||
else
|
||||
analyze_observations
|
||||
fi
|
||||
done
|
||||
) &
|
||||
' >> "$LOG_FILE" 2>&1 &
|
||||
|
||||
disown
|
||||
|
||||
# Wait a moment for PID file
|
||||
sleep 1
|
||||
# Wait for PID file
|
||||
sleep 2
|
||||
|
||||
if [ -f "$PID_FILE" ]; then
|
||||
echo "Observer started (PID: $(cat "$PID_FILE"))"
|
||||
echo "Log: $LOG_FILE"
|
||||
pid=$(cat "$PID_FILE")
|
||||
if kill -0 "$pid" 2>/dev/null; then
|
||||
echo "Observer started (PID: $pid)"
|
||||
echo "Log: $LOG_FILE"
|
||||
else
|
||||
echo "Failed to start observer (process died immediately, check $LOG_FILE)"
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
echo "Failed to start observer"
|
||||
exit 1
|
||||
|
||||
Reference in New Issue
Block a user