feat: self-improving skills loop — observe, inspect, amend, evaluate

- Add skill health observation layer (execution logging, success/failure tracking)
- Add skill health inspector (trace recurring failures across runs)
- Add amendify mechanism (propose SKILL.md patches from failure evidence)
- Add evaluation scaffolding (compare amended vs original performance)
- Wire into session-inspect CLI: skills:health, skills:amendify, skills:evaluate
- 1145/1145 tests passing (+3 new)
This commit is contained in:
Affaan Mustafa
2026-03-14 23:21:18 -07:00
parent 2b2777915e
commit c53bba9e02
7 changed files with 753 additions and 6 deletions

View File

@@ -0,0 +1,89 @@
'use strict';
const { buildSkillHealthReport } = require('./health');
// Schema identifier stamped on every object returned by proposeSkillAmendment,
// both for real proposals and for 'insufficient-evidence' results.
const AMENDMENT_SCHEMA_VERSION = 'ecc.skill-amendment-proposal.v1';
/**
 * Builds the identifier for a freshly generated amendment proposal.
 *
 * @param {string} skillId - Skill the proposal targets; embedded verbatim.
 * @returns {string} `amend-<skillId>-<current epoch milliseconds>`.
 */
function createProposalId(skillId) {
  const issuedAtMs = Date.now();
  return `amend-${skillId}-${issuedAtMs}`;
}
/**
 * Renders the markdown fragment suggested as an addition to a skill's SKILL.md.
 *
 * The fragment always contains the heading, a routing bullet and a closing
 * verification-checklist bullet. One extra bullet is added for the top-ranked
 * recurring error, task and feedback entry, when each exists.
 *
 * @param {string} skillId - Skill identifier, shown in backticks in the routing bullet.
 * @param {{recurringErrors: Array<{error: *}>, recurringTasks: Array<{task: *}>,
 *   recurringFeedback: Array<{feedback: *}>}} health - Per-skill health summary;
 *   only the first entry of each list is used.
 * @returns {string} Markdown lines joined with "\n".
 */
function summarizePatchPreview(skillId, health) {
  // Failure text is free-form (it may be a multi-line stack trace or already end
  // in a period). Collapse it onto one line and drop trailing periods so each
  // bullet stays a single markdown list item ending in exactly one ".".
  const toBulletText = value => String(value)
    .replace(/\s+/g, ' ')
    .trim()
    .replace(/\.+$/, '');

  const lines = [
    '## Failure-Driven Amendments',
    '',
    `- Focus skill routing for \`${skillId}\` when tasks match the proven success cases.`,
  ];

  const [topError] = health.recurringErrors;
  if (topError) {
    lines.push(`- Add explicit guardrails for recurring failure: ${toBulletText(topError.error)}.`);
  }

  const [topTask] = health.recurringTasks;
  if (topTask) {
    lines.push(`- Add an example workflow for task pattern: ${toBulletText(topTask.task)}.`);
  }

  const [topFeedback] = health.recurringFeedback;
  if (topFeedback) {
    lines.push(`- Address repeated user feedback: ${toBulletText(topFeedback.feedback)}.`);
  }

  lines.push('- Add a verification checklist before declaring the skill output complete.');
  return lines.join('\n');
}
/**
 * Proposes an additive SKILL.md amendment for one skill based on its failed runs.
 *
 * A health report restricted to `skillId` is built first. When the skill has no
 * observations or no failures, an 'insufficient-evidence' result with a null
 * patch is returned; otherwise a 'proposed' result carrying the evidence and a
 * markdown preview is returned.
 *
 * @param {string} skillId - Skill to analyse.
 * @param {Array<object>} records - Skill observation records.
 * @param {object} [options] - Forwarded to buildSkillHealthReport;
 *   `minFailureCount` falls back to 1 when absent or falsy.
 * @returns {object} Proposal object tagged with AMENDMENT_SCHEMA_VERSION.
 */
function proposeSkillAmendment(skillId, records, options = {}) {
  const reportOptions = {
    ...options,
    skillId,
    minFailureCount: options.minFailureCount || 1
  };
  const health = buildSkillHealthReport(records, reportOptions).skills[0];
  const hasFailureEvidence = Boolean(health) && health.failures !== 0;

  if (!hasFailureEvidence) {
    return {
      schemaVersion: AMENDMENT_SCHEMA_VERSION,
      skill: { id: skillId, path: null },
      status: 'insufficient-evidence',
      rationale: ['No failed observations were available for this skill.'],
      patch: null
    };
  }

  const preview = summarizePatchPreview(skillId, health);
  const knownPath = health.skill.path;

  const evidence = {
    totalRuns: health.totalRuns,
    failures: health.failures,
    successRate: health.successRate,
    recurringErrors: health.recurringErrors,
    recurringTasks: health.recurringTasks,
    recurringFeedback: health.recurringFeedback
  };

  const patch = {
    format: 'markdown-fragment',
    // Fall back to the conventional location when no observation recorded a path.
    targetPath: knownPath || `skills/${skillId}/SKILL.md`,
    preview
  };

  return {
    schemaVersion: AMENDMENT_SCHEMA_VERSION,
    proposalId: createProposalId(skillId),
    generatedAt: new Date().toISOString(),
    status: 'proposed',
    skill: { id: skillId, path: knownPath || null },
    evidence,
    rationale: [
      'Proposals are generated from repeated failed runs rather than a single anecdotal error.',
      'The suggested patch is additive so the original SKILL.md intent remains auditable.'
    ],
    patch
  };
}
// Public surface of the amendment module: the schema tag and the proposal builder.
module.exports = {
AMENDMENT_SCHEMA_VERSION,
proposeSkillAmendment
};

View File

@@ -0,0 +1,59 @@
'use strict';
// Schema identifier stamped on every evaluation scaffold returned by this module.
const EVALUATION_SCHEMA_VERSION = 'ecc.skill-evaluation.v1';
/**
 * Rounds a number to three decimal places.
 *
 * @param {number} value - Number to round.
 * @returns {number} `value` rounded to the nearest thousandth.
 */
function roundRate(value) {
  const thousandths = Math.round(value * 1000);
  return thousandths / 1000;
}
/**
 * Tallies runs, successes and failures for a list of observation records.
 *
 * A record counts as a success only when `record.outcome.success` is truthy;
 * every other record, including one with no `outcome`, counts as a failure.
 *
 * @param {Array<object>} records - Observation records.
 * @returns {{runs: number, successes: number, failures: number, successRate: number}}
 *   Counts plus the success rate rounded by roundRate (0 when there are no runs).
 */
function summarize(records) {
  const runs = records.length;
  const successes = records.reduce((count, record) => {
    const succeeded = record.outcome && record.outcome.success;
    return succeeded ? count + 1 : count;
  }, 0);

  let successRate = 0;
  if (runs > 0) {
    successRate = roundRate(successes / runs);
  }

  return { runs, successes, failures: runs - successes, successRate };
}
/**
 * Compares baseline and amended runs of one skill and recommends an action.
 *
 * Records for other skills are ignored. A record is "amended" when
 * `run.variant === 'amended'`; every other record for the skill is baseline.
 * When `options.amendmentId` is given, amended records carrying a different
 * amendment id are left out of both groups.
 *
 * @param {string} skillId - Skill to evaluate.
 * @param {Array<object>} records - Skill observation records.
 * @param {object} [options]
 * @param {number} [options.minimumRunsPerVariant] - Runs required in each group
 *   before a recommendation is made; falls back to 2 when absent or falsy.
 * @param {string} [options.amendmentId] - Restricts amended runs to this amendment.
 * @returns {object} Evaluation tagged with EVALUATION_SCHEMA_VERSION, whose
 *   `recommendation` is 'insufficient-data', 'promote-amendment' or 'keep-baseline'.
 */
function buildSkillEvaluationScaffold(skillId, records, options = {}) {
  const minimumRunsPerVariant = options.minimumRunsPerVariant || 2;
  const amendmentId = options.amendmentId || null;

  const baseline = [];
  const amended = [];
  records
    .filter(record => record.skill && record.skill.id === skillId)
    .forEach(record => {
      const isAmended = Boolean(record.run) && record.run.variant === 'amended';
      if (!isAmended) {
        baseline.push(record);
        return;
      }
      if (!amendmentId || record.run.amendmentId === amendmentId) {
        amended.push(record);
      }
    });

  const baselineSummary = summarize(baseline);
  const amendedSummary = summarize(amended);
  const delta = {
    successRate: roundRate(amendedSummary.successRate - baselineSummary.successRate),
    failures: amendedSummary.failures - baselineSummary.failures
  };

  // Only recommend once both groups have reached the minimum sample size.
  const hasEnoughRuns = baselineSummary.runs >= minimumRunsPerVariant
    && amendedSummary.runs >= minimumRunsPerVariant;
  let recommendation;
  if (!hasEnoughRuns) {
    recommendation = 'insufficient-data';
  } else if (delta.successRate > 0) {
    recommendation = 'promote-amendment';
  } else {
    recommendation = 'keep-baseline';
  }

  return {
    schemaVersion: EVALUATION_SCHEMA_VERSION,
    generatedAt: new Date().toISOString(),
    skillId,
    amendmentId,
    gate: { minimumRunsPerVariant },
    baseline: baselineSummary,
    amended: amendedSummary,
    delta,
    recommendation
  };
}
// Public surface of the evaluation module: the schema tag and the scaffold builder.
module.exports = {
EVALUATION_SCHEMA_VERSION,
buildSkillEvaluationScaffold
};

View File

@@ -0,0 +1,118 @@
'use strict';
// Schema identifier stamped on every report returned by buildSkillHealthReport.
const HEALTH_SCHEMA_VERSION = 'ecc.skill-health.v1';
/**
 * Rounds a rate to the nearest thousandth.
 *
 * @param {number} value - Number to round.
 * @returns {number} `value` with at most three decimal places.
 */
function roundRate(value) {
  const scaled = value * 1000;
  return Math.round(scaled) / 1000;
}
/**
 * Turns a value -> count Map into a list ranked by count.
 *
 * Entries are ordered by descending count; ties are broken by comparing the
 * values as strings with localeCompare.
 *
 * @param {Map<*, number>} values - Occurrence counts keyed by value.
 * @returns {Array<{value: *, count: number}>} Ranked entries; values keep their
 *   original type.
 */
function rankCounts(values) {
  return Array.from(values.entries())
    .map(([value, count]) => ({ value, count }))
    .sort((left, right) => {
      const byCount = right.count - left.count;
      if (byCount !== 0) {
        return byCount;
      }
      // Keys come from observation records that are not type-checked on read,
      // so coerce before comparing: a non-string key has no localeCompare.
      return String(left.value).localeCompare(String(right.value));
    });
}
/**
 * Counts runs, successes and failures per run variant.
 *
 * A record's variant is `record.run.variant` when that is truthy, otherwise
 * 'baseline'. A record counts as a success only when `record.outcome.success`
 * is truthy.
 *
 * @param {Array<object>} records - Observation records.
 * @returns {Object<string, {runs: number, successes: number, failures: number}>}
 *   Plain object keyed by variant name, in first-seen order.
 */
function summarizeVariantRuns(records) {
  // Tally in a Map rather than a plain object: variant names are free-form, and
  // names such as "constructor" or "__proto__" would otherwise resolve to
  // inherited members, so the counters would land on Object / Object.prototype
  // instead of in the result.
  const tallies = new Map();

  records.forEach(record => {
    const key = String(record.run && record.run.variant ? record.run.variant : 'baseline');
    let tally = tallies.get(key);
    if (!tally) {
      tally = { runs: 0, successes: 0, failures: 0 };
      tallies.set(key, tally);
    }
    tally.runs += 1;
    if (record.outcome && record.outcome.success) {
      tally.successes += 1;
    } else {
      tally.failures += 1;
    }
  });

  // Object.fromEntries defines own data properties, so even "__proto__" becomes
  // a regular key of the returned object.
  return Object.fromEntries(tallies);
}
/**
 * Classifies a skill from its failure count.
 *
 * @param {{failures: number}} skillSummary - Per-skill summary.
 * @param {object} [options]
 * @param {number} [options.minFailureCount] - Failures needed for 'failing';
 *   falls back to 2 when absent or falsy.
 * @returns {'failing'|'watch'|'healthy'} 'failing' at or above the threshold,
 *   'watch' for any smaller non-zero failure count, otherwise 'healthy'.
 */
function deriveSkillStatus(skillSummary, options = {}) {
  const failingThreshold = options.minFailureCount || 2;
  const { failures } = skillSummary;

  if (failures >= failingThreshold) {
    return 'failing';
  }
  return failures > 0 ? 'watch' : 'healthy';
}
/**
 * Aggregates skill observations into a per-skill health report.
 *
 * For every skill the report holds run/success/failure counts, a rounded
 * success rate, a status from deriveSkillStatus, ranked lists of the errors,
 * tasks and feedback seen on failed runs, and per-variant counts. Skills are
 * ordered by descending failure count, then by id.
 *
 * Records that carry no `skill.id` cannot be attributed to a skill. They are
 * skipped and are not included in `totalObservations`.
 *
 * @param {Array<object>} records - Skill observation records.
 * @param {object} [options]
 * @param {string} [options.skillId] - When set, only this skill is reported.
 * @param {number} [options.minFailureCount] - Forwarded to deriveSkillStatus.
 * @returns {{schemaVersion: string, generatedAt: string, totalObservations: number,
 *   skillCount: number, skills: Array<object>}} The health report.
 */
function buildSkillHealthReport(records, options = {}) {
  const filterSkillId = options.skillId || null;

  // Apply the same attribution guard whether or not a skill filter is set, so a
  // single malformed record cannot crash the unfiltered report.
  const filtered = records.filter(record => {
    const recordSkillId = record && record.skill ? record.skill.id : null;
    if (!recordSkillId) {
      return false;
    }
    return filterSkillId === null || recordSkillId === filterSkillId;
  });

  const grouped = new Map();
  filtered.forEach(record => {
    const skillId = record.skill.id;
    if (!grouped.has(skillId)) {
      grouped.set(skillId, []);
    }
    grouped.get(skillId).push(record);
  });

  const increment = (counts, key) => {
    counts.set(key, (counts.get(key) || 0) + 1);
  };

  const skills = Array.from(grouped.entries())
    .map(([skillId, skillRecords]) => {
      const successes = skillRecords.filter(record => record.outcome && record.outcome.success).length;
      const failures = skillRecords.length - successes;
      const recurringErrors = new Map();
      const recurringTasks = new Map();
      const recurringFeedback = new Map();

      skillRecords.forEach(record => {
        // Only failed runs that carry an outcome contribute recurring evidence.
        // A record without an outcome is counted in `failures` above but has
        // nothing to record here.
        if (!record.outcome || record.outcome.success) {
          return;
        }
        if (record.outcome.error) {
          increment(recurringErrors, record.outcome.error);
        }
        if (record.task) {
          increment(recurringTasks, record.task);
        }
        if (record.outcome.feedback) {
          increment(recurringFeedback, record.outcome.feedback);
        }
      });

      const summary = {
        skill: {
          id: skillId,
          // The path is taken from the first record seen for this skill.
          path: skillRecords[0].skill.path || null
        },
        totalRuns: skillRecords.length,
        successes,
        failures,
        successRate: skillRecords.length > 0 ? roundRate(successes / skillRecords.length) : 0,
        status: 'healthy',
        recurringErrors: rankCounts(recurringErrors).map(entry => ({ error: entry.value, count: entry.count })),
        recurringTasks: rankCounts(recurringTasks).map(entry => ({ task: entry.value, count: entry.count })),
        recurringFeedback: rankCounts(recurringFeedback).map(entry => ({ feedback: entry.value, count: entry.count })),
        variants: summarizeVariantRuns(skillRecords)
      };
      summary.status = deriveSkillStatus(summary, options);
      return summary;
    })
    // Ids are compared as strings so a non-string id cannot break the tie-break.
    .sort((left, right) => right.failures - left.failures
      || String(left.skill.id).localeCompare(String(right.skill.id)));

  return {
    schemaVersion: HEALTH_SCHEMA_VERSION,
    generatedAt: new Date().toISOString(),
    totalObservations: filtered.length,
    skillCount: skills.length,
    skills
  };
}
// Public surface of the health module: the schema tag and the report builder.
module.exports = {
HEALTH_SCHEMA_VERSION,
buildSkillHealthReport
};

View File

@@ -0,0 +1,108 @@
'use strict';
const fs = require('fs');
const path = require('path');
const os = require('os');
// Schema identifier written into every observation; readSkillObservations
// discards any stored record whose schemaVersion differs from it.
const OBSERVATION_SCHEMA_VERSION = 'ecc.skill-observation.v1';
/**
 * Resolves the absolute project root used for skill telemetry.
 *
 * @param {object} [options]
 * @param {string} [options.projectRoot] - Preferred root.
 * @param {string} [options.cwd] - Used when `projectRoot` is absent or empty.
 * @returns {string} Absolute path; the process working directory when neither
 *   option is provided.
 */
function resolveProjectRoot(options = {}) {
  const { projectRoot, cwd } = options;
  const base = projectRoot || cwd || process.cwd();
  return path.resolve(base);
}
/**
 * Returns the directory that holds skill telemetry for a project.
 *
 * @param {object} [options] - Passed to resolveProjectRoot.
 * @returns {string} `<project root>/.claude/ecc/skills`.
 */
function getSkillTelemetryRoot(options = {}) {
  const projectRoot = resolveProjectRoot(options);
  const telemetrySegments = ['.claude', 'ecc', 'skills'];
  return path.join(projectRoot, ...telemetrySegments);
}
/**
 * Returns the path of the JSON Lines file that stores skill observations.
 *
 * @param {object} [options] - Passed to getSkillTelemetryRoot.
 * @returns {string} `<telemetry root>/observations.jsonl`.
 */
function getSkillObservationsPath(options = {}) {
  const telemetryRoot = getSkillTelemetryRoot(options);
  return path.join(telemetryRoot, 'observations.jsonl');
}
/**
 * Validates that a value is a string with non-whitespace content.
 *
 * @param {*} value - Candidate value.
 * @param {string} label - Field name used in the error message.
 * @returns {string} The trimmed string.
 * @throws {Error} When `value` is not a string or is blank after trimming.
 */
function ensureString(value, label) {
  const trimmed = typeof value === 'string' ? value.trim() : '';
  if (trimmed.length === 0) {
    throw new Error(`${label} must be a non-empty string`);
  }
  return trimmed;
}
/**
 * Generates an identifier for a new observation.
 *
 * @returns {string} `obs-<epoch milliseconds>-<process id>-<up to six random hex digits>`.
 */
function createObservationId() {
  const timestamp = Date.now();
  const randomHex = Math.random().toString(16).slice(2, 8);
  return `obs-${timestamp}-${process.pid}-${randomHex}`;
}
/**
 * Normalizes caller input into a structured skill observation record.
 *
 * @param {object} input
 * @param {string} input.task - Task description; required, trimmed.
 * @param {{id: string, path?: string}} input.skill - `id` is required and trimmed;
 *   `path` is trimmed, or null when missing or blank.
 * @param {*} [input.success] - Coerced to a boolean.
 * @param {*} [input.error] - Stringified, or null when null/undefined.
 * @param {*} [input.feedback] - Stringified, or null when null/undefined.
 * @param {string} [input.variant] - Trimmed; 'baseline' when missing or blank.
 * @param {string} [input.observationId] - Used as-is when a non-empty string,
 *   otherwise a new id is generated.
 * @param {string} [input.timestamp] - Used as-is when a non-empty string,
 *   otherwise the current time in ISO format.
 * @param {*} [input.amendmentId] - Stored in `run`; null when falsy.
 * @param {*} [input.sessionId] - Stored in `run`; null when falsy.
 * @param {*} [input.source] - Stored in `run`; 'manual' when falsy.
 * @returns {object} Observation tagged with OBSERVATION_SCHEMA_VERSION.
 * @throws {Error} When `task` or `skill.id` is not a non-empty string.
 */
function createSkillObservation(input) {
  const trimmedOrNull = value => {
    if (typeof value !== 'string') {
      return null;
    }
    const trimmed = value.trim();
    return trimmed.length > 0 ? trimmed : null;
  };
  const nonEmptyOrNull = value => (typeof value === 'string' && value.length > 0 ? value : null);
  const stringOrNull = value => (value == null ? null : String(value));

  const task = ensureString(input.task, 'task');
  const skillId = ensureString(input.skill && input.skill.id, 'skill.id');
  const skillPath = trimmedOrNull(input.skill.path);
  const success = Boolean(input.success);
  const error = stringOrNull(input.error);
  const feedback = stringOrNull(input.feedback);
  const variant = trimmedOrNull(input.variant) || 'baseline';

  const observationId = nonEmptyOrNull(input.observationId) || createObservationId();
  const timestamp = nonEmptyOrNull(input.timestamp) || new Date().toISOString();

  return {
    schemaVersion: OBSERVATION_SCHEMA_VERSION,
    observationId,
    timestamp,
    task,
    skill: { id: skillId, path: skillPath },
    outcome: {
      success,
      status: success ? 'success' : 'failure',
      error,
      feedback
    },
    run: {
      variant,
      amendmentId: input.amendmentId || null,
      sessionId: input.sessionId || null,
      source: input.source || 'manual'
    }
  };
}
/**
 * Appends one observation as a JSON line to the project's observations file.
 *
 * The parent directory is created first when it does not exist.
 *
 * @param {object} observation - Record to serialize with JSON.stringify.
 * @param {object} [options] - Passed to getSkillObservationsPath.
 * @returns {string} Path of the file that was appended to.
 */
function appendSkillObservation(observation, options = {}) {
  const outputPath = getSkillObservationsPath(options);
  const parentDir = path.dirname(outputPath);
  fs.mkdirSync(parentDir, { recursive: true });

  const line = `${JSON.stringify(observation)}${os.EOL}`;
  fs.appendFileSync(outputPath, line, 'utf8');
  return outputPath;
}
/**
 * Loads stored skill observations from a JSON Lines file.
 *
 * Blank lines, lines that are not valid JSON, and records whose `schemaVersion`
 * is not OBSERVATION_SCHEMA_VERSION are skipped without raising an error.
 *
 * @param {object} [options]
 * @param {string} [options.observationsPath] - Explicit file to read; when absent
 *   the path comes from getSkillObservationsPath(options).
 * @returns {Array<object>} Parsed observations in file order; empty when the
 *   file does not exist.
 */
function readSkillObservations(options = {}) {
  const observationPath = path.resolve(options.observationsPath || getSkillObservationsPath(options));
  if (!fs.existsSync(observationPath)) {
    return [];
  }

  const observations = [];
  const lines = fs.readFileSync(observationPath, 'utf8').split(/\r?\n/);
  for (const line of lines) {
    if (!line) {
      continue;
    }
    let record;
    try {
      record = JSON.parse(line);
    } catch {
      // Best-effort read: an unparseable line is dropped rather than failing the load.
      continue;
    }
    if (record && record.schemaVersion === OBSERVATION_SCHEMA_VERSION) {
      observations.push(record);
    }
  }
  return observations;
}
// Public surface of the observation module: the schema tag, the record factory,
// the append/read helpers and the path resolvers they are built on.
module.exports = {
OBSERVATION_SCHEMA_VERSION,
appendSkillObservation,
createSkillObservation,
getSkillObservationsPath,
getSkillTelemetryRoot,
readSkillObservations,
resolveProjectRoot
};

View File

@@ -5,6 +5,10 @@ const fs = require('fs');
const path = require('path');
const { createAdapterRegistry, inspectSessionTarget } = require('./lib/session-adapters/registry');
const { readSkillObservations } = require('./lib/skill-improvement/observations');
const { buildSkillHealthReport } = require('./lib/skill-improvement/health');
const { proposeSkillAmendment } = require('./lib/skill-improvement/amendify');
const { buildSkillEvaluationScaffold } = require('./lib/skill-improvement/evaluate');
function usage() {
console.log([
@@ -18,12 +22,17 @@ function usage() {
' claude:latest Most recent Claude session history entry',
' claude:<id|alias> Specific Claude session or alias',
' <session.tmp> Direct path to a Claude session file',
' skills:health Inspect skill failure/success patterns from observations',
' skills:amendify Propose a SKILL.md patch from failure evidence',
' skills:evaluate Compare baseline vs amended skill outcomes',
'',
'Examples:',
' node scripts/session-inspect.js .claude/plan/workflow.json',
' node scripts/session-inspect.js workflow-visual-proof',
' node scripts/session-inspect.js claude:latest',
' node scripts/session-inspect.js latest --target-type claude-history',
' node scripts/session-inspect.js skills:health',
' node scripts/session-inspect.js skills:amendify --skill api-design',
' node scripts/session-inspect.js claude:a1b2c3d4 --write /tmp/session.json'
].join('\n'));
}
@@ -39,14 +48,57 @@ function parseArgs(argv) {
const targetTypeIndex = args.indexOf('--target-type');
const targetType = targetTypeIndex >= 0 ? args[targetTypeIndex + 1] : null;
const skillIndex = args.indexOf('--skill');
const skillId = skillIndex >= 0 ? args[skillIndex + 1] : null;
const amendmentIndex = args.indexOf('--amendment-id');
const amendmentId = amendmentIndex >= 0 ? args[amendmentIndex + 1] : null;
const observationsIndex = args.indexOf('--observations');
const observationsPath = observationsIndex >= 0 ? args[observationsIndex + 1] : null;
const writeIndex = args.indexOf('--write');
const writePath = writeIndex >= 0 ? args[writeIndex + 1] : null;
return { target, adapterId, targetType, writePath, listAdapters };
return { target, adapterId, targetType, writePath, listAdapters, skillId, amendmentId, observationsPath };
}
/**
 * Handles the `skills:*` inspection targets.
 *
 * Returns null for any other target so the caller can fall back to regular
 * session inspection. The target and its required arguments are checked before
 * the observations file is read, so non-skill targets never touch that file.
 *
 * @param {string} target - CLI target, e.g. 'skills:health'.
 * @param {object} [options]
 * @param {string} [options.cwd] - Project directory used to locate observations.
 * @param {string} [options.observationsPath] - Explicit observations file.
 * @param {string} [options.skillId] - Skill to inspect; required for
 *   'skills:amendify' and 'skills:evaluate', optional filter for 'skills:health'.
 * @param {string} [options.amendmentId] - Amendment filter for 'skills:evaluate'.
 * @returns {object|null} The health report, amendment proposal or evaluation
 *   scaffold, or null when `target` is not a skills target.
 * @throws {Error} When a target that needs `--skill` is called without one.
 */
function inspectSkillLoopTarget(target, options = {}) {
  const skillTargets = ['skills:health', 'skills:amendify', 'skills:evaluate'];
  if (!skillTargets.includes(target)) {
    return null;
  }

  // Validate arguments before doing any I/O so usage errors surface first.
  if (target !== 'skills:health' && !options.skillId) {
    throw new Error(`${target} requires --skill <id>`);
  }

  const observations = readSkillObservations({
    cwd: options.cwd,
    projectRoot: options.cwd,
    observationsPath: options.observationsPath
  });

  if (target === 'skills:health') {
    return buildSkillHealthReport(observations, {
      skillId: options.skillId || null
    });
  }

  if (target === 'skills:amendify') {
    return proposeSkillAmendment(options.skillId, observations);
  }

  return buildSkillEvaluationScaffold(options.skillId, observations, {
    amendmentId: options.amendmentId || null
  });
}
function main() {
const { target, adapterId, targetType, writePath, listAdapters } = parseArgs(process.argv);
const { target, adapterId, targetType, writePath, listAdapters, skillId, amendmentId, observationsPath } = parseArgs(process.argv);
if (listAdapters) {
const registry = createAdapterRegistry();
@@ -59,12 +111,20 @@ function main() {
process.exit(1);
}
const inspectTarget = targetType ? { type: targetType, value: target } : target;
const snapshot = inspectSessionTarget(inspectTarget, {
const skillLoopPayload = inspectSkillLoopTarget(target, {
cwd: process.cwd(),
skillId,
amendmentId,
observationsPath
});
const payloadObject = skillLoopPayload || inspectSessionTarget(
targetType ? { type: targetType, value: target } : target,
{
cwd: process.cwd(),
adapterId
});
const payload = JSON.stringify(snapshot, null, 2);
}
);
const payload = JSON.stringify(payloadObject, null, 2);
if (writePath) {
const absoluteWritePath = path.resolve(writePath);

View File

@@ -0,0 +1,186 @@
'use strict';
const assert = require('assert');
const fs = require('fs');
const os = require('os');
const path = require('path');
const {
appendSkillObservation,
createSkillObservation,
getSkillObservationsPath,
readSkillObservations
} = require('../../scripts/lib/skill-improvement/observations');
const { buildSkillHealthReport } = require('../../scripts/lib/skill-improvement/health');
const { proposeSkillAmendment } = require('../../scripts/lib/skill-improvement/amendify');
const { buildSkillEvaluationScaffold } = require('../../scripts/lib/skill-improvement/evaluate');
console.log('=== Testing skill-improvement ===\n');
// Running tallies updated by test() and reported after all cases have run.
let passed = 0;
let failed = 0;
/**
 * Runs one synchronous test case and records the outcome.
 *
 * A passing case logs its name and increments `passed`; a throwing case logs
 * `<name>: <error message>` and increments `failed`.
 *
 * @param {string} name - Test description.
 * @param {Function} fn - Test body; throwing marks the case as failed.
 */
function test(name, fn) {
  let succeeded = true;
  let failureLine;
  try {
    fn();
  } catch (error) {
    succeeded = false;
    failureLine = `${name}: ${error.message}`;
  }

  if (succeeded) {
    console.log(`${name}`);
    passed += 1;
  } else {
    console.log(failureLine);
    failed += 1;
  }
}
/**
 * Creates a fresh temporary directory to act as a project root.
 *
 * @param {string} prefix - Directory name prefix inside the OS temp directory.
 * @returns {string} Path of the newly created directory.
 */
function makeProjectRoot(prefix) {
  const template = path.join(os.tmpdir(), prefix);
  return fs.mkdtempSync(template);
}
/**
 * Removes a directory tree; a path that does not exist is ignored.
 *
 * @param {string} dirPath - Directory to delete recursively.
 */
function cleanup(dirPath) {
  const removalOptions = { recursive: true, force: true };
  fs.rmSync(dirPath, removalOptions);
}
// Round-trips a single failed observation through append + read under a
// temporary project root, then checks the stored fields and the default
// location of the observations file.
test('observation layer writes and reads structured skill outcomes', () => {
const projectRoot = makeProjectRoot('ecc-skill-observe-');
try {
const observation = createSkillObservation({
task: 'Fix flaky Playwright test',
skill: {
id: 'e2e-testing',
path: 'skills/e2e-testing/SKILL.md'
},
success: false,
error: 'playwright timeout',
feedback: 'Timed out waiting for locator',
sessionId: 'sess-1234'
});
appendSkillObservation(observation, { projectRoot });
const records = readSkillObservations({ projectRoot });
assert.strictEqual(records.length, 1);
assert.strictEqual(records[0].schemaVersion, 'ecc.skill-observation.v1');
assert.strictEqual(records[0].task, 'Fix flaky Playwright test');
assert.strictEqual(records[0].skill.id, 'e2e-testing');
assert.strictEqual(records[0].outcome.success, false);
assert.strictEqual(records[0].outcome.error, 'playwright timeout');
assert.strictEqual(getSkillObservationsPath({ projectRoot }), path.join(projectRoot, '.claude', 'ecc', 'skills', 'observations.jsonl'));
} finally {
// Always remove the temporary project root, even when an assertion fails.
cleanup(projectRoot);
}
});
// Records two failures that share the same error plus one success for
// 'security-review'. With minFailureCount 2 the skill must be reported as
// 'failing', and the shared error must rank first with a count of 2.
test('health inspector traces recurring failures for a skill across runs', () => {
const projectRoot = makeProjectRoot('ecc-skill-health-');
try {
[
createSkillObservation({
task: 'Ship Next.js auth middleware',
skill: { id: 'security-review', path: 'skills/security-review/SKILL.md' },
success: false,
error: 'missing csrf guidance',
feedback: 'Did not mention CSRF'
}),
createSkillObservation({
task: 'Harden Next.js auth middleware',
skill: { id: 'security-review', path: 'skills/security-review/SKILL.md' },
success: false,
error: 'missing csrf guidance',
feedback: 'Repeated omission'
}),
createSkillObservation({
task: 'Review payment webhook security',
skill: { id: 'security-review', path: 'skills/security-review/SKILL.md' },
success: true
})
].forEach(record => appendSkillObservation(record, { projectRoot }));
const report = buildSkillHealthReport(readSkillObservations({ projectRoot }), {
minFailureCount: 2
});
const skill = report.skills.find(entry => entry.skill.id === 'security-review');
assert.ok(skill, 'security-review should appear in the report');
assert.strictEqual(skill.totalRuns, 3);
assert.strictEqual(skill.failures, 2);
assert.strictEqual(skill.status, 'failing');
assert.strictEqual(skill.recurringErrors[0].error, 'missing csrf guidance');
assert.strictEqual(skill.recurringErrors[0].count, 2);
} finally {
// Always remove the temporary project root, even when an assertion fails.
cleanup(projectRoot);
}
});
// Builds a proposal from two in-memory failures of 'api-design' (no file I/O)
// and checks the schema tag, status and that the patch preview contains the
// amendment heading plus text drawn from the failure evidence.
test('amendify proposes SKILL.md patch content from failure evidence', () => {
const records = [
createSkillObservation({
task: 'Add API rate limiting',
skill: { id: 'api-design', path: 'skills/api-design/SKILL.md' },
success: false,
error: 'missing rate limiting guidance',
feedback: 'No rate-limit section'
}),
createSkillObservation({
task: 'Design public API error envelopes',
skill: { id: 'api-design', path: 'skills/api-design/SKILL.md' },
success: false,
error: 'missing error response examples',
feedback: 'Need explicit examples'
})
];
const proposal = proposeSkillAmendment('api-design', records);
assert.strictEqual(proposal.schemaVersion, 'ecc.skill-amendment-proposal.v1');
assert.strictEqual(proposal.skill.id, 'api-design');
assert.strictEqual(proposal.status, 'proposed');
assert.ok(proposal.patch.preview.includes('## Failure-Driven Amendments'));
assert.ok(proposal.patch.preview.includes('rate limiting'));
assert.ok(proposal.patch.preview.includes('error response'));
});
// Baseline: one failure and one success. Amended ('amend-1'): two successes.
// With two runs per variant the gate is met, so the expected success-rate
// delta is 0.5 and the recommendation is 'promote-amendment'.
test('evaluation scaffold compares amended and baseline performance', () => {
const records = [
createSkillObservation({
task: 'Fix flaky login test',
skill: { id: 'e2e-testing', path: 'skills/e2e-testing/SKILL.md' },
success: false,
variant: 'baseline'
}),
createSkillObservation({
task: 'Fix flaky checkout test',
skill: { id: 'e2e-testing', path: 'skills/e2e-testing/SKILL.md' },
success: true,
variant: 'baseline'
}),
createSkillObservation({
task: 'Fix flaky login test',
skill: { id: 'e2e-testing', path: 'skills/e2e-testing/SKILL.md' },
success: true,
variant: 'amended',
amendmentId: 'amend-1'
}),
createSkillObservation({
task: 'Fix flaky checkout test',
skill: { id: 'e2e-testing', path: 'skills/e2e-testing/SKILL.md' },
success: true,
variant: 'amended',
amendmentId: 'amend-1'
})
];
const evaluation = buildSkillEvaluationScaffold('e2e-testing', records, {
amendmentId: 'amend-1',
minimumRunsPerVariant: 2
});
assert.strictEqual(evaluation.schemaVersion, 'ecc.skill-evaluation.v1');
assert.strictEqual(evaluation.baseline.runs, 2);
assert.strictEqual(evaluation.amended.runs, 2);
assert.strictEqual(evaluation.delta.successRate, 0.5);
assert.strictEqual(evaluation.recommendation, 'promote-amendment');
});
// Print the final tally; exit with status 1 when at least one case failed.
console.log(`\n=== Results: ${passed} passed, ${failed} failed ===`);
if (failed > 0) process.exit(1);

View File

@@ -143,6 +143,133 @@ function runTests() {
}
})) passed++; else failed++;
if (test('inspects skill health from recorded observations', () => {
const projectRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'ecc-session-inspect-skills-'));
const observationsDir = path.join(projectRoot, '.claude', 'ecc', 'skills');
fs.mkdirSync(observationsDir, { recursive: true });
fs.writeFileSync(
path.join(observationsDir, 'observations.jsonl'),
[
JSON.stringify({
schemaVersion: 'ecc.skill-observation.v1',
observationId: 'obs-1',
timestamp: '2026-03-14T12:00:00.000Z',
task: 'Review auth middleware',
skill: { id: 'security-review', path: 'skills/security-review/SKILL.md' },
outcome: { success: false, status: 'failure', error: 'missing csrf guidance', feedback: 'Need CSRF coverage' },
run: { variant: 'baseline', amendmentId: null, sessionId: 'sess-1' }
}),
JSON.stringify({
schemaVersion: 'ecc.skill-observation.v1',
observationId: 'obs-2',
timestamp: '2026-03-14T12:05:00.000Z',
task: 'Review auth middleware',
skill: { id: 'security-review', path: 'skills/security-review/SKILL.md' },
outcome: { success: false, status: 'failure', error: 'missing csrf guidance', feedback: null },
run: { variant: 'baseline', amendmentId: null, sessionId: 'sess-2' }
})
].join('\n') + '\n'
);
try {
const result = run(['skills:health'], { cwd: projectRoot });
assert.strictEqual(result.code, 0, result.stderr);
const payload = JSON.parse(result.stdout);
assert.strictEqual(payload.schemaVersion, 'ecc.skill-health.v1');
assert.ok(payload.skills.some(skill => skill.skill.id === 'security-review'));
} finally {
fs.rmSync(projectRoot, { recursive: true, force: true });
}
})) passed++; else failed++;
if (test('proposes skill amendments through session-inspect', () => {
const projectRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'ecc-session-inspect-amend-'));
const observationsDir = path.join(projectRoot, '.claude', 'ecc', 'skills');
fs.mkdirSync(observationsDir, { recursive: true });
fs.writeFileSync(
path.join(observationsDir, 'observations.jsonl'),
[
JSON.stringify({
schemaVersion: 'ecc.skill-observation.v1',
observationId: 'obs-1',
timestamp: '2026-03-14T12:00:00.000Z',
task: 'Add rate limiting',
skill: { id: 'api-design', path: 'skills/api-design/SKILL.md' },
outcome: { success: false, status: 'failure', error: 'missing rate limiting guidance', feedback: 'Need rate limiting examples' },
run: { variant: 'baseline', amendmentId: null, sessionId: 'sess-1' }
})
].join('\n') + '\n'
);
try {
const result = run(['skills:amendify', '--skill', 'api-design'], { cwd: projectRoot });
assert.strictEqual(result.code, 0, result.stderr);
const payload = JSON.parse(result.stdout);
assert.strictEqual(payload.schemaVersion, 'ecc.skill-amendment-proposal.v1');
assert.strictEqual(payload.skill.id, 'api-design');
assert.ok(payload.patch.preview.includes('Failure-Driven Amendments'));
} finally {
fs.rmSync(projectRoot, { recursive: true, force: true });
}
})) passed++; else failed++;
if (test('builds skill evaluation scaffolding through session-inspect', () => {
const projectRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'ecc-session-inspect-eval-'));
const observationsDir = path.join(projectRoot, '.claude', 'ecc', 'skills');
fs.mkdirSync(observationsDir, { recursive: true });
fs.writeFileSync(
path.join(observationsDir, 'observations.jsonl'),
[
JSON.stringify({
schemaVersion: 'ecc.skill-observation.v1',
observationId: 'obs-1',
timestamp: '2026-03-14T12:00:00.000Z',
task: 'Fix flaky login test',
skill: { id: 'e2e-testing', path: 'skills/e2e-testing/SKILL.md' },
outcome: { success: false, status: 'failure', error: null, feedback: null },
run: { variant: 'baseline', amendmentId: null, sessionId: 'sess-1' }
}),
JSON.stringify({
schemaVersion: 'ecc.skill-observation.v1',
observationId: 'obs-2',
timestamp: '2026-03-14T12:10:00.000Z',
task: 'Fix flaky checkout test',
skill: { id: 'e2e-testing', path: 'skills/e2e-testing/SKILL.md' },
outcome: { success: true, status: 'success', error: null, feedback: null },
run: { variant: 'baseline', amendmentId: null, sessionId: 'sess-2' }
}),
JSON.stringify({
schemaVersion: 'ecc.skill-observation.v1',
observationId: 'obs-3',
timestamp: '2026-03-14T12:20:00.000Z',
task: 'Fix flaky login test',
skill: { id: 'e2e-testing', path: 'skills/e2e-testing/SKILL.md' },
outcome: { success: true, status: 'success', error: null, feedback: null },
run: { variant: 'amended', amendmentId: 'amend-1', sessionId: 'sess-3' }
}),
JSON.stringify({
schemaVersion: 'ecc.skill-observation.v1',
observationId: 'obs-4',
timestamp: '2026-03-14T12:30:00.000Z',
task: 'Fix flaky checkout test',
skill: { id: 'e2e-testing', path: 'skills/e2e-testing/SKILL.md' },
outcome: { success: true, status: 'success', error: null, feedback: null },
run: { variant: 'amended', amendmentId: 'amend-1', sessionId: 'sess-4' }
})
].join('\n') + '\n'
);
try {
const result = run(['skills:evaluate', '--skill', 'e2e-testing', '--amendment-id', 'amend-1'], { cwd: projectRoot });
assert.strictEqual(result.code, 0, result.stderr);
const payload = JSON.parse(result.stdout);
assert.strictEqual(payload.schemaVersion, 'ecc.skill-evaluation.v1');
assert.strictEqual(payload.recommendation, 'promote-amendment');
} finally {
fs.rmSync(projectRoot, { recursive: true, force: true });
}
})) passed++; else failed++;
console.log(`\nResults: Passed: ${passed}, Failed: ${failed}`);
process.exit(failed > 0 ? 1 : 0);
}