feat: self-improving skills loop — observe, inspect, amend, evaluate

- Add skill health observation layer (execution logging, success/failure tracking)
- Add skill health inspector (trace recurring failures across runs)
- Add amendify mechanism (propose SKILL.md patches from failure evidence)
- Add evaluation scaffolding (compare amended vs original performance)
- Wire into session-inspect CLI: skills:health, skills:amendify, skills:evaluate
- 1145/1145 tests passing (+3 new)
This commit is contained in:
Affaan Mustafa
2026-03-14 23:21:18 -07:00
parent 2b2777915e
commit c53bba9e02
7 changed files with 753 additions and 6 deletions

View File

@@ -143,6 +143,133 @@ function runTests() {
}
})) passed++; else failed++;
if (test('inspects skill health from recorded observations', () => {
const projectRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'ecc-session-inspect-skills-'));
const observationsDir = path.join(projectRoot, '.claude', 'ecc', 'skills');
fs.mkdirSync(observationsDir, { recursive: true });
fs.writeFileSync(
path.join(observationsDir, 'observations.jsonl'),
[
JSON.stringify({
schemaVersion: 'ecc.skill-observation.v1',
observationId: 'obs-1',
timestamp: '2026-03-14T12:00:00.000Z',
task: 'Review auth middleware',
skill: { id: 'security-review', path: 'skills/security-review/SKILL.md' },
outcome: { success: false, status: 'failure', error: 'missing csrf guidance', feedback: 'Need CSRF coverage' },
run: { variant: 'baseline', amendmentId: null, sessionId: 'sess-1' }
}),
JSON.stringify({
schemaVersion: 'ecc.skill-observation.v1',
observationId: 'obs-2',
timestamp: '2026-03-14T12:05:00.000Z',
task: 'Review auth middleware',
skill: { id: 'security-review', path: 'skills/security-review/SKILL.md' },
outcome: { success: false, status: 'failure', error: 'missing csrf guidance', feedback: null },
run: { variant: 'baseline', amendmentId: null, sessionId: 'sess-2' }
})
].join('\n') + '\n'
);
try {
const result = run(['skills:health'], { cwd: projectRoot });
assert.strictEqual(result.code, 0, result.stderr);
const payload = JSON.parse(result.stdout);
assert.strictEqual(payload.schemaVersion, 'ecc.skill-health.v1');
assert.ok(payload.skills.some(skill => skill.skill.id === 'security-review'));
} finally {
fs.rmSync(projectRoot, { recursive: true, force: true });
}
})) passed++; else failed++;
if (test('proposes skill amendments through session-inspect', () => {
const projectRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'ecc-session-inspect-amend-'));
const observationsDir = path.join(projectRoot, '.claude', 'ecc', 'skills');
fs.mkdirSync(observationsDir, { recursive: true });
fs.writeFileSync(
path.join(observationsDir, 'observations.jsonl'),
[
JSON.stringify({
schemaVersion: 'ecc.skill-observation.v1',
observationId: 'obs-1',
timestamp: '2026-03-14T12:00:00.000Z',
task: 'Add rate limiting',
skill: { id: 'api-design', path: 'skills/api-design/SKILL.md' },
outcome: { success: false, status: 'failure', error: 'missing rate limiting guidance', feedback: 'Need rate limiting examples' },
run: { variant: 'baseline', amendmentId: null, sessionId: 'sess-1' }
})
].join('\n') + '\n'
);
try {
const result = run(['skills:amendify', '--skill', 'api-design'], { cwd: projectRoot });
assert.strictEqual(result.code, 0, result.stderr);
const payload = JSON.parse(result.stdout);
assert.strictEqual(payload.schemaVersion, 'ecc.skill-amendment-proposal.v1');
assert.strictEqual(payload.skill.id, 'api-design');
assert.ok(payload.patch.preview.includes('Failure-Driven Amendments'));
} finally {
fs.rmSync(projectRoot, { recursive: true, force: true });
}
})) passed++; else failed++;
if (test('builds skill evaluation scaffolding through session-inspect', () => {
const projectRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'ecc-session-inspect-eval-'));
const observationsDir = path.join(projectRoot, '.claude', 'ecc', 'skills');
fs.mkdirSync(observationsDir, { recursive: true });
fs.writeFileSync(
path.join(observationsDir, 'observations.jsonl'),
[
JSON.stringify({
schemaVersion: 'ecc.skill-observation.v1',
observationId: 'obs-1',
timestamp: '2026-03-14T12:00:00.000Z',
task: 'Fix flaky login test',
skill: { id: 'e2e-testing', path: 'skills/e2e-testing/SKILL.md' },
outcome: { success: false, status: 'failure', error: null, feedback: null },
run: { variant: 'baseline', amendmentId: null, sessionId: 'sess-1' }
}),
JSON.stringify({
schemaVersion: 'ecc.skill-observation.v1',
observationId: 'obs-2',
timestamp: '2026-03-14T12:10:00.000Z',
task: 'Fix flaky checkout test',
skill: { id: 'e2e-testing', path: 'skills/e2e-testing/SKILL.md' },
outcome: { success: true, status: 'success', error: null, feedback: null },
run: { variant: 'baseline', amendmentId: null, sessionId: 'sess-2' }
}),
JSON.stringify({
schemaVersion: 'ecc.skill-observation.v1',
observationId: 'obs-3',
timestamp: '2026-03-14T12:20:00.000Z',
task: 'Fix flaky login test',
skill: { id: 'e2e-testing', path: 'skills/e2e-testing/SKILL.md' },
outcome: { success: true, status: 'success', error: null, feedback: null },
run: { variant: 'amended', amendmentId: 'amend-1', sessionId: 'sess-3' }
}),
JSON.stringify({
schemaVersion: 'ecc.skill-observation.v1',
observationId: 'obs-4',
timestamp: '2026-03-14T12:30:00.000Z',
task: 'Fix flaky checkout test',
skill: { id: 'e2e-testing', path: 'skills/e2e-testing/SKILL.md' },
outcome: { success: true, status: 'success', error: null, feedback: null },
run: { variant: 'amended', amendmentId: 'amend-1', sessionId: 'sess-4' }
})
].join('\n') + '\n'
);
try {
const result = run(['skills:evaluate', '--skill', 'e2e-testing', '--amendment-id', 'amend-1'], { cwd: projectRoot });
assert.strictEqual(result.code, 0, result.stderr);
const payload = JSON.parse(result.stdout);
assert.strictEqual(payload.schemaVersion, 'ecc.skill-evaluation.v1');
assert.strictEqual(payload.recommendation, 'promote-amendment');
} finally {
fs.rmSync(projectRoot, { recursive: true, force: true });
}
})) passed++; else failed++;
console.log(`\nResults: Passed: ${passed}, Failed: ${failed}`);
process.exit(failed > 0 ? 1 : 0);
}