feat: self-improving skills loop — observe, inspect, amend, evaluate

- Add skill health observation layer (execution logging, success/failure tracking)
- Add skill health inspector (trace recurring failures across runs)
- Add amendify mechanism (propose SKILL.md patches from failure evidence)
- Add evaluation scaffolding (compare amended vs original performance)
- Wire into session-inspect CLI: skills:health, skills:amendify, skills:evaluate
- 1145/1145 tests passing (+3 new)
2026-05-20 07:43:07 +08:00 · 2026-03-14 23:21:18 -07:00
parent 2b2777915e
commit c53bba9e02
7 changed files with 753 additions and 6 deletions
--- a/tests/lib/skill-improvement.test.js
+++ b/tests/lib/skill-improvement.test.js
@@ -0,0 +1,186 @@
+'use strict';
+
+const assert = require('assert');
+const fs = require('fs');
+const os = require('os');
+const path = require('path');
+
+const {
+  appendSkillObservation,
+  createSkillObservation,
+  getSkillObservationsPath,
+  readSkillObservations
+} = require('../../scripts/lib/skill-improvement/observations');
+const { buildSkillHealthReport } = require('../../scripts/lib/skill-improvement/health');
+const { proposeSkillAmendment } = require('../../scripts/lib/skill-improvement/amendify');
+const { buildSkillEvaluationScaffold } = require('../../scripts/lib/skill-improvement/evaluate');
+
+console.log('=== Testing skill-improvement ===\n'); // banner printed once, before any case runs
+
+let passed = 0; // incremented by test() when a case returns without throwing
+let failed = 0; // incremented by test() when a case throws; a non-zero value makes the script exit 1
+
+function test(name, fn) { // runs one named case and updates the passed/failed tallies; returns nothing
+  try {
+    fn(); // called without await, so fn must do its assertions synchronously
+    console.log(`  ✓ ${name}`);
+    passed += 1;
+  } catch (error) { // anything thrown inside the try (e.g. a failed assert) marks the case as failed
+    console.log(`  ✗ ${name}: ${error.message}`); // only the message is printed, not the stack
+    failed += 1;
+  }
+}
+
+function makeProjectRoot(prefix) { // creates a fresh directory under the OS temp dir and returns its path
+  return fs.mkdtempSync(path.join(os.tmpdir(), prefix)); // mkdtempSync appends random characters to the prefix
+}
+
+function cleanup(dirPath) { // deletes dirPath together with everything beneath it
+  fs.rmSync(dirPath, { recursive: true, force: true }); // force: no error if the path no longer exists
+}
+
+test('observation layer writes and reads structured skill outcomes', () => { // round-trips one failed observation
+  const projectRoot = makeProjectRoot('ecc-skill-observe-');
+
+  try {
+    const observation = createSkillObservation({
+      task: 'Fix flaky Playwright test',
+      skill: {
+        id: 'e2e-testing',
+        path: 'skills/e2e-testing/SKILL.md'
+      },
+      success: false,
+      error: 'playwright timeout',
+      feedback: 'Timed out waiting for locator',
+      sessionId: 'sess-1234'
+    });
+
+    appendSkillObservation(observation, { projectRoot });
+    const records = readSkillObservations({ projectRoot });
+
+    assert.strictEqual(records.length, 1); // exactly the one record appended above
+    assert.strictEqual(records[0].schemaVersion, 'ecc.skill-observation.v1');
+    assert.strictEqual(records[0].task, 'Fix flaky Playwright test');
+    assert.strictEqual(records[0].skill.id, 'e2e-testing');
+    assert.strictEqual(records[0].outcome.success, false); // flat success/error inputs are expected back nested under outcome
+    assert.strictEqual(records[0].outcome.error, 'playwright timeout');
+    assert.strictEqual(getSkillObservationsPath({ projectRoot }), path.join(projectRoot, '.claude', 'ecc', 'skills', 'observations.jsonl')); // expected store location relative to the project root
+  } finally {
+    cleanup(projectRoot); // runs even when an assertion above throws
+  }
+});
+
+test('health inspector traces recurring failures for a skill across runs', () => { // fixture: two identical failures plus one success
+  const projectRoot = makeProjectRoot('ecc-skill-health-');
+
+  try {
+    [
+      createSkillObservation({
+        task: 'Ship Next.js auth middleware',
+        skill: { id: 'security-review', path: 'skills/security-review/SKILL.md' },
+        success: false,
+        error: 'missing csrf guidance',
+        feedback: 'Did not mention CSRF'
+      }),
+      createSkillObservation({
+        task: 'Harden Next.js auth middleware',
+        skill: { id: 'security-review', path: 'skills/security-review/SKILL.md' },
+        success: false,
+        error: 'missing csrf guidance',
+        feedback: 'Repeated omission'
+      }),
+      createSkillObservation({
+        task: 'Review payment webhook security',
+        skill: { id: 'security-review', path: 'skills/security-review/SKILL.md' },
+        success: true
+      })
+    ].forEach(record => appendSkillObservation(record, { projectRoot })); // appended one at a time, in array order
+
+    const report = buildSkillHealthReport(readSkillObservations({ projectRoot }), {
+      minFailureCount: 2 // NOTE(review): presumably the threshold for reporting an error as recurring — confirm in health.js
+    });
+    const skill = report.skills.find(entry => entry.skill.id === 'security-review');
+
+    assert.ok(skill, 'security-review should appear in the report');
+    assert.strictEqual(skill.totalRuns, 3); // all three observations above
+    assert.strictEqual(skill.failures, 2); // the two success: false observations
+    assert.strictEqual(skill.status, 'failing');
+    assert.strictEqual(skill.recurringErrors[0].error, 'missing csrf guidance'); // the error string shared by both failures
+    assert.strictEqual(skill.recurringErrors[0].count, 2);
+  } finally {
+    cleanup(projectRoot); // runs even when an assertion above throws
+  }
+});
+
+test('amendify proposes SKILL.md patch content from failure evidence', () => { // records are passed in directly; this case creates no temp project
+  const records = [
+    createSkillObservation({
+      task: 'Add API rate limiting',
+      skill: { id: 'api-design', path: 'skills/api-design/SKILL.md' },
+      success: false,
+      error: 'missing rate limiting guidance',
+      feedback: 'No rate-limit section'
+    }),
+    createSkillObservation({
+      task: 'Design public API error envelopes',
+      skill: { id: 'api-design', path: 'skills/api-design/SKILL.md' },
+      success: false,
+      error: 'missing error response examples',
+      feedback: 'Need explicit examples'
+    })
+  ];
+
+  const proposal = proposeSkillAmendment('api-design', records);
+
+  assert.strictEqual(proposal.schemaVersion, 'ecc.skill-amendment-proposal.v1');
+  assert.strictEqual(proposal.skill.id, 'api-design');
+  assert.strictEqual(proposal.status, 'proposed');
+  assert.ok(proposal.patch.preview.includes('## Failure-Driven Amendments')); // heading expected in the generated preview
+  assert.ok(proposal.patch.preview.includes('rate limiting')); // substring of the first record's error
+  assert.ok(proposal.patch.preview.includes('error response')); // substring of the second record's error
+});
+
+test('evaluation scaffold compares amended and baseline performance', () => { // fixture: baseline succeeds 1 of 2, amended succeeds 2 of 2
+  const records = [
+    createSkillObservation({
+      task: 'Fix flaky login test',
+      skill: { id: 'e2e-testing', path: 'skills/e2e-testing/SKILL.md' },
+      success: false,
+      variant: 'baseline'
+    }),
+    createSkillObservation({
+      task: 'Fix flaky checkout test',
+      skill: { id: 'e2e-testing', path: 'skills/e2e-testing/SKILL.md' },
+      success: true,
+      variant: 'baseline'
+    }),
+    createSkillObservation({
+      task: 'Fix flaky login test',
+      skill: { id: 'e2e-testing', path: 'skills/e2e-testing/SKILL.md' },
+      success: true,
+      variant: 'amended',
+      amendmentId: 'amend-1'
+    }),
+    createSkillObservation({
+      task: 'Fix flaky checkout test',
+      skill: { id: 'e2e-testing', path: 'skills/e2e-testing/SKILL.md' },
+      success: true,
+      variant: 'amended',
+      amendmentId: 'amend-1'
+    })
+  ];
+
+  const evaluation = buildSkillEvaluationScaffold('e2e-testing', records, {
+    amendmentId: 'amend-1', // same id carried by the two amended records above
+    minimumRunsPerVariant: 2 // each variant has exactly two runs in this fixture
+  });
+
+  assert.strictEqual(evaluation.schemaVersion, 'ecc.skill-evaluation.v1');
+  assert.strictEqual(evaluation.baseline.runs, 2);
+  assert.strictEqual(evaluation.amended.runs, 2);
+  assert.strictEqual(evaluation.delta.successRate, 0.5); // consistent with amended 2/2 minus baseline 1/2
+  assert.strictEqual(evaluation.recommendation, 'promote-amendment');
+});
+
+console.log(`\n=== Results: ${passed} passed, ${failed} failed ===`); // final tally of the cases run above
+if (failed > 0) process.exit(1); // exit code 1 when at least one case failed
--- a/tests/scripts/session-inspect.test.js
+++ b/tests/scripts/session-inspect.test.js
@@ -143,6 +143,133 @@ function runTests() {
    }
  })) passed++; else failed++;

+  if (test('inspects skill health from recorded observations', () => {
+    const projectRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'ecc-session-inspect-skills-'));
+    const observationsDir = path.join(projectRoot, '.claude', 'ecc', 'skills');
+    fs.mkdirSync(observationsDir, { recursive: true });
+    fs.writeFileSync(
+      path.join(observationsDir, 'observations.jsonl'),
+      [
+        JSON.stringify({
+          schemaVersion: 'ecc.skill-observation.v1',
+          observationId: 'obs-1',
+          timestamp: '2026-03-14T12:00:00.000Z',
+          task: 'Review auth middleware',
+          skill: { id: 'security-review', path: 'skills/security-review/SKILL.md' },
+          outcome: { success: false, status: 'failure', error: 'missing csrf guidance', feedback: 'Need CSRF coverage' },
+          run: { variant: 'baseline', amendmentId: null, sessionId: 'sess-1' }
+        }),
+        JSON.stringify({
+          schemaVersion: 'ecc.skill-observation.v1',
+          observationId: 'obs-2',
+          timestamp: '2026-03-14T12:05:00.000Z',
+          task: 'Review auth middleware',
+          skill: { id: 'security-review', path: 'skills/security-review/SKILL.md' },
+          outcome: { success: false, status: 'failure', error: 'missing csrf guidance', feedback: null },
+          run: { variant: 'baseline', amendmentId: null, sessionId: 'sess-2' }
+        })
+      ].join('\n') + '\n'
+    );
+
+    try {
+      const result = run(['skills:health'], { cwd: projectRoot });
+      assert.strictEqual(result.code, 0, result.stderr);
+      const payload = JSON.parse(result.stdout);
+      assert.strictEqual(payload.schemaVersion, 'ecc.skill-health.v1');
+      assert.ok(payload.skills.some(skill => skill.skill.id === 'security-review'));
+    } finally {
+      fs.rmSync(projectRoot, { recursive: true, force: true });
+    }
+  })) passed++; else failed++;
+
+  if (test('proposes skill amendments through session-inspect', () => {
+    const projectRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'ecc-session-inspect-amend-'));
+    const observationsDir = path.join(projectRoot, '.claude', 'ecc', 'skills');
+    fs.mkdirSync(observationsDir, { recursive: true });
+    fs.writeFileSync(
+      path.join(observationsDir, 'observations.jsonl'),
+      [
+        JSON.stringify({
+          schemaVersion: 'ecc.skill-observation.v1',
+          observationId: 'obs-1',
+          timestamp: '2026-03-14T12:00:00.000Z',
+          task: 'Add rate limiting',
+          skill: { id: 'api-design', path: 'skills/api-design/SKILL.md' },
+          outcome: { success: false, status: 'failure', error: 'missing rate limiting guidance', feedback: 'Need rate limiting examples' },
+          run: { variant: 'baseline', amendmentId: null, sessionId: 'sess-1' }
+        })
+      ].join('\n') + '\n'
+    );
+
+    try {
+      const result = run(['skills:amendify', '--skill', 'api-design'], { cwd: projectRoot });
+      assert.strictEqual(result.code, 0, result.stderr);
+      const payload = JSON.parse(result.stdout);
+      assert.strictEqual(payload.schemaVersion, 'ecc.skill-amendment-proposal.v1');
+      assert.strictEqual(payload.skill.id, 'api-design');
+      assert.ok(payload.patch.preview.includes('Failure-Driven Amendments'));
+    } finally {
+      fs.rmSync(projectRoot, { recursive: true, force: true });
+    }
+  })) passed++; else failed++;
+
+  if (test('builds skill evaluation scaffolding through session-inspect', () => {
+    const projectRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'ecc-session-inspect-eval-'));
+    const observationsDir = path.join(projectRoot, '.claude', 'ecc', 'skills');
+    fs.mkdirSync(observationsDir, { recursive: true });
+    fs.writeFileSync(
+      path.join(observationsDir, 'observations.jsonl'),
+      [
+        JSON.stringify({
+          schemaVersion: 'ecc.skill-observation.v1',
+          observationId: 'obs-1',
+          timestamp: '2026-03-14T12:00:00.000Z',
+          task: 'Fix flaky login test',
+          skill: { id: 'e2e-testing', path: 'skills/e2e-testing/SKILL.md' },
+          outcome: { success: false, status: 'failure', error: null, feedback: null },
+          run: { variant: 'baseline', amendmentId: null, sessionId: 'sess-1' }
+        }),
+        JSON.stringify({
+          schemaVersion: 'ecc.skill-observation.v1',
+          observationId: 'obs-2',
+          timestamp: '2026-03-14T12:10:00.000Z',
+          task: 'Fix flaky checkout test',
+          skill: { id: 'e2e-testing', path: 'skills/e2e-testing/SKILL.md' },
+          outcome: { success: true, status: 'success', error: null, feedback: null },
+          run: { variant: 'baseline', amendmentId: null, sessionId: 'sess-2' }
+        }),
+        JSON.stringify({
+          schemaVersion: 'ecc.skill-observation.v1',
+          observationId: 'obs-3',
+          timestamp: '2026-03-14T12:20:00.000Z',
+          task: 'Fix flaky login test',
+          skill: { id: 'e2e-testing', path: 'skills/e2e-testing/SKILL.md' },
+          outcome: { success: true, status: 'success', error: null, feedback: null },
+          run: { variant: 'amended', amendmentId: 'amend-1', sessionId: 'sess-3' }
+        }),
+        JSON.stringify({
+          schemaVersion: 'ecc.skill-observation.v1',
+          observationId: 'obs-4',
+          timestamp: '2026-03-14T12:30:00.000Z',
+          task: 'Fix flaky checkout test',
+          skill: { id: 'e2e-testing', path: 'skills/e2e-testing/SKILL.md' },
+          outcome: { success: true, status: 'success', error: null, feedback: null },
+          run: { variant: 'amended', amendmentId: 'amend-1', sessionId: 'sess-4' }
+        })
+      ].join('\n') + '\n'
+    );
+
+    try {
+      const result = run(['skills:evaluate', '--skill', 'e2e-testing', '--amendment-id', 'amend-1'], { cwd: projectRoot });
+      assert.strictEqual(result.code, 0, result.stderr);
+      const payload = JSON.parse(result.stdout);
+      assert.strictEqual(payload.schemaVersion, 'ecc.skill-evaluation.v1');
+      assert.strictEqual(payload.recommendation, 'promote-amendment');
+    } finally {
+      fs.rmSync(projectRoot, { recursive: true, force: true });
+    }
+  })) passed++; else failed++;
+
  console.log(`\nResults: Passed: ${passed}, Failed: ${failed}`);
  process.exit(failed > 0 ? 1 : 0);
 }