Compare commits

...

15 Commits

Author SHA1 Message Date
Affaan Mustafa
3fc22ae751 fix: lock patched fast-uri 2026-05-12 20:20:54 -04:00
Affaan Mustafa
c229b74d41 docs: record AgentShield baseline CLI (#1834) 2026-05-12 20:15:09 -04:00
Affaan Mustafa
be42989746 docs: define AgentShield enterprise roadmap (#1833) 2026-05-12 19:56:12 -04:00
Affaan Mustafa
d2d8cda8b3 docs: record AgentShield PDF export decision (#1832) 2026-05-12 19:28:26 -04:00
Affaan Mustafa
894ee03930 docs: record ECC-Tools evaluator corpus merge (#1831) 2026-05-12 19:12:20 -04:00
Affaan Mustafa
37c27a60fd docs: add deep-analyzer evaluator scenario 2026-05-12 18:52:09 -04:00
Affaan Mustafa
337ced0828 docs: add skill-quality evaluator scenario 2026-05-12 18:36:25 -04:00
Affaan Mustafa
b25d4770f5 docs: add AgentShield policy exception evaluator scenario 2026-05-12 18:19:49 -04:00
Affaan Mustafa
6fbf58d590 ci: keep package manager cache failures non-blocking 2026-05-12 18:03:30 -04:00
Affaan Mustafa
3dddfc8270 docs: add evaluator harness config scenario 2026-05-12 18:03:30 -04:00
Affaan Mustafa
cd90c84c32 docs: add evaluator CI failure scenario (#1826) 2026-05-12 17:44:00 -04:00
Affaan Mustafa
863519eecf docs: add evaluator billing readiness scenario (#1825) 2026-05-12 17:24:34 -04:00
Affaan Mustafa
dcf5668b27 docs: add evaluator rag prototype (#1824) 2026-05-12 17:04:39 -04:00
Affaan Mustafa
f2deedcf3d docs: record clean plugin publication smoke (#1823) 2026-05-12 16:45:54 -04:00
Affaan Mustafa
bfacf37715 docs: record rc1 publication dry-run evidence (#1822) 2026-05-12 16:27:52 -04:00
48 changed files with 2706 additions and 36 deletions

View File

@@ -77,6 +77,7 @@ jobs:
- name: Cache npm
if: matrix.pm == 'npm'
continue-on-error: true
uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
with:
path: ${{ steps.npm-cache-dir.outputs.dir }}
@@ -94,6 +95,7 @@ jobs:
- name: Cache pnpm
if: matrix.pm == 'pnpm'
continue-on-error: true
uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
with:
path: ${{ steps.pnpm-cache-dir.outputs.dir }}
@@ -115,6 +117,7 @@ jobs:
- name: Cache yarn
if: matrix.pm == 'yarn'
continue-on-error: true
uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
with:
path: ${{ steps.yarn-cache-dir.outputs.dir }}
@@ -124,6 +127,7 @@ jobs:
- name: Cache bun
if: matrix.pm == 'bun'
continue-on-error: true
uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
with:
path: ~/.bun/install/cache

View File

@@ -67,6 +67,7 @@ jobs:
- name: Cache npm
if: inputs.package-manager == 'npm'
continue-on-error: true
uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
with:
path: ${{ steps.npm-cache-dir.outputs.dir }}
@@ -84,6 +85,7 @@ jobs:
- name: Cache pnpm
if: inputs.package-manager == 'pnpm'
continue-on-error: true
uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
with:
path: ${{ steps.pnpm-cache-dir.outputs.dir }}
@@ -105,6 +107,7 @@ jobs:
- name: Cache yarn
if: inputs.package-manager == 'yarn'
continue-on-error: true
uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
with:
path: ${{ steps.yarn-cache-dir.outputs.dir }}
@@ -114,6 +117,7 @@ jobs:
- name: Cache bun
if: inputs.package-manager == 'bun'
continue-on-error: true
uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
with:
path: ~/.bun/install/cache

View File

@@ -6,3 +6,17 @@ scripts/release.sh
# Plugin dev notes (not needed by consumers)
.claude-plugin/PLUGIN_SCHEMA_NOTES.md
# Python/test cache artifacts are local build byproducts, not runtime surface
__pycache__/
**/__pycache__/
**/__pycache__/**
*.pyc
*.pyo
*.pyd
**/*.pyc
**/*.pyo
**/*.pyd
*$py.class
.pytest_cache/
**/.pytest_cache/**

View File

@@ -45,6 +45,24 @@ As of 2026-05-12:
rc.1 naming decision: ship as Everything Claude Code (ECC), keep
`ecc-universal` for npm, keep `ecc` for Claude/Codex plugin slugs, and defer
any broader repo/package rename until after the release pipeline is proven.
- `docs/releases/2.0.0-rc.1/publication-evidence-2026-05-12.md` records the
dry-run publication evidence pass: npm pack/publish dry-runs, temp install
smoke, Claude plugin validation/tag preflight, Codex marketplace CLI shape,
OpenCode build, and the remaining approval-gated release blockers.
- A detached clean worktree at
`bfacf37715b39655cbc2c48f12f2a35c67cb0253` verified Claude plugin tag
dry-run without `--force`, local marketplace discovery, temp-home local
install, enabled plugin listing, and clean uninstall for `ecc@ecc`
`2.0.0-rc.1`.
- `docs/architecture/evaluator-rag-prototype.md` and
`examples/evaluator-rag-prototype/` define the first read-only
self-improving harness prototype: scenario specs, traces, reports,
candidate playbooks, verifier results, accepted maintainer-salvage,
billing-readiness, CI-failure-diagnosis, and harness-config-quality
candidates, plus the AgentShield policy-exception scenario and rejected
unsafe candidates.
- The npm package surface now excludes Python bytecode/cache artifacts through
package `files` negation rules and a publish-surface regression test.
- `docs/legacy-artifact-inventory.md` records that no `_legacy-documents-*`
directories exist in the current checkout, inventories the two sibling
workspace-level `_legacy-documents-*` repos as sanitized extraction sources,
@@ -81,6 +99,24 @@ As of 2026-05-12:
scope, expiry, and days-until-expiry reporting; terminal output and GitHub
Action job-summary evidence; README docs; rebuilt action bundles; and
1,708-test validation.
- AgentShield PR #63 exposed baseline drift in the GitHub Action with
`baseline` / `save-baseline` inputs, baseline drift outputs, job-summary
evidence, regression annotations, README/API docs, rebuilt action bundles,
and green remote action/self-scan/Node verification.
- AgentShield PR #64 added the first-class `agentshield baseline write`
CLI command with severity filtering, JSON metadata output, README/API docs,
rebuilt CLI bundle, local TDD coverage, and green remote action/self-scan/Node
verification.
- AgentShield PDF-export decision: defer a native PDF writer for now. The
self-contained HTML executive report remains the exportable buyer artifact
and can be printed to PDF when needed; native PDF generation should wait for
explicit enterprise/compliance demand or a print-fidelity gap in the HTML
report.
- `docs/architecture/agentshield-enterprise-research-roadmap.md` identifies
the next AgentShield enterprise signal: move from scanner/report/policy gate
to a team control plane with baseline drift, evidence packs, multi-harness
adapters, corpus accuracy gates, remediation routing, threat intelligence,
and ECC-Tools/GitHub App integration.
- ECC PR #1778 recovered the useful stale #1413 network/homelab architect-agent
concepts.
- ECC-Tools PR #26 added cost/token-risk predictive follow-ups for AI routing,
@@ -128,6 +164,11 @@ As of 2026-05-12:
follow-up backlog items, preserving GitHub object caps while creating or
reusing Linear issues when `LINEAR_API_KEY` and `LINEAR_TEAM_ID` are
configured.
- ECC-Tools PR #40 added a checked-in evaluator/RAG corpus contract covering
stale-PR salvage, billing readiness, CI failure diagnosis, harness config
quality, AgentShield policy exceptions, skill-quality evidence,
deep-analyzer evidence, and RAG/evaluator comparison evidence, with each
scenario exercising missing-evidence and evidence-backed diffs.
- ECC PR #1803 landed the contributor Quarkus handling branch after maintainer
cleanup, current-`main` alignment, full local validation, and preservation of
the author's removal of incomplete ja-JP and zh-CN Quarkus translations.
@@ -178,12 +219,12 @@ is not complete unless the evidence column exists and has been freshly verified.
| ECC 2.0 preview pack ready | Release docs, quickstart, publication readiness, release notes | `docs/releases/2.0.0-rc.1/` and readiness docs are in-tree | Needs final release evidence |
| Hermes specialized skills included safely | Hermes setup/import docs and sanitized skill surface | Hermes setup and import playbook are public; secrets stay local | Needs final release review |
| Naming and rename readiness | Naming matrix across package/plugin/docs/social surfaces | `docs/releases/2.0.0-rc.1/naming-and-publication-matrix.md` records current package, repo, Claude plugin, Codex plugin, OpenCode, and npm availability evidence | Complete for rc.1; post-rc rename remains future work |
| Claude and Codex plugin publication | Contact/submission path with required artifacts and status | Publication readiness plus naming matrix document local validation and CLI marketplace/tag surfaces | Needs final release-commit plugin tag/install evidence |
| Claude and Codex plugin publication | Contact/submission path with required artifacts and status | Publication readiness, naming matrix, and May 12 dry-run evidence document plugin validation, clean-checkout Claude tag/install smoke, and Codex marketplace CLI shape | Needs explicit approval for real tag/push and marketplace submission |
| Articles, tweets, and announcements | X thread, LinkedIn copy, GitHub release copy, push checklist | Draft launch collateral exists under rc.1 release docs | Needs URL-backed refresh |
| AgentShield enterprise iteration | Policy gates, SARIF, packs, provenance, corpus, HTML reports, exception lifecycle audit | PRs #53, #55-#62 landed with test evidence | Needs PDF/export decision or next enterprise signal |
| ECC Tools next-level app | Billing audit, PR checks, deep analyzer, sync backlog | PRs #26-#39 landed with test evidence | Needs capacity-backed Linear rollout / broader evaluator corpus |
| AgentShield enterprise iteration | Policy gates, SARIF, packs, provenance, corpus, HTML reports, exception lifecycle audit, baseline drift Action/CLI surfaces, enterprise research roadmap | PRs #53, #55-#64 landed with test evidence; native PDF export deferred in favor of self-contained HTML plus print-to-PDF until explicit enterprise demand appears; `docs/architecture/agentshield-enterprise-research-roadmap.md` selects baseline drift as the first control-plane slice | Baseline-drift Action and CLI write surfaces landed; evidence-pack routing remains |
| ECC Tools next-level app | Billing audit, PR checks, deep analyzer, sync backlog, evaluator/RAG corpus | PRs #26-#40 landed with test evidence | Needs capacity-backed Linear rollout |
| GitGuardian/Dependabot/CodeRabbit-style checks | Non-blocking taxonomy and deterministic follow-up checks | ECC-Tools risk taxonomy check plus follow-up signals landed, including Skill Quality, Deep Analyzer Evidence, Analyzer Corpus Evidence, RAG/Evaluator Evidence, and PR Review/Salvage Evidence | Partially complete |
| Harness-agnostic learning system | Audit, adapter matrix, observability, traces, promotion loop | Audit/adapters/observability gates exist | Needs evaluation/RAG prototype |
| Harness-agnostic learning system | Audit, adapter matrix, observability, traces, promotion loop | Audit/adapters/observability gates plus `docs/architecture/evaluator-rag-prototype.md`, `examples/evaluator-rag-prototype/`, and ECC-Tools PR #40 define read-only stale-salvage, billing-readiness, CI-failure-diagnosis, harness-config-quality, AgentShield policy-exception, skill-quality evidence, deep-analyzer evidence, and RAG/evaluator comparison scenarios with trace, report, playbook, verifier, and predictive-check artifacts | Local corpus complete; hosted integration remains future |
| Linear roadmap is detailed | Linear project status plus repo mirror | Repo mirror exists; issue creation was retried on 2026-05-12 and remains blocked by the workspace free issue limit | Needs recurring status updates after each merge batch |
| Flow separation and progress tracking | Flow lanes with owner artifacts and update cadence | This roadmap defines lanes below | Active |
| Realtime Linear sync | Project updates while issue limit is blocked; issues later | ECC-Tools #39 implements opt-in Linear API sync for deferred follow-up backlog items | Needs workspace capacity/config rollout |
@@ -202,9 +243,9 @@ back to the repo evidence and merge commits.
| Queue hygiene and salvage | GitHub PR/issue state, salvage ledger | Append ledger entries for any future stale closures | Every cleanup batch |
| Release and publication | rc.1 release docs, publication readiness doc | Naming matrix and plugin submission/contact checklist | Before any tag |
| Harness OS core | Audit, adapter matrix, observability docs, `ecc2/` | HUD/session-control acceptance spec | Weekly until GA |
| Evaluation and RAG | Reference-set validation, harness audit, traces | Read-only evaluator/RAG prototype design | Before deep analyzer expansion |
| AgentShield enterprise | AgentShield PR evidence and roadmap notes | PDF-export decision or next enterprise signal | After value decision |
| ECC Tools app | ECC-Tools PR evidence, billing audit, risk taxonomy | Capacity-backed Linear rollout or broader evaluator/RAG corpus slice | Next implementation batch |
| Evaluation and RAG | Reference-set validation, harness audit, traces, ECC-Tools corpus | Read-only evaluator/RAG prototype plus stale-salvage, billing-readiness, CI-failure-diagnosis, harness-config-quality, AgentShield policy-exception, skill-quality evidence, deep-analyzer evidence, and RAG/evaluator comparison fixtures | Hosted retrieval/check-run automation plan |
| AgentShield enterprise | AgentShield PR evidence and roadmap notes | Baseline-drift evidence-pack and backlog sync follow-up | Next implementation batch |
| ECC Tools app | ECC-Tools PR evidence, billing audit, risk taxonomy, evaluator/RAG corpus | Capacity-backed Linear rollout | Next implementation batch |
| Linear progress | Linear project status updates and this mirror | Status update with queue/evidence/missing gates | Every significant merge batch |
The project status update should always include:
@@ -319,6 +360,9 @@ Acceptance:
- Enterprise reports include JSON plus self-contained HTML executive output
with risk posture, priority findings, category exposure, and policy-exception
lifecycle evidence in terminal/CI summaries.
- Native PDF export is not a GA blocker unless an enterprise/compliance
workflow requires a generated PDF file instead of the self-contained HTML
report and browser print-to-PDF path.
### 6. ECC Tools Billing, Deep Analysis, PR Checks, And Linear Sync
@@ -340,6 +384,11 @@ Acceptance:
Manifest Integrity, CI/CD Recommendation, Cost/Token Risk, Reference Set
Validation, Deep Analyzer Evidence, RAG/Evaluator Evidence,
PR Review/Salvage Evidence, Skill Quality, and Agent Config Review.
- Evaluator/RAG billing readiness fixture
`examples/evaluator-rag-prototype/billing-marketplace-readiness/` records the
read-only claim-verification path for Marketplace, App, subscription, seat,
entitlement, and plan language before launch copy can treat those claims as
live.
- Cost/token-risk predictive follow-ups flag AI routing, model-call, usage,
quota, and budget changes when budget evidence is missing.
- Reference-set validation follow-ups flag analyzer, skill, agent, command, and
@@ -355,6 +404,10 @@ Acceptance:
- RAG/evaluator follow-ups flag retrieval, embedding, ranking, and evaluator
changes that lack reference-set comparison, golden trace, benchmark, fixture,
or eval-run evidence.
- Evaluator/RAG corpus contract mirrors the local prototype scenarios into
ECC-Tools fixtures and tests for stale-PR salvage, billing readiness,
CI failure diagnosis, harness config quality, AgentShield policy exceptions,
skill-quality evidence, deep-analyzer evidence, and RAG/evaluator comparison.
- PR review/stale-salvage follow-ups flag review, triage, stale-closure, and
pull-request automation changes that lack stale-salvage fixtures,
reviewer-thread cases, or reopen-flow reference evidence.
@@ -392,9 +445,14 @@ Acceptance:
## Next Engineering Slices
1. Decide whether AgentShield PDF export adds value beyond the merged HTML
executive report, corpus benchmark output, and exception lifecycle audit.
1. Finish the AgentShield baseline-drift control-plane slice from
`docs/architecture/agentshield-enterprise-research-roadmap.md`: PR #63
shipped the GitHub Action baseline outputs and job-summary evidence; PR #64
shipped first-class baseline snapshot creation through
`agentshield baseline write`; the remaining work is evidence-pack routing
and ECC-Tools backlog sync integration.
2. Enable/configure the merged Linear backlog sync path after workspace issue
capacity clears or the Linear workspace is upgraded.
3. Expand the evaluator/RAG corpus with real cleanup-batch cases as future
maintainer-owned examples land.
3. Use the ECC-Tools evaluator/RAG corpus as the promotion gate before adding
hosted retrieval, vector storage, model-backed judging, or automated
check-run promotion.

View File

@@ -136,6 +136,13 @@ Repo work:
- `agentshield`: feed prompt-injection and config-risk findings into regression
suites.
Current prototype:
- `docs/architecture/evaluator-rag-prototype.md` defines the read-only
evaluator/RAG artifact contract.
- `examples/evaluator-rag-prototype/` records the first scenario spec, trace,
report, candidate playbook, and verifier result for stale-PR salvage.
Verification:
- read-only prototype that emits a trace, report, candidate playbook, and

View File

@@ -0,0 +1,329 @@
# AgentShield Enterprise Research Roadmap
Generated: 2026-05-12
This is a planning artifact for the next AgentShield enterprise iteration. It
does not modify AgentShield code. The goal is to turn the current scanner,
policy gate, corpus, and reporting surface into a security control plane for
teams running AI coding agents across multiple harnesses.
## Evidence Reviewed
Current AgentShield repository state:
- AgentShield checkout on clean `main`.
- `README.md`, `API.md`, `package.json`, `.github/workflows/*`, and
`src/`/`tests/` module layout.
- Current supported user surfaces: `agentshield scan`, `agentshield init`,
`agentshield miniclaw start`, scanner JSON, MiniClaw API, GitHub Action,
HTML, SARIF, markdown, terminal, and JSON reports.
- Current enterprise-like surfaces: policy packs, GitHub Action policy
enforcement, SARIF policy violations, supply-chain provenance, corpus
benchmark, HTML executive reports, and exception lifecycle audit.
External references checked from official GitHub repos or README sources:
- [stablyai/orca](https://github.com/stablyai/orca): multi-agent IDE,
worktree isolation, live agent status, GitHub integration, diff review, and
notifications.
- [superset-sh/superset](https://github.com/superset-sh/superset): AI-agent
editor with worktree orchestration, built-in diff review, workspace presets,
and universal CLI-agent compatibility.
- [standardagents/dmux](https://github.com/standardagents/dmux): tmux/worktree
multiplexer with lifecycle hooks, multi-agent launches, pane visibility, and
merge/PR workflows.
- [jarrodwatts/claude-hud](https://github.com/jarrodwatts/claude-hud): Claude
Code statusline, context health, tool activity, agent tracking, todo
progress, transcript parsing, and usage telemetry.
- [stanford-iris-lab/meta-harness](https://github.com/stanford-iris-lab/meta-harness):
harness optimization through repeatable tasks, logged proposer interactions,
and evaluated scaffold changes.
- [greyhaven-ai/autocontext](https://github.com/greyhaven-ai/autocontext):
recursive improvement loop with traces, scored generations, playbooks,
persisted knowledge, scenario evaluation, and optional production traces.
- [NousResearch/hermes-agent](https://github.com/NousResearch/hermes-agent):
self-improving skills, memory, session search, multi-platform gateway,
scheduled automation, terminal backends, and trajectory generation.
- [anthropics/claude-code](https://github.com/anthropics/claude-code):
terminal, IDE, GitHub, plugin, permission, MCP, and data-retention surfaces.
- [anomalyco/opencode](https://github.com/anomalyco/opencode): provider-agnostic
open-source coding agent with build/plan agents, desktop beta,
client/server architecture, and LSP support.
- [opencode-ai/opencode](https://github.com/opencode-ai/opencode): earlier
archived Go-based terminal agent with sessions, providers, LSP, file change
tracking, custom commands, and auto-compact.
- [zed-industries/zed](https://github.com/zed-industries/zed): high-performance
multiplayer editor with strict license/compliance CI expectations.
- [aidenybai/ghast](https://github.com/aidenybai/ghast): native terminal
multiplexer built around Ghostty, workspace grouping, split panes, drag/drop,
notifications, and terminal search.
Local Claude Code source inspection:
- Reviewed only non-secret local file/module shape from a private Claude Code
source snapshot.
- Relevant surfaces observed: `tools/`, `utils/permissions/`, `utils/mcp/`,
`utils/hooks/`, `utils/plugins/`, `types/permissions.ts`,
`types/plugin.ts`, `remote/`, `tasks/`, `assistant/sessionHistory.ts`,
and session/history utilities.
- No code was copied. The takeaway is that AgentShield should track permissions,
plugins, MCP, hooks, remote sessions, task/subagent activity, and history as
first-class audit domains rather than treating a `.claude/` tree as the only
source of truth.
## Current AgentShield Position
AgentShield is already more than a static lint tool:
- Rule coverage spans secrets, permissions, hooks, MCP servers, agent configs,
prompt injection, supply chain, taint analysis, sandbox execution, policy
evaluation, runtime repair/status, corpus validation, MiniClaw, and Opus
analysis.
- Reports are usable by humans and machines: terminal, JSON, markdown, HTML,
SARIF, scan logs, and GitHub Action outputs.
- Enterprise hooks exist: policy packs, exception metadata, expiring/expired
exception reporting, SARIF code scanning, and job-summary output.
- Accuracy work is active: `runtimeConfidence`, template/example weighting,
docs-example downgrades, hook-manifest resolution, false-positive audit
guidance, and corpus readiness.
The next iteration should not be "add more regex rules" by default. The higher
leverage move is to make AgentShield remember, compare, route, and enforce
security posture across time, repos, teams, and harnesses.
## Enterprise Gaps
### 1. Organization Baselines And Drift
Enterprise buyers need to know whether a repo, team, or agent fleet is getting
safer or riskier over time. AgentShield has scan logs and baseline comparison
modules, and PR #63 now exposes that drift through GitHub Action inputs,
outputs, annotations, and job-summary evidence. PR #64 adds first-class
baseline snapshot creation through `agentshield baseline write`. The remaining
product surface should make CLI drift summaries, evidence packs, and
owner-ready deltas explicit.
Target capability:
- `agentshield baseline write --path .claude --output agentshield-baseline.json`
- `agentshield scan --baseline agentshield-baseline.json`
- Report sections for new, fixed, unchanged, suppressed, and policy-excepted
findings.
- GitHub Action output that posts "security posture changed" rather than only a
point-in-time grade.
### 2. Multi-Harness Security Adapters
The market is moving toward many parallel agent harnesses, not one tool. Orca,
Superset, dmux, OpenCode, Claude Code, Codex, Gemini, Zed, and terminal
multiplexers all create different security surfaces.
Target capability:
- A small adapter registry for `claude-code`, `opencode`, `codex`, `gemini`,
`zed`, `dmux`, `orca`, `superset`, and `generic-terminal`.
- Each adapter declares config paths, permission concepts, plugin surfaces,
MCP/tooling conventions, history/session surfaces, and CI evidence.
- Report output groups findings by harness and confidence, so template/docs
findings do not look like active runtime exposure.
### 3. Session And Worktree Awareness
Worktree-native orchestrators change the risk model. A team can run many agents
in parallel, each with its own branch, shell, MCP config, and local state.
Target capability:
- Optional scan metadata for branch, worktree path, agent name, session id,
provider, and orchestrator.
- A scan-history table that answers: which worktree introduced a new permission,
which agent run added a risky MCP, which branch relaxed policy, and whether
the final merged branch fixed it.
- A compact "security HUD" summary usable by statuslines, GitHub checks, and
local dashboards.
### 4. Evidence Packs For Buyers And Auditors
HTML reports are the right buyer-facing artifact today; native PDF is deferred.
The deeper need is a portable evidence bundle that can be attached to audits,
security reviews, and customer questionnaires.
Target capability:
- `agentshield scan --evidence-pack out/agentshield-evidence`
- Bundle includes JSON report, HTML report, SARIF, policy evaluation,
exception audit, baseline diff, dependency/provenance summary, and a short
README explaining how to interpret the artifacts.
- Optional redaction mode for secrets, local paths, usernames, and project names.
### 5. Regression Corpus And Reference Sets
Meta-Harness and Autocontext point to the same lesson: improvements need scored
scenarios, traces, and playbooks. AgentShield already has a corpus benchmark,
but enterprise trust needs a curated reference set for false positives,
false negatives, and policy regressions.
Target capability:
- Versioned scenario fixtures for critical rules, false-positive suppressions,
policy exceptions, template/docs examples, plugin manifests, and hook-code
resolution.
- Per-category precision/coverage reporting, not just aggregate readiness.
- A "no accuracy regression" gate that must pass before releases.
- Playbook notes for why a suppression exists and when it should expire.
### 6. Remediation Workflow
Security tools become enterprise-grade when they turn findings into accountable
work without flooding maintainers.
Target capability:
- One-click or CLI-generated remediation branch for safe transforms.
- Policy comments that group findings by owner and risk rather than by file
order.
- GitHub App support for check-run annotations, issue caps, Linear sync, and
deferred backlog export.
- Finding fingerprints that avoid duplicate issues across repeated scans.
### 7. Threat Intelligence And Package Reputation
Agent security depends on MCP packages, plugin repositories, action bundles,
and rapidly changing CLI ecosystems. Static checks need a maintained external
reputation layer.
Target capability:
- A local-first threat-intel cache for known MCP/package risks, CVEs, malware
package names, suspicious install scripts, mutable git dependencies, and
known-good packages.
- Offline deterministic mode remains available.
- Online enrichment is opt-in and produces clear provenance for every external
claim.
### 8. Commercial And Team Controls
AgentShield is already connected conceptually to the ECC Tools GitHub App.
Native GitHub payments make the product path more concrete: free local scans,
paid org policy gates, paid evidence bundles, and paid drift/history.
Target capability:
- Tier-aware GitHub App checks: free static scan, paid org policy enforcement,
paid evidence packs, paid historical drift, and paid deep analysis.
- Seat/team mapping for policy owners and exception approvers.
- Billing readiness checks shared with ECC-Tools so payment state never changes
enforcement behavior silently.
## Recommended Build Order
### Slice 1: Baseline Drift MVP
Implement the smallest enterprise control-plane primitive: compare this scan to
the last accepted baseline.
Artifacts:
- Baseline JSON schema.
- Baseline writer and comparator.
- Terminal and JSON report sections for new/fixed/unchanged findings.
- Tests covering stable fingerprints, fixed findings, new findings, and policy
exception carry-forward.
Why first:
- It reuses existing scan output.
- It improves CLI, GitHub Action, and GitHub App value at once.
- It does not require a hosted service.
### Slice 2: Evidence Pack Bundle
Bundle the existing machine and human reports into a portable audit artifact.
Artifacts:
- `--evidence-pack <dir>` CLI flag.
- Redacted bundle README.
- HTML, JSON, SARIF, policy, exception, and baseline diff files.
- Tests for file layout, redaction, and deterministic output names.
Why second:
- It converts existing reporting work into buyer-ready proof.
- It keeps native PDF deferred while still meeting audit handoff needs.
### Slice 3: Harness Adapter Registry
Make harness support explicit instead of implicit.
Artifacts:
- Adapter metadata for Claude Code, OpenCode, Codex, Gemini, dmux, generic
terminal, and project-local templates.
- Discovery output that reports which adapters matched and why.
- Report grouping by adapter.
- Tests using fixture directories for each adapter.
Why third:
- It aligns AgentShield with ECC's harness-agnostic positioning.
- It creates a stable surface for future Zed, Orca, Superset, and Hermes
integration without pretending all harnesses share Claude's config model.
### Slice 4: Corpus Accuracy Gate
Promote the corpus from a benchmark into a release gate.
Artifacts:
- Per-category corpus report.
- Required category thresholds.
- Regression snapshots for known false-positive suppressions.
- Release checklist entry requiring corpus readiness before publish.
Why fourth:
- It prevents enterprise credibility from degrading as rules expand.
- It creates a durable route for Meta-Harness/Autocontext-style improvement
loops later.
### Slice 5: GitHub App And Linear Sync Wiring
Connect AgentShield findings to ECC-Tools follow-up routing.
Artifacts:
- Finding fingerprints compatible with ECC-Tools issue caps.
- Linear-ready backlog export for baseline drift and policy violations.
- Check-run annotations grouped by owner/risk.
- Tests that ensure repeated scans do not spam duplicate issues.
Why fifth:
- It needs the baseline/fingerprint work from Slice 1.
- It is the bridge from local CLI to paid team workflow.
## Non-Goals For This Iteration
- Native PDF generation, unless buyer/compliance workflows explicitly require
generated PDF instead of HTML plus print-to-PDF.
- Hosted dashboards before the local baseline/evidence/fingerprint contracts are
stable.
- Fine-tuning or model training before deterministic corpus gates and reference
traces exist.
- Broad automated code rewrites for risky findings without explicit,
reviewable transforms and tests.
## Acceptance Gates
The AgentShield enterprise iteration is not complete until these are true:
- Local `npm run typecheck`, `npm run lint`, `npm test`, and `npm run build`
pass from the AgentShield repository root.
- Built CLI smoke tests cover the new flags or report modes.
- GitHub Action self-test covers the new CI-visible output.
- Documentation names the free/local path and the paid/team path separately.
- Evidence produced by the feature is deterministic enough for CI diffing.
- ECC-Tools can consume the finding fingerprints or backlog export without
exceeding GitHub/Linear object caps.
- The GA roadmap and Linear project status link to the merged AgentShield PRs.

View File

@@ -0,0 +1,158 @@
# Evaluator RAG Prototype
ECC 2.0 needs a self-improving harness loop that can learn from real work
without blindly mutating a user's Claude, Codex, OpenCode, dmux, Zed, or
terminal setup. This prototype defines the smallest read-only artifact set for
that loop.
The fixture set lives in
[`examples/evaluator-rag-prototype/`](../../examples/evaluator-rag-prototype/).
It started with the May 2026 stale-PR cleanup and salvage lane because that
lane has real inputs, real accepted work, and real rejected work. The corpus now
also includes a billing/Marketplace readiness scenario so launch copy cannot
treat dry-run release evidence or roadmap intent as live billing state. A
CI-failure diagnosis scenario adds the log-first workflow needed before an
agent proposes fixes for red checks. A harness-config quality scenario keeps
MCP, plugin, hook, command, agent, and adapter recommendations tied to the
adapter matrix before they mutate setup guidance. An AgentShield policy
exception scenario gates security exceptions on SARIF/report evidence, owner
fields, expiry state, and remediation-versus-exception decisions. A
skill-quality evidence scenario requires observed failure or feedback evidence,
working examples, reference-set gaps, and validation commands before a skill
amendment can be promoted. A deep-analyzer evidence scenario requires analyzer
corpus cases, expected-output comparisons, and risk-taxonomy proof before
repository or commit-analysis behavior can change.
## Reference Pressure
- Meta-Harness: treat the harness itself as an experiment with scenario specs,
verifier results, and promoted playbooks.
- Autocontext: store traces, reports, artifacts, and reusable improvements
before changing installed agent assets.
- Claude HUD: expose context, tools, todos, agent activity, checks, and risk so
an evaluator can judge a run after the fact.
- Hermes Agent: keep skills, memories, scheduler-like follow-ups, and terminal
gateway behavior explicit instead of hiding local commands.
- dmux, Orca, Superset, and Ghast: preserve worktree/session state so parallel
agent work can be compared, resumed, or closed cleanly.
- ECC Tools: route evaluator findings into PR comments, check runs, and Linear
backlog items without flooding GitHub.
## Artifact Contract
Every evaluator/RAG run is read-only until a verifier promotes a playbook.
| Artifact | Purpose | Fixture |
| --- | --- | --- |
| Scenario spec | Declares the objective, allowed evidence, forbidden actions, and pass/fail gates. | `scenario.json` |
| Trace | Captures observation, retrieval, proposal, verification, and promotion events. | `trace.json` |
| Report | Summarizes scores, evidence coverage, risks, and recommended next action. | `report.json` |
| Candidate playbook | Describes the maintainer-owned workflow that could be reused later. | `candidate-playbook.md` |
| Verifier result | Accepts or rejects candidates with concrete reasons and rollback notes. | `verifier-result.json` |
The prototype deliberately separates retrieval from action. A run can retrieve
closed PR diffs, Linear status, CI history, and local docs, but it cannot close,
merge, publish, tag, or rewrite configs as part of the evaluator pass.
## Phase Model
1. Observe the current queue, dirty worktrees, branch state, open PRs/issues,
discussions, CI state, and release gates.
2. Retrieve relevant reference evidence: stale-salvage ledger rows, prior
maintainer PRs, current docs, analyzer findings, CI failures, and harness
adapter rules.
3. Propose one or more playbooks with source attribution and expected
validation gates.
4. Verify each playbook against explicit acceptance and rejection rules.
5. Promote only the candidate that improves the scenario without widening blast
radius.
6. Record rollback guidance and unresolved manual-review tails.
## First Scenario
The first scenario is `stale-pr-salvage-maintainer-branch`.
It models the rule Affaan set during the May 2026 cleanup: stale closure is
queue hygiene, not loss of useful work. Useful closed PR work should be ported
into maintainer-owned PRs with attribution/backlinks, while generated churn,
bulk localization, and ambiguous translator work stay out of blind
cherry-picks.
The verifier accepts a maintainer salvage branch that:
- credits source PRs;
- avoids raw private context and personal paths;
- does not import stale bulk localization without translator review;
- records a durable ledger update;
- runs the same validation gates as a normal code, docs, or catalog change;
- leaves release publication actions approval-gated.
The verifier rejects a blind cherry-pick proposal that:
- imports stale translation/doc churn wholesale;
- skips the current catalog/install architecture;
- lacks attribution;
- lacks tests or ledger updates;
- mutates release or plugin publication state.
## Corpus Fixtures
The root fixture files preserve the original
`stale-pr-salvage-maintainer-branch` prototype. Additional scenarios can live in
subdirectories when they reuse the same five-artifact contract.
Current corpus:
- `stale-pr-salvage-maintainer-branch`: recovers useful closed PR work through
maintainer-owned branches with attribution and validation.
- `billing-marketplace-readiness`: verifies billing, App, and Marketplace
launch claims before public copy says they are live.
- `ci-failure-diagnosis`: requires failed-job logs, changed-file scope, and a
named regression command before a CI fix playbook can be promoted.
- `harness-config-quality`: requires adapter state, install/onramp path,
verification commands, risk notes, and config-preservation behavior before a
harness setup recommendation can be promoted.
- `agentshield-policy-exception`: requires AgentShield SARIF or report
evidence, policy-pack source, owner/ticket/scope/expiry fields, and expired
exception enforcement before a policy exception can be promoted.
- `skill-quality-evidence`: requires focused skill scope, observed failure or
user-feedback evidence, examples/reference-set coverage, validation commands,
and publication safety before a skill amendment can be promoted.
- `deep-analyzer-evidence`: requires maintained analyzer corpus cases,
expected-output comparisons, representative repository/commit histories, and
regression commands before deep-analysis behavior can be promoted.
## ECC Tools Mapping
ECC Tools already flags missing RAG/evaluator evidence for retrieval,
embedding, ranking, and evaluator changes. This prototype gives those checks a
target shape:
- `scenario.json` maps to analyzer corpus inputs.
- `trace.json` maps to golden traces and run telemetry.
- `report.json` maps to PR comment summaries and Linear backlog summaries.
- `candidate-playbook.md` maps to the suggested follow-up PR body.
- `verifier-result.json` maps to pass/fail check-run evidence.
Future ECC Tools work should consume these artifacts as fixture shape before it
adds hosted retrieval or model-backed judging. The local prototype is enough to
prove the contract before any paid API or vector store is introduced.
## Promotion Rules
A candidate can be promoted only when:
- the verifier result is `accepted`;
- at least one rejected candidate proves the verifier can say no;
- every source PR or reference artifact has attribution;
- the proposed action is maintainer-owned and reversible;
- validation commands are named;
- unresolved translator, release, billing, or publication items remain blocked
until separately approved.
## Next Expansion
The local evaluator/RAG corpus now covers the current evidence buckets. Future
work should consume these fixtures from ECC Tools before adding hosted
retrieval, vector storage, model-backed judging, or automated check-run
promotion.

View File

@@ -0,0 +1,103 @@
# ECC v2.0.0-rc.1 Publication Evidence — 2026-05-12
This is dry-run release evidence only. It does not create a GitHub release, npm
publication, plugin tag, marketplace submission, or announcement post.
## Source Commit
| Field | Evidence |
| --- | --- |
| Upstream main base | `0598af70a51346bae34d987b9bed143386055967` |
| Evidence branch | `codex/release-publication-evidence` |
| Evidence scope | Working tree with this branch's package hygiene and release-doc updates |
| Git remote | `https://github.com/affaan-m/everything-claude-code.git` |
| Local status caveat | Working tree had the unrelated untracked `docs/drafts/` directory |
The actual release operator should repeat these checks from the final release
commit with a clean checkout before publishing.
## Registry And Release State
| Surface | Command | Result |
| --- | --- | --- |
| GitHub prerelease | `gh release view v2.0.0-rc.1 --repo affaan-m/everything-claude-code --json tagName,url,isPrerelease` | `release not found` |
| npm dist-tags | `npm view ecc-universal dist-tags --json` | `{ "latest": "1.10.0" }` |
| npm package metadata | `node -p "require('./package.json').name + '@' + require('./package.json').version"` | `ecc-universal@2.0.0-rc.1` |
| Product identity | `rg -n "Everything Claude Code" README.md CHANGELOG.md docs/releases/2.0.0-rc.1` | Present in README and rc.1 release docs |
## npm Dry Run
The first pack pass exposed local Python bytecode cache files in the tarball
because broad package `files` entries included untracked local `__pycache__`
paths. This branch adds explicit package-file exclusions and a regression test
so `npm pack` fails if Python bytecode appears in the package surface.
| Command | Result |
| --- | --- |
| `node tests/scripts/npm-publish-surface.test.js` | Passed `2/2`; includes Python bytecode exclusion assertion |
| `npm pack --dry-run --json` | `ecc-universal-2.0.0-rc.1.tgz`; `entryCount: 965`; `size: 1565968`; `unpackedSize: 4934637`; `hasBytecode: false` |
| `npm publish --tag next --dry-run --json` | Dry-run target is npm registry with `tag next`; `entryCount: 965`; `hasBytecode: false` |
Temporary install smoke:
| Command | Result |
| --- | --- |
| `npm pack --pack-destination /tmp/ecc-publication-smoke-dd9ud5 --json` | Created `ecc-universal-2.0.0-rc.1.tgz` for local install smoke |
| `npm install --prefix /tmp/ecc-publication-smoke-dd9ud5 /tmp/ecc-publication-smoke-dd9ud5/ecc-universal-2.0.0-rc.1.tgz` | Added 8 packages |
| `node /tmp/ecc-publication-smoke-dd9ud5/node_modules/ecc-universal/scripts/ecc.js --help` | Printed ECC selective-install CLI help |
| `node /tmp/ecc-publication-smoke-dd9ud5/node_modules/ecc-universal/scripts/catalog.js profiles --json` | Returned the 6 install profiles: `minimal`, `core`, `developer`, `security`, `research`, `full` |
| `find /tmp/ecc-publication-smoke-dd9ud5/node_modules/ecc-universal -path '*__pycache__*' -o -name '*.pyc' -o -name '*.pyo' -o -name '*.pyd'` | No output |
## Plugin And Harness Evidence
| Surface | Command | Result |
| --- | --- | --- |
| Claude plugin manifest | `claude plugin validate .claude-plugin/plugin.json` | Passed |
| Claude plugin tag preflight | `claude plugin tag .claude-plugin --dry-run` | Blocked by unrelated untracked `docs/drafts/` |
| Claude plugin tag forced dry-run | `claude plugin tag .claude-plugin --dry-run --force` | Would create `ecc--v2.0.0-rc.1` at HEAD; do not use `--force` for real release unless maintainer decides |
| Codex marketplace CLI | `codex plugin marketplace --help` and subcommand help | Supports `add`, `upgrade`, and `remove`; `add` supports repo and local marketplace roots |
| OpenCode package | `npm run build:opencode` | Passed |
| Claude hook/plugin route | `node tests/hooks/hooks.test.js` | Passed `236/236` |
| Codex release surface | `node tests/docs/ecc2-release-surface.test.js` | Passed `18/18` |
| Agent/catalog metadata | `node tests/scripts/catalog.test.js` | Passed `7/7` |
| Observability gate | `npm run observability:ready` | Passed `16/16` |
## Clean-Checkout Claude Plugin Smoke
This follow-up pass used a detached clean worktree at
`/tmp/ecc-clean-plugin-evidence` from commit
`bfacf37715b39655cbc2c48f12f2a35c67cb0253`. It used an isolated temp home
(`HOME=/tmp/ecc-clean-plugin-home`) and a temp local project
(`/tmp/ecc-plugin-install-smoke`), so it did not write to the user's real Claude
plugin config.
| Command | Result |
| --- | --- |
| `git -C /tmp/ecc-clean-plugin-evidence status --short --branch` | `## HEAD (no branch)` with no dirty or untracked files |
| `claude plugin validate .claude-plugin/plugin.json` | Passed |
| `claude plugin validate .claude-plugin/marketplace.json` | Passed |
| `claude plugin tag .claude-plugin --dry-run` | Passed without `--force`; would create `ecc--v2.0.0-rc.1` at HEAD and push `refs/tags/ecc--v2.0.0-rc.1` |
| `claude plugin marketplace add /tmp/ecc-clean-plugin-evidence --scope local` with temp `HOME` | Added marketplace `ecc` in local settings |
| `claude plugin list --available --json` with temp `HOME` | Listed `ecc@ecc`, version `2.0.0-rc.1`, source `./` |
| `claude plugin install ecc@ecc --scope local` with temp `HOME` | Installed `ecc@ecc` in local scope |
| `claude plugin list --json` with temp `HOME` | Listed `ecc@ecc`, version `2.0.0-rc.1`, enabled, local scope, install path under `/tmp/ecc-clean-plugin-home/.claude/plugins/cache/ecc/ecc/2.0.0-rc.1` |
| `claude plugin uninstall ecc@ecc --scope local` with temp `HOME` | Uninstalled successfully; final plugin list was `[]` |
## Announcement Placeholder Check
The forbidden-placeholder scan only returned the publication-readiness checklist
lines that name those forbidden placeholders. No launch-pack placeholder
instances were found.
## Remaining Blockers
- Create or verify GitHub prerelease `v2.0.0-rc.1`.
- Publish `ecc-universal@2.0.0-rc.1` with npm dist-tag `next`.
- Create and push the Claude plugin tag only after explicit approval. The clean
checkout dry run and temp install smoke now pass.
- Confirm the live Claude/Codex/OpenCode marketplace submission path or record
the manual submission owner and status.
- Verify ECC Tools billing/App/Marketplace claims before using them in launch
copy.
- Refresh announcement copy with live URLs after release and package/plugin
URLs exist.

View File

@@ -6,33 +6,35 @@ URLs from the exact commit being released.
For the current rc.1 naming decision and package/plugin publication path, see
[`naming-and-publication-matrix.md`](naming-and-publication-matrix.md).
For the May 12 dry-run evidence pass, see
[`publication-evidence-2026-05-12.md`](publication-evidence-2026-05-12.md).
## Release Identity Matrix
| Surface | Expected value | Source of truth | Fresh check | Evidence artifact | Owner | Status |
| --- | --- | --- | --- | --- | --- | --- |
| Product name | Everything Claude Code / ECC | `README.md`, `CHANGELOG.md`, release notes | `rg -n "Everything Claude Code" README.md CHANGELOG.md docs/releases/2.0.0-rc.1` | Pending | Release owner | Pending |
| GitHub repo | `affaan-m/everything-claude-code` | Git remote and release URLs | `git remote get-url origin` | Pending | Release owner | Pending |
| Git tag | `v2.0.0-rc.1` | GitHub releases | `gh release view v2.0.0-rc.1 --repo affaan-m/everything-claude-code` | Pending | Release owner | Pending |
| npm package | `ecc-universal` | `package.json` | `node -p "require('./package.json').name"` | Pending | Package owner | Pending |
| npm version | `2.0.0-rc.1` | `VERSION`, `package.json`, lockfiles | `node -p "require('./package.json').version"` | Pending | Package owner | Pending |
| npm dist-tag | `next` for rc, `latest` only for GA | npm registry | `npm view ecc-universal dist-tags --json` | Pending | Package owner | Pending |
| Claude plugin slug | `ecc` / `ecc@ecc` install path | `.claude-plugin/plugin.json`, `.claude-plugin/marketplace.json` | `node tests/hooks/hooks.test.js` | Pending | Plugin owner | Pending |
| Claude plugin manifest | `2.0.0-rc.1`, no unsupported `agents` or explicit `hooks` fields | `.claude-plugin/plugin.json`, `.claude-plugin/PLUGIN_SCHEMA_NOTES.md` | `claude plugin validate .claude-plugin/plugin.json` | Pending | Plugin owner | Pending |
| Codex plugin manifest | `2.0.0-rc.1` with shared skill source | `.codex-plugin/plugin.json` | `node tests/docs/ecc2-release-surface.test.js` | Pending | Plugin owner | Pending |
| OpenCode package | `ecc-universal` plugin module | `.opencode/package.json`, `.opencode/index.ts` | `npm run build:opencode` | Pending | Package owner | Pending |
| Agent metadata | `2.0.0-rc.1` | `agent.yaml`, `.agents/plugins/marketplace.json` | `node tests/scripts/catalog.test.js` | Pending | Release owner | Pending |
| Migration copy | rc.1 upgrade path, not GA claim | `release-notes.md`, `quickstart.md`, `HERMES-SETUP.md` | `npx markdownlint-cli docs/releases/2.0.0-rc.1/*.md` | Pending | Docs owner | Pending |
| Product name | Everything Claude Code / ECC | `README.md`, `CHANGELOG.md`, release notes | `rg -n "Everything Claude Code" README.md CHANGELOG.md docs/releases/2.0.0-rc.1` | `publication-evidence-2026-05-12.md` | Release owner | Evidence recorded |
| GitHub repo | `affaan-m/everything-claude-code` | Git remote and release URLs | `git remote get-url origin` | `publication-evidence-2026-05-12.md` | Release owner | Evidence recorded |
| Git tag | `v2.0.0-rc.1` | GitHub releases | `gh release view v2.0.0-rc.1 --repo affaan-m/everything-claude-code` | `release not found` | Release owner | Blocked until release approval |
| npm package | `ecc-universal` | `package.json` | `node -p "require('./package.json').name"` | `publication-evidence-2026-05-12.md` | Package owner | Evidence recorded |
| npm version | `2.0.0-rc.1` | `VERSION`, `package.json`, lockfiles | `node -p "require('./package.json').version"` | `publication-evidence-2026-05-12.md` | Package owner | Evidence recorded |
| npm dist-tag | `next` for rc, `latest` only for GA | npm registry | `npm view ecc-universal dist-tags --json` | Current registry only has `latest: 1.10.0`; `next` is pending publish | Package owner | Blocked until publish approval |
| Claude plugin slug | `ecc` / `ecc@ecc` install path | `.claude-plugin/plugin.json`, `.claude-plugin/marketplace.json` | `node tests/hooks/hooks.test.js` | `publication-evidence-2026-05-12.md` | Plugin owner | Evidence recorded |
| Claude plugin manifest | `2.0.0-rc.1`, no unsupported `agents` or explicit `hooks` fields | `.claude-plugin/plugin.json`, `.claude-plugin/PLUGIN_SCHEMA_NOTES.md` | `claude plugin validate .claude-plugin/plugin.json` | `publication-evidence-2026-05-12.md` | Plugin owner | Evidence recorded |
| Codex plugin manifest | `2.0.0-rc.1` with shared skill source | `.codex-plugin/plugin.json` | `node tests/docs/ecc2-release-surface.test.js` | `publication-evidence-2026-05-12.md` | Plugin owner | Evidence recorded |
| OpenCode package | `ecc-universal` plugin module | `.opencode/package.json`, `.opencode/index.ts` | `npm run build:opencode` | `publication-evidence-2026-05-12.md` | Package owner | Evidence recorded |
| Agent metadata | `2.0.0-rc.1` | `agent.yaml`, `.agents/plugins/marketplace.json` | `node tests/scripts/catalog.test.js` | `publication-evidence-2026-05-12.md` | Release owner | Evidence recorded |
| Migration copy | rc.1 upgrade path, not GA claim | `release-notes.md`, `quickstart.md`, `HERMES-SETUP.md` | `npx markdownlint-cli docs/releases/2.0.0-rc.1/*.md` | Pending final lint on release commit | Docs owner | Pending |
## Publication Gates
| Gate | Required evidence | Fresh check | Blocker field | Owner | Status |
| --- | --- | --- | --- | --- | --- |
| GitHub release | Tag exists, release notes use final URLs, assets attached if needed | `gh release view v2.0.0-rc.1 --json tagName,url,isPrerelease` | `Blocker:` | Release owner | Pending |
| npm package | `npm pack --dry-run` has expected files, version matches, rc goes to `next` | `npm pack --dry-run` and `npm publish --tag next --dry-run` where supported | `Blocker:` | Package owner | Pending |
| Claude plugin | Manifest validates, marketplace JSON points to public repo, install docs match slug | `claude plugin validate .claude-plugin/plugin.json` | `Blocker:` | Plugin owner | Pending |
| Codex plugin | Manifest version matches package and docs, hook limitations are explicit | `node tests/docs/ecc2-release-surface.test.js` | `Blocker:` | Plugin owner | Pending |
| OpenCode package | Build output is regenerated from source and package metadata is current | `npm run build:opencode` | `Blocker:` | Package owner | Pending |
| GitHub release | Tag exists, release notes use final URLs, assets attached if needed | `gh release view v2.0.0-rc.1 --json tagName,url,isPrerelease` | `Blocker: release not found on 2026-05-12` | Release owner | Pending approval |
| npm package | `npm pack --dry-run` has expected files, version matches, rc goes to `next` | `npm pack --dry-run` and `npm publish --tag next --dry-run` where supported | `Blocker: actual publish requires approval; dry run passed with next tag` | Package owner | Dry-run passed |
| Claude plugin | Manifest validates, marketplace JSON points to public repo, install docs match slug | `claude plugin validate .claude-plugin/plugin.json`; `claude plugin tag .claude-plugin --dry-run`; isolated temp-home install smoke | `Blocker: real tag creation/push requires approval` | Plugin owner | Clean-checkout dry-run and install smoke recorded |
| Codex plugin | Manifest version matches package and docs, hook limitations are explicit | `node tests/docs/ecc2-release-surface.test.js` | `Blocker: marketplace submission path still manual/owner-gated` | Plugin owner | Evidence recorded |
| OpenCode package | Build output is regenerated from source and package metadata is current | `npm run build:opencode` | `Blocker: none for local build; public distribution still follows npm/plugin release` | Package owner | Evidence recorded |
| ECC Tools billing reference | Any billing claim links to verified Marketplace/App state | `gh api repos/ECC-Tools/ECC-Tools` plus app/marketplace URL check | `Blocker:` | ECC Tools owner | Pending |
| Announcement copy | X, LinkedIn, GitHub release, and longform copy point to live URLs | `rg -n "TODO" docs/releases/2.0.0-rc.1` and repeat for `TBD` | `Blocker:` | Release owner | Pending |
@@ -48,7 +50,7 @@ Record the exact commit SHA and command output before any publication action:
| Observability readiness | `npm run observability:ready` | 16/16 passing | Pending |
| Root suite | `node tests/run-all.js` | 0 failures | Pending |
| Markdown lint | `npx markdownlint-cli '**/*.md' --ignore node_modules` | 0 failures | Pending |
| Package surface | `node tests/scripts/npm-publish-surface.test.js` | 0 failures | Pending |
| Package surface | `node tests/scripts/npm-publish-surface.test.js` | 0 failures; no Python bytecode in npm tarball | `2/2` passed in May 12 evidence pass |
| Release surface | `node tests/docs/ecc2-release-surface.test.js` | 0 failures | Pending |
| Optional Rust surface | `cd ecc2 && cargo test` | 0 failures or explicit deferral | Pending |
@@ -56,8 +58,8 @@ Record the exact commit SHA and command output before any publication action:
- `main` has unreviewed release-surface changes after the evidence was recorded.
- `npm view ecc-universal dist-tags --json` contradicts the intended rc/GA tag.
- Claude plugin validation is unavailable and no manual install smoke test is
recorded.
- Claude plugin validation is unavailable or no clean-checkout install smoke
test is recorded for the intended release commit.
- Release notes or announcement drafts still contain placeholder URLs,
`TODO`, `TBD`, private workspace paths, or personal operator references.
- Billing, Marketplace, or plugin-submission copy claims a live surface before

View File

@@ -0,0 +1,49 @@
# AgentShield Policy Exception Playbook
Candidate id: `sarif-backed-timeboxed-exception-review`
Use this playbook when AgentShield organization-policy output produces a
finding that may need remediation, a time-boxed exception, or explicit
enforcement.
## Accepted Path
1. Identify the AgentShield finding id, category, severity, affected file or
MCP/hook surface, and policy pack or organization baseline.
2. Retrieve scanner evidence before judgment:
- SARIF/code-scanning result, especially `agentshield-policy/*`
- JSON/HTML report evidence
- terminal or GitHub Action job-summary counts
3. Record lifecycle fields for any exception request: owner, ticket, scope,
expiry, rationale, and whether it is active, expiring soon, or expired.
4. Keep expired exceptions rejected or enforced until new evidence exists.
5. Decide whether immediate remediation is possible. If not, only promote a
narrow time-boxed exception tied to the named owner, ticket, scope, and
expiry.
6. Keep AgentShield code, policy packs, enforcement settings, release state,
and live security posture out of the read-only evaluator run.
## Rejected Path
Do not blanket suppress a policy category, policy pack, or organization gate
because a finding is inconvenient.
Do not downgrade critical/high findings without SARIF or report evidence and a
current owner, ticket, scope, and expiry.
Do not treat expired exceptions as active. Expired means the policy gate should
remain enforced until a maintainer creates a fresh, bounded exception or fixes
the underlying issue.
## Minimum Validation
- `npx ecc-agentshield scan --format json`
- AgentShield SARIF/code-scanning artifact or report evidence
- `npx ecc-agentshield scan --format html` when executive review evidence is
needed
- Current exception lifecycle fields: owner, ticket, scope, expiry, status
- `node tests/docs/evaluator-rag-prototype.test.js`
- `git diff --check`
Record the scanner evidence, lifecycle state, policy-pack source, and
remediation-versus-exception decision in the maintainer PR body or handoff.

View File

@@ -0,0 +1,35 @@
{
"schema_version": "ecc.evaluator-rag.report.v1",
"scenario_id": "agentshield-policy-exception",
"run_id": "2026-05-12-agentshield-policy-exception-prototype",
"result": "prototype_passed",
"read_only": true,
"scores": {
"sarif_report_evidence": 0.95,
"exception_lifecycle": 0.93,
"ownership_specificity": 0.9,
"remediation_decision": 0.88,
"blanket_suppression_safety": 1
},
"findings": [
{
"id": "sarif-report-match-required",
"severity": "warning",
"summary": "AgentShield policy exceptions must name SARIF or report evidence before a remediation or exception playbook can be promoted."
},
{
"id": "expired-exception-enforcement",
"severity": "warning",
"summary": "Expired exceptions must remain rejected or enforced; the evaluator cannot treat stale approvals as active evidence."
},
{
"id": "bounded-owner-fields",
"severity": "info",
"summary": "Accepted exceptions preserve owner, ticket, scope, expiry, policy-pack source, and affected surface fields."
}
],
"recommended_next_action": {
"candidate_id": "sarif-backed-timeboxed-exception-review",
"action": "Use the promoted playbook for future AgentShield policy exception requests before changing gates, suppressing categories, or accepting security risk."
}
}

View File

@@ -0,0 +1,62 @@
{
"schema_version": "ecc.evaluator-rag.scenario.v1",
"scenario_id": "agentshield-policy-exception",
"title": "Gate AgentShield policy exceptions with report and SARIF evidence",
"mode": "read_only_prototype",
"objective": "Given an AgentShield organization-policy finding or proposed exception, retrieve report, SARIF, lifecycle, and ownership evidence before promoting a remediation or time-boxed exception playbook.",
"sources": [
{
"kind": "repo_doc",
"path": "docs/ECC-2.0-GA-ROADMAP.md",
"purpose": "Durable record of AgentShield policy gates, SARIF output, policy packs, reports, corpus benchmark, and exception lifecycle audit evidence"
},
{
"kind": "repo_command",
"path": "commands/security-scan.md",
"purpose": "ECC command contract for running AgentShield and separating scanner facts from follow-up judgment"
},
{
"kind": "repo_skill",
"path": "skills/security-scan/SKILL.md",
"purpose": "Operator-facing AgentShield scan workflow and output-format guidance"
},
{
"kind": "external_pr_evidence",
"repo": "affaan-m/agentshield",
"prs": [
55,
56,
57,
59,
60,
62
],
"purpose": "Policy gate, SARIF, policy-pack, HTML report, corpus benchmark, and exception lifecycle implementation evidence"
}
],
"retrieval_questions": [
"Which AgentShield policy finding, category, severity, and affected file or MCP/hook surface triggered the request?",
"Is there SARIF/code-scanning evidence for an `agentshield-policy/*` result, and does it match the report finding?",
"Is the exception active, expiring soon, or expired?",
"Does the exception include owner, ticket, scope, expiry, and rationale fields?",
"Which policy pack or organization baseline produced the finding?",
"Is remediation possible now, or is a bounded exception safer than a blanket suppression?"
],
"forbidden_actions": [
"approving policy exceptions without SARIF or report evidence",
"treating expired exceptions as active",
"blanket-suppressing AgentShield policy packs or organization-policy gates",
"downgrading critical/high findings without owner, ticket, scope, and expiry",
"editing AgentShield code or policy files from this ECC evaluator run",
"publishing or enforcing new security policy from this read-only evaluator run"
],
"acceptance_gates": [
"SARIF or report evidence is named",
"finding id, category, severity, and affected surface are preserved",
"policy pack or organization baseline is named",
"owner, ticket, scope, and expiry state are recorded",
"expired exceptions stay rejected or enforced",
"remediation versus time-boxed exception decision is explicit",
"at least one blanket suppression candidate is rejected"
]
}

View File

@@ -0,0 +1,45 @@
{
"schema_version": "ecc.evaluator-rag.trace.v1",
"scenario_id": "agentshield-policy-exception",
"run_id": "2026-05-12-agentshield-policy-exception-prototype",
"read_only": true,
"events": [
{
"phase": "observation",
"summary": "A policy finding or exception request references AgentShield organization-policy output. The evaluator records the affected finding without editing AgentShield code, policy packs, or enforcement settings.",
"evidence": [
"docs/ECC-2.0-GA-ROADMAP.md",
"commands/security-scan.md"
]
},
{
"phase": "retrieval",
"summary": "Retrieved SARIF/report evidence, policy-pack source, exception lifecycle state, owner, ticket, scope, expiry, and whether remediation is immediately available.",
"evidence": [
"agentshield-policy/* SARIF result",
"AgentShield report exception counts",
"skills/security-scan/SKILL.md"
]
},
{
"phase": "proposal",
"summary": "Generated two candidate playbooks: SARIF-backed time-boxed exception review, and blanket policy suppression for the affected category.",
"candidate_ids": [
"sarif-backed-timeboxed-exception-review",
"blanket-policy-suppression"
]
},
{
"phase": "verification",
"summary": "Accepted the evidence-backed exception review because it preserves finding details and lifecycle fields. Rejected blanket suppression because it bypasses policy gates and ignores expired exceptions.",
"evidence": [
"examples/evaluator-rag-prototype/agentshield-policy-exception/verifier-result.json"
]
},
{
"phase": "promotion",
"summary": "Promoted only the read-only AgentShield policy exception playbook. The evaluator does not modify AgentShield code, policy packs, enforcement settings, release state, or live security posture.",
"promoted_candidate_id": "sarif-backed-timeboxed-exception-review"
}
]
}

View File

@@ -0,0 +1,35 @@
{
"schema_version": "ecc.evaluator-rag.verifier.v1",
"scenario_id": "agentshield-policy-exception",
"run_id": "2026-05-12-agentshield-policy-exception-prototype",
"read_only": true,
"candidates": [
{
"candidate_id": "sarif-backed-timeboxed-exception-review",
"decision": "accepted",
"score": 0.93,
"reasons": [
"names SARIF/code-scanning or report evidence for the AgentShield finding",
"preserves finding id, category, severity, affected surface, and policy-pack source",
"records owner, ticket, scope, expiry, and active/expiring/expired lifecycle state",
"rejects expired exceptions and requires remediation or a time-boxed exception",
"keeps AgentShield code, policy packs, enforcement settings, and release actions out of the read-only evaluator run"
],
"rollback": "Do not apply the future exception or suppression; re-run AgentShield, restore the prior organization policy, and keep the finding enforced until owner/ticket/scope/expiry evidence is current."
},
{
"candidate_id": "blanket-policy-suppression",
"decision": "rejected",
"score": 0.11,
"reasons": [
"has no SARIF or report evidence",
"blanket-suppresses AgentShield policy packs and organization-policy gates",
"treats expired exceptions as active",
"drops owner, ticket, scope, and expiry fields",
"would edit AgentShield or policy gate behavior from an ECC evaluator run"
],
"rollback": "Do not suppress the policy category; restart from scanner evidence, lifecycle state, and a bounded remediation or exception request."
}
],
"promoted_candidate_id": "sarif-backed-timeboxed-exception-review"
}

View File

@@ -0,0 +1,41 @@
# Billing Marketplace Readiness Playbook
Use this playbook when release copy or roadmap text mentions ECC Tools
billing, Marketplace availability, account recovery, plans, seats,
entitlements, or subscription state.
## Accepted Path
1. Start from `docs/releases/2.0.0-rc.1/publication-readiness.md`.
2. Check the current repo and public listing surfaces:
- `gh api repos/ECC-Tools/ECC-Tools`
- `https://github.com/marketplace/ecc-tools`
3. Classify every billing or Marketplace claim as:
- `verified`
- `blocked`
- `remove-before-publication`
4. Keep roadmap acceptance criteria separate from live product claims.
5. Update release copy only after the evidence points to a live URL or command
result.
6. Leave tag creation, npm publish, plugin submission, marketplace edits,
subscription changes, and announcement posting approval-gated.
## Rejected Path
Do not say billing is live because a roadmap item exists, a dry run passed, or a
Marketplace URL is known. Roadmap intent and dry-run publication evidence are
not a billing state.
Do not edit plan limits, subscriptions, seats, entitlements, or Marketplace
metadata from the evaluator run. Those are product/operator actions and require
their own approval path.
## Validation Gates
- `rg -n "billing|Billing|Marketplace|marketplace|subscription|seat|entitlement|plan" README.md docs/releases/2.0.0-rc.1 docs/ECC-2.0-GA-ROADMAP.md`
- `gh api repos/ECC-Tools/ECC-Tools`
- Manual live check of `https://github.com/marketplace/ecc-tools`
- `npx --yes markdownlint-cli docs/releases/2.0.0-rc.1/*.md docs/ECC-2.0-GA-ROADMAP.md`
- `git diff --check`
Record the evidence in a maintainer-owned PR before release copy is published.

View File

@@ -0,0 +1,35 @@
{
"schema_version": "ecc.evaluator-rag.report.v1",
"scenario_id": "billing-marketplace-readiness",
"run_id": "2026-05-12-billing-marketplace-readiness-prototype",
"result": "prototype_passed",
"read_only": true,
"scores": {
"claim_evidence": 0.82,
"publication_safety": 1,
"marketplace_specificity": 0.84,
"billing_scope_control": 1,
"announcement_safety": 1
},
"findings": [
{
"id": "billing-claim-gate-needed",
"severity": "warning",
"summary": "Release docs require a fresh ECC Tools billing/App/Marketplace check before launch copy can claim live billing readiness."
},
{
"id": "dry-run-not-live-state",
"severity": "warning",
"summary": "May 12 evidence proves package/plugin dry runs and clean install smoke, but it does not prove a live Marketplace billing state."
},
{
"id": "safe-next-action",
"severity": "info",
"summary": "The reusable next action is a read-only evidence checklist that classifies each launch-copy billing claim before publication."
}
],
"recommended_next_action": {
"candidate_id": "evidence-backed-billing-check",
"action": "Run the promoted billing/Marketplace claim-verification checklist before any launch copy, GitHub release text, or social copy says billing is live."
}
}

View File

@@ -0,0 +1,55 @@
{
"schema_version": "ecc.evaluator-rag.scenario.v1",
"scenario_id": "billing-marketplace-readiness",
"title": "Verify billing and Marketplace claims before launch copy",
"mode": "read_only_prototype",
"objective": "Given rc.1 release docs and ECC Tools billing roadmap evidence, separate verified Marketplace/App/billing state from assumptions before any announcement or publication action.",
"sources": [
{
"kind": "repo_doc",
"path": "docs/releases/2.0.0-rc.1/publication-readiness.md",
"purpose": "Release gate that blocks billing and Marketplace claims until fresh evidence exists"
},
{
"kind": "repo_doc",
"path": "docs/releases/2.0.0-rc.1/publication-evidence-2026-05-12.md",
"purpose": "Dry-run publication evidence and explicit remaining blocker list"
},
{
"kind": "roadmap",
"path": "docs/ECC-2.0-GA-ROADMAP.md",
"purpose": "ECC Tools billing audit acceptance criteria and remaining release blockers"
},
{
"kind": "github_api",
"command": "gh api repos/ECC-Tools/ECC-Tools",
"purpose": "Fresh repository access and app-surface evidence before launch claims"
},
{
"kind": "public_url",
"url": "https://github.com/marketplace/ecc-tools",
"purpose": "Marketplace listing that must be checked live before copy says billing is ready"
}
],
"retrieval_questions": [
"Which billing or Marketplace claims are already backed by repo evidence?",
"Which claims still need a live Marketplace, App, subscription, plan, or entitlement check?",
"Which announcement docs mention billing or Marketplace status?",
"Which publication actions remain approval-gated and must not run during this evaluator pass?"
],
"forbidden_actions": [
"creating or editing GitHub Marketplace listings",
"changing plan limits, subscriptions, seats, or entitlements",
"creating release tags",
"publishing packages or plugins",
"posting announcement copy",
"claiming live billing readiness from dry-run evidence alone"
],
"acceptance_gates": [
"launch-copy claims are classified as verified, blocked, or remove-before-publication",
"Marketplace and App checks name the exact URL or command needed",
"billing claims link to fresh evidence rather than roadmap intent",
"publication actions remain approval-gated",
"at least one overclaim candidate is rejected"
]
}

View File

@@ -0,0 +1,45 @@
{
"schema_version": "ecc.evaluator-rag.trace.v1",
"scenario_id": "billing-marketplace-readiness",
"run_id": "2026-05-12-billing-marketplace-readiness-prototype",
"read_only": true,
"events": [
{
"phase": "observation",
"summary": "Publication readiness still marks ECC Tools billing references and announcement copy as pending. Dry-run publication evidence says billing/App/Marketplace claims must be verified before launch copy uses them.",
"evidence": [
"docs/releases/2.0.0-rc.1/publication-readiness.md",
"docs/releases/2.0.0-rc.1/publication-evidence-2026-05-12.md"
]
},
{
"phase": "retrieval",
"summary": "Retrieved the release gate, dry-run evidence, roadmap billing acceptance criteria, and the public Marketplace URL that requires a live operator check.",
"evidence": [
"docs/ECC-2.0-GA-ROADMAP.md",
"gh api repos/ECC-Tools/ECC-Tools",
"https://github.com/marketplace/ecc-tools"
]
},
{
"phase": "proposal",
"summary": "Generated two candidate playbooks: evidence-backed billing claim verification, and announcement-first billing copy that treats roadmap intent as live billing readiness.",
"candidate_ids": [
"evidence-backed-billing-check",
"announcement-first-billing-copy"
]
},
{
"phase": "verification",
"summary": "Accepted the evidence-backed check and rejected announcement-first copy because billing and Marketplace surfaces remain pending until verified by fresh URLs or API output.",
"evidence": [
"examples/evaluator-rag-prototype/billing-marketplace-readiness/verifier-result.json"
]
},
{
"phase": "promotion",
"summary": "Promoted only the read-only verification playbook. No Marketplace edits, subscription changes, tags, package publishes, plugin submission, or announcement posts are performed.",
"promoted_candidate_id": "evidence-backed-billing-check"
}
]
}

View File

@@ -0,0 +1,35 @@
{
"schema_version": "ecc.evaluator-rag.verifier.v1",
"scenario_id": "billing-marketplace-readiness",
"run_id": "2026-05-12-billing-marketplace-readiness-prototype",
"read_only": true,
"candidates": [
{
"candidate_id": "evidence-backed-billing-check",
"decision": "accepted",
"score": 0.91,
"reasons": [
"keeps the run read-only",
"requires fresh Marketplace or GitHub API evidence",
"classifies launch-copy claims before publication",
"separates roadmap intent from live billing state",
"keeps release, package, plugin, billing, and announcement actions approval-gated"
],
"rollback": "Remove or revert any release-copy edits that cite unverified billing claims; no live billing state is changed by this playbook."
},
{
"candidate_id": "announcement-first-billing-copy",
"decision": "rejected",
"score": 0.18,
"reasons": [
"treats roadmap acceptance criteria as live billing evidence",
"does not require a fresh Marketplace listing check",
"could publish announcement copy before release URLs exist",
"does not classify unsupported claims for removal",
"risks implying subscription or entitlement readiness without proof"
],
"rollback": "Do not publish this copy; keep billing and Marketplace language blocked until the evidence checklist passes."
}
],
"promoted_candidate_id": "evidence-backed-billing-check"
}

View File

@@ -0,0 +1,41 @@
# Candidate Playbook: Maintainer-Owned Stale Salvage
Candidate id: `maintainer-salvage-branch`
## Use When
- A stale or conflicted PR was closed to keep the public queue usable.
- The closed diff contains a useful focused idea, skill, command, doc, test, or
bug fix.
- The contributor may not have time or interest to rebase.
## Steps
1. Record the source PR, author, useful concept, and closure reason in
`docs/stale-pr-salvage-ledger.md`.
2. Re-read the closed PR diff against current `main`.
3. Decide whether the patch can be cherry-picked safely. Prefer reimplementation
when current architecture has moved.
4. Create a maintainer-owned branch with one focused salvage unit.
5. Preserve attribution in the PR body and, when useful, in the commit body.
6. Update the catalog, docs, tests, or release evidence required by the touched
surface.
7. Run the same validation gates a normal change would require.
8. After merge, update the ledger from pending/salvage-branch to landed,
already-present, superseded, skipped, or translator/manual review.
## Reject Conditions
- The patch is bulk generated churn.
- The patch is stale localization that needs translator/manual review.
- The patch imports personal paths, secrets, local settings, or private operator context.
- The patch bypasses current install, catalog, plugin, or release architecture.
- The branch would mix unrelated salvage units into one PR.
## Minimum Validation
- Targeted test for the touched surface.
- `git diff --check`.
- Markdown lint when docs are touched.
- Catalog/install validation when skills, agents, commands, or plugin surfaces
are touched.

View File

@@ -0,0 +1,46 @@
# CI Failure Diagnosis Playbook
Candidate id: `log-backed-minimal-fix`
Use this playbook when a PR, maintainer branch, or release-readiness branch has
one or more red GitHub Actions checks.
## Accepted Path
1. Capture PR and branch context:
- `gh pr view <pr-number> --json files,statusCheckRollup,headRefName,baseRefName`
- `gh run view <run-id> --json jobs`
2. Fetch the failed log evidence:
- `gh run view <run-id> --log-failed`
3. Record the failing job, step, OS, Node/Python/Rust version, package manager,
and shortest useful error excerpt.
4. Compare the failing step to the PR changed files.
5. Search current docs, tests, and prior PRs for a known matching failure mode.
6. Promote the smallest fix path only when it includes a local reproduction or
regression command.
7. After a separate implementation branch exists, rerun the focused local gate,
then wait for the full GitHub Actions matrix before merge.
## Rejected Path
Do not keep rerunning CI until a transient green result appears without
recording the original failure and why it is safe to ignore.
Do not weaken tests, skip matrix legs, or broaden the patch to unrelated files
just to make the check pass.
Do not claim release readiness from a branch with required checks still red.
## Minimum Validation
- `gh run view <run-id> --log-failed`
- Focused local command matching the failing surface, such as:
- `node tests/<matching-test>.js`
- `npm run harness:audit -- --format json`
- `npm run observability:ready`
- `cargo test`
- `git diff --check`
- Full required GitHub Actions matrix before merge
Record the failed-log excerpt and the chosen regression command in the
maintainer PR body or handoff before merging the fix.

View File

@@ -0,0 +1,35 @@
{
"schema_version": "ecc.evaluator-rag.report.v1",
"scenario_id": "ci-failure-diagnosis",
"run_id": "2026-05-12-ci-failure-diagnosis-prototype",
"result": "prototype_passed",
"read_only": true,
"scores": {
"failure_evidence": 0.92,
"scope_control": 0.9,
"regression_specificity": 0.86,
"matrix_safety": 1,
"publication_safety": 1
},
"findings": [
{
"id": "log-first-required",
"severity": "warning",
"summary": "A CI fix candidate must start from the exact failed job, step, platform, runtime, package manager, and log excerpt rather than from a generic rerun."
},
{
"id": "changed-file-scope-needed",
"severity": "info",
"summary": "Changed-file context should narrow the fix to the surface that can affect the failing step, especially in a broad OS/runtime matrix."
},
{
"id": "regression-gate-needed",
"severity": "warning",
"summary": "A promoted fix playbook must name a local reproduction or regression command before the branch is allowed to merge."
}
],
"recommended_next_action": {
"candidate_id": "log-backed-minimal-fix",
"action": "Use the promoted CI failure diagnosis playbook whenever a PR check goes red before implementing or rerunning fixes."
}
}

View File

@@ -0,0 +1,57 @@
{
"schema_version": "ecc.evaluator-rag.scenario.v1",
"scenario_id": "ci-failure-diagnosis",
"title": "Diagnose CI failures from captured logs before proposing fixes",
"mode": "read_only_prototype",
"objective": "Given a failed CI run on a PR or maintainer branch, retrieve the exact failing job, captured log excerpt, changed-file context, and prior known-fix evidence before promoting a fix playbook.",
"sources": [
{
"kind": "repo_doc",
"path": "docs/ECC-2.0-GA-ROADMAP.md",
"purpose": "Records ECC-Tools CI failure-mode predictive follow-ups and the evaluator/RAG corpus expansion need"
},
{
"kind": "repo_doc",
"path": "docs/architecture/evaluator-rag-prototype.md",
"purpose": "Defines the artifact contract and promotion rules for evaluator/RAG scenarios"
},
{
"kind": "github_actions",
"command": "gh run view <run-id> --log-failed",
"purpose": "Primary evidence for the failing job, failing step, and deterministic error text"
},
{
"kind": "github_pr",
"command": "gh pr view <pr-number> --json files,statusCheckRollup,headRefName,baseRefName",
"purpose": "Changed-file and check-rollup context for scoping the fix"
},
{
"kind": "repo_test",
"command": "node tests/run-all.js",
"purpose": "Local regression gate after a candidate fix is implemented outside the read-only evaluator run"
}
],
"retrieval_questions": [
"Which job, step, platform, runtime, and package manager failed?",
"What is the smallest failing log excerpt that explains the failure?",
"Which changed files are plausibly connected to the failing step?",
"Is there a prior known-fix, troubleshooting note, or fixture that matches this failure mode?",
"Which local command reproduces or guards the failure before a fix can merge?"
],
"forbidden_actions": [
"rerunning CI until it passes without diagnosing the failure",
"pushing speculative fixes without a captured failing log excerpt",
"editing unrelated files to make the matrix green",
"weakening or deleting tests to silence a failure",
"merging or publishing while required checks are red",
"creating release tags or posting announcements from this evaluator run"
],
"acceptance_gates": [
"failing job and step are named",
"captured log excerpt is linked or summarized",
"changed-file context is compared to the failing step",
"known-fix or no-known-fix status is recorded",
"local reproduction or regression command is named",
"at least one rerun-only candidate is rejected"
]
}

View File

@@ -0,0 +1,45 @@
{
"schema_version": "ecc.evaluator-rag.trace.v1",
"scenario_id": "ci-failure-diagnosis",
"run_id": "2026-05-12-ci-failure-diagnosis-prototype",
"read_only": true,
"events": [
{
"phase": "observation",
"summary": "A PR or maintainer branch has a red GitHub Actions matrix. The evaluator records status without rerunning, merging, or editing code.",
"evidence": [
"gh pr view <pr-number> --json statusCheckRollup,files",
"gh run view <run-id> --json jobs"
]
},
{
"phase": "retrieval",
"summary": "Retrieved failed-job logs, changed-file context, current roadmap CI failure-mode requirements, and existing local regression commands.",
"evidence": [
"gh run view <run-id> --log-failed",
"docs/ECC-2.0-GA-ROADMAP.md",
"tests/run-all.js"
]
},
{
"phase": "proposal",
"summary": "Generated two candidate playbooks: log-backed minimal fix with regression coverage, and rerun-only optimism that treats CI flake as proven without evidence.",
"candidate_ids": [
"log-backed-minimal-fix",
"rerun-only-green-wait"
]
},
{
"phase": "verification",
"summary": "Accepted the log-backed minimal fix because it names failing evidence, scope, and validation. Rejected rerun-only waiting because it does not explain the failure or preserve a regression guard.",
"evidence": [
"examples/evaluator-rag-prototype/ci-failure-diagnosis/verifier-result.json"
]
},
{
"phase": "promotion",
"summary": "Promoted only the read-only CI triage playbook. The evaluator does not push a fix, rerun CI, merge, publish, or weaken checks.",
"promoted_candidate_id": "log-backed-minimal-fix"
}
]
}

View File

@@ -0,0 +1,35 @@
{
"schema_version": "ecc.evaluator-rag.verifier.v1",
"scenario_id": "ci-failure-diagnosis",
"run_id": "2026-05-12-ci-failure-diagnosis-prototype",
"read_only": true,
"candidates": [
{
"candidate_id": "log-backed-minimal-fix",
"decision": "accepted",
"score": 0.93,
"reasons": [
"requires failed job, step, platform, runtime, and log evidence",
"compares changed files to the failing surface before proposing a fix",
"names a focused local reproduction or regression command",
"keeps required checks intact",
"keeps merge, release, package, plugin, billing, and announcement actions approval-gated"
],
"rollback": "Revert the future implementation PR or restore the original failing test fixture; no code is changed by this read-only playbook."
},
{
"candidate_id": "rerun-only-green-wait",
"decision": "rejected",
"score": 0.17,
"reasons": [
"does not preserve the failing log excerpt",
"does not identify job, step, platform, runtime, or package manager",
"does not compare failure surface to changed files",
"does not add or name a regression gate",
"risks merging a flaky or still-unexplained CI failure"
],
"rollback": "Do not treat this as a fix; restart diagnosis from captured failed logs and changed-file context."
}
],
"promoted_candidate_id": "log-backed-minimal-fix"
}

View File

@@ -0,0 +1,60 @@
# Deep Analyzer Evidence Playbook
Candidate id: `corpus-backed-analyzer-change`
Use this playbook when a PR changes repository analysis, commit analysis,
architecture classification, workflow detection, pattern detection, or
deep-analysis risk-taxonomy behavior.
## Accepted Path
1. Name the changed analyzer surface and source file.
2. Retrieve the Deep Analyzer Evidence contract from `../ECC-Tools/README.md`
and the follow-up logic in `../ECC-Tools/src/lib/analyzer.ts`.
3. Match the change to maintained corpus or reference evidence:
- `../ECC-Tools/src/analyzers/fixtures/deep-analyzer-corpus.ts`
- `../ECC-Tools/src/analyzers/deep-analyzer-corpus.test.ts`
- `../ECC-Tools/src/lib/analyzer.compare.test.ts`
4. Compare expected outputs for the affected behavior:
- folder type;
- module organization;
- test location;
- primary language;
- commit message type;
- detected workflow names.
5. Add or update analyzer corpus, expected-output snapshots, fixtures,
benchmarks, golden cases, evals, or reference sets for the same changed
surface.
6. Run the relevant validation gate from `../ECC-Tools/`:
- `npm test -- src/analyzers/deep-analyzer-corpus.test.ts src/lib/analyzer.compare.test.ts`
- `npm run typecheck`
- `npm run lint`
7. Record the corpus case, expected-output comparison, validation output, and
rollback notes in the maintainer PR body or handoff.
## Rejected Path
Do not promote analyzer threshold, classification, or risk-taxonomy changes
without corpus, snapshot, fixture, benchmark, golden, eval, or reference-set
evidence.
Do not suppress the `Deep Analyzer Evidence` PR-risk bucket just because the
change is small. Suppress it only when co-located evidence covers the same
analyzer surface.
Do not rely only on broad manual review notes. Analyzer changes need
representative repository shapes or commit-history cases with expected outputs.
Do not post PR comments, create check runs, sync Linear, publish packages, edit
plugins, or create release artifacts from the evaluator run.
## Minimum Validation
- `npm test -- src/analyzers/deep-analyzer-corpus.test.ts src/lib/analyzer.compare.test.ts`
- `npm run typecheck`
- `npm run lint`
- `git diff --check`
- Markdown lint when docs or playbooks are touched
Preserve source attribution for analyzer evidence and include rollback guidance
for the future maintainer PR.

View File

@@ -0,0 +1,35 @@
{
"schema_version": "ecc.evaluator-rag.report.v1",
"scenario_id": "deep-analyzer-evidence",
"run_id": "2026-05-12-deep-analyzer-evidence-prototype",
"result": "prototype_passed",
"read_only": true,
"scores": {
"corpus_retrieval": 0.95,
"expected_output_comparison": 0.91,
"representative_case_coverage": 0.89,
"taxonomy_gap_safety": 0.93,
"publication_safety": 1
},
"findings": [
{
"id": "corpus-required",
"severity": "warning",
"summary": "Deep-analysis behavior changes need maintained corpus, snapshot, fixture, benchmark, golden, eval, or reference-set evidence before promotion."
},
{
"id": "expected-output-required",
"severity": "warning",
"summary": "Analyzer changes should compare expected folder type, module organization, test location, primary language, commit pattern, or workflow outputs."
},
{
"id": "read-only-routing",
"severity": "info",
"summary": "The evaluator can recommend a maintainer PR but cannot post PR comments, check runs, Linear sync updates, packages, plugins, or release actions itself."
}
],
"recommended_next_action": {
"candidate_id": "corpus-backed-analyzer-change",
"action": "Use the promoted deep-analyzer evidence playbook for PRs that change repository, commit, architecture, workflow, pattern, or risk-taxonomy analysis behavior."
}
}

View File

@@ -0,0 +1,57 @@
{
"schema_version": "ecc.evaluator-rag.scenario.v1",
"scenario_id": "deep-analyzer-evidence",
"title": "Require analyzer corpus evidence before promoting deep-analysis changes",
"mode": "read_only_prototype",
"objective": "Given a change to repository, commit, architecture, pattern, or deep-analysis logic, retrieve maintained analyzer corpus evidence and expected-output comparisons before promoting analyzer behavior or risk-taxonomy changes.",
"sources": [
{
"kind": "sibling_repo_doc",
"path": "../ECC-Tools/README.md",
"purpose": "Public description of deep-analyzer predictive follow-ups and the Deep Analyzer Evidence PR-risk bucket"
},
{
"kind": "sibling_repo_source",
"path": "../ECC-Tools/src/lib/analyzer.ts",
"purpose": "Predictive follow-up logic that flags analyzer changes without corpus, snapshot, fixture, or benchmark evidence"
},
{
"kind": "sibling_repo_source",
"path": "../ECC-Tools/src/lib/pr-risk-taxonomy.ts",
"purpose": "Non-blocking PR-risk taxonomy bucket for deep-analyzer evidence"
},
{
"kind": "sibling_repo_fixture",
"path": "../ECC-Tools/src/analyzers/fixtures/deep-analyzer-corpus.ts",
"purpose": "Maintained corpus cases for representative repository shapes, commit histories, and expected analyzer outputs"
},
{
"kind": "sibling_repo_test",
"command": "npm test -- src/analyzers/deep-analyzer-corpus.test.ts src/lib/analyzer.compare.test.ts",
"purpose": "Regression evidence for analyzer corpus outputs and deep-analyzer follow-up generation"
}
],
"retrieval_questions": [
"Which analyzer surface changed: repository structure, architecture, code style, commit messages, workflow detection, pattern detection, or risk taxonomy?",
"Which maintained corpus case or reference set covers the same analyzer behavior?",
"Do expected outputs compare folder type, module organization, test location, primary language, commit type, and workflow names?",
"Does the PR add analyzer corpus, snapshot, fixture, benchmark, golden, eval, or reference-set evidence alongside analyzer code changes?",
"Does the evaluator keep PR comments, check runs, Linear sync, package changes, and publication actions out of the read-only pass?"
],
"forbidden_actions": [
"promoting repository, commit, architecture, or deep-analysis changes without analyzer corpus evidence",
"suppressing the Deep Analyzer Evidence risk bucket without co-located corpus, snapshot, fixture, or benchmark evidence",
"changing analyzer thresholds or classifications without expected-output comparison",
"relying only on broad manual review notes instead of representative repository and commit-history cases",
"posting PR comments, check runs, or Linear sync updates from this read-only evaluator run",
"changing package, plugin, release, or publication state from this evaluator run"
],
"acceptance_gates": [
"changed analyzer surface is named",
"maintained corpus or reference-set path is included",
"expected analyzer outputs are compared",
"representative repository shape or commit history is described",
"regression command is named",
"at least one no-corpus analyzer change is rejected"
]
}

View File

@@ -0,0 +1,45 @@
{
"schema_version": "ecc.evaluator-rag.trace.v1",
"scenario_id": "deep-analyzer-evidence",
"run_id": "2026-05-12-deep-analyzer-evidence-prototype",
"read_only": true,
"events": [
{
"phase": "observation",
"summary": "A deep-analysis PR changes repository, commit, architecture, workflow, pattern, or risk-taxonomy behavior. The evaluator records the touched analyzer surface and remains read-only.",
"evidence": [
"../ECC-Tools/src/lib/analyzer.ts",
"../ECC-Tools/src/lib/pr-risk-taxonomy.ts"
]
},
{
"phase": "retrieval",
"summary": "Retrieved the maintained analyzer corpus, corpus regression test, and follow-up tests that distinguish corpus-backed analyzer changes from no-evidence analyzer rewrites.",
"evidence": [
"../ECC-Tools/src/analyzers/fixtures/deep-analyzer-corpus.ts",
"../ECC-Tools/src/analyzers/deep-analyzer-corpus.test.ts",
"../ECC-Tools/src/lib/analyzer.compare.test.ts"
]
},
{
"phase": "proposal",
"summary": "Generated two candidate playbooks: corpus-backed analyzer change, and threshold-only analyzer rewrite without expected-output evidence.",
"candidate_ids": [
"corpus-backed-analyzer-change",
"threshold-only-analyzer-rewrite"
]
},
{
"phase": "verification",
"summary": "Accepted the corpus-backed analyzer change because it names representative repository/commit cases and expected-output comparisons. Rejected the threshold-only rewrite because it lacks corpus or benchmark evidence.",
"evidence": [
"examples/evaluator-rag-prototype/deep-analyzer-evidence/verifier-result.json"
]
},
{
"phase": "promotion",
"summary": "Promoted only the read-only deep-analyzer evidence playbook. Future analyzer edits must move through maintainer PRs with corpus evidence, regression commands, and rollback notes.",
"promoted_candidate_id": "corpus-backed-analyzer-change"
}
]
}

View File

@@ -0,0 +1,35 @@
{
"schema_version": "ecc.evaluator-rag.verifier.v1",
"scenario_id": "deep-analyzer-evidence",
"run_id": "2026-05-12-deep-analyzer-evidence-prototype",
"read_only": true,
"candidates": [
{
"candidate_id": "corpus-backed-analyzer-change",
"decision": "accepted",
"score": 0.92,
"reasons": [
"names the changed analyzer surface and matching maintained corpus case",
"compares expected analyzer outputs for representative repository and commit-history inputs",
"keeps Deep Analyzer Evidence taxonomy behavior tied to co-located corpus or benchmark evidence",
"names the regression command that exercises corpus and follow-up behavior",
"keeps PR comments, check runs, Linear sync, and publication actions out of the evaluator run"
],
"rollback": "Revert the future analyzer PR and restore the prior corpus expectations; no hosted check-run, Linear, package, or publication state changes in this read-only playbook."
},
{
"candidate_id": "threshold-only-analyzer-rewrite",
"decision": "rejected",
"score": 0.13,
"reasons": [
"changes analyzer thresholds without corpus evidence",
"does not compare expected outputs against representative repository or commit-history cases",
"does not update analyzer corpus, snapshot, fixture, benchmark, golden, eval, or reference-set artifacts",
"would suppress Deep Analyzer Evidence risk without proof",
"does not name a regression command"
],
"rollback": "Do not promote this analyzer rewrite; restart from maintained corpus inputs, expected-output snapshots, and a focused maintainer PR."
}
],
"promoted_candidate_id": "corpus-backed-analyzer-change"
}

View File

@@ -0,0 +1,49 @@
# Harness Config Quality Playbook
Candidate id: `adapter-matrix-backed-drift-check`
Use this playbook when a PR, install change, or setup recommendation touches
MCP, plugins, hooks, commands, agents, rules, install targets, or harness
adapter surfaces.
## Accepted Path
1. Identify the touched harness/config surface.
2. Retrieve the adapter state from
`docs/architecture/harness-adapter-compliance.md` or
`scripts/lib/harness-adapter-compliance.js`.
3. Record whether the harness is `Native`, `Adapter-backed`,
`Instruction-backed`, or `Reference-only`.
4. Name the install/onramp path and verification command from the matrix.
5. Preserve existing user and project config by using merge, dry-run, or
explicit no-overwrite behavior.
6. Run the relevant validation gate:
- `npm run harness:adapters -- --check`
- `npm run harness:audit -- --format json`
- `node tests/lib/install-targets.test.js`
- `node tests/opencode-plugin-hooks.test.js`
- `node tests/docs/mcp-management-docs.test.js`
7. Promote a config recommendation only when the evidence matches the harness
state and the config preservation behavior is explicit.
## Rejected Path
Do not claim Claude hook parity for Codex, Gemini, Zed, OpenCode, or other
harnesses unless the adapter matrix and tests prove it.
Do not overwrite `settings.json`, MCP configs, plugin manifests, rule files, or
command surfaces without a merge/dry-run path and a rollback note.
Do not toggle live MCP servers, publish plugins, or edit user-level harness
config from the evaluator run.
## Minimum Validation
- `npm run harness:adapters -- --check`
- `npm run harness:audit -- --format json`
- Focused install, plugin, MCP, or hook test for the changed surface
- `git diff --check`
- Markdown lint when docs are touched
Record the adapter state, risk note, validation commands, and config
preservation behavior in the maintainer PR body or handoff.

View File

@@ -0,0 +1,35 @@
{
"schema_version": "ecc.evaluator-rag.report.v1",
"scenario_id": "harness-config-quality",
"run_id": "2026-05-12-harness-config-quality-prototype",
"result": "prototype_passed",
"read_only": true,
"scores": {
"adapter_evidence": 0.94,
"config_preservation": 0.88,
"verification_specificity": 0.9,
"parity_claim_safety": 1,
"publication_safety": 1
},
"findings": [
{
"id": "adapter-state-required",
"severity": "warning",
"summary": "Harness recommendations must retrieve the adapter state before claiming native support or runtime enforcement."
},
{
"id": "config-overwrite-risk",
"severity": "warning",
"summary": "MCP, hook, plugin, command, and rule changes must preserve existing user/project config and use dry-run or merge behavior when available."
},
{
"id": "verification-command-needed",
"severity": "info",
"summary": "The accepted playbook names harness adapter, harness audit, install-target, or plugin-hook regression gates before a config change can merge."
}
],
"recommended_next_action": {
"candidate_id": "adapter-matrix-backed-drift-check",
"action": "Use the promoted harness-config quality playbook for PRs or setup work touching MCP, plugin, hook, command, agent, rule, or adapter surfaces."
}
}

View File

@@ -0,0 +1,57 @@
{
"schema_version": "ecc.evaluator-rag.scenario.v1",
"scenario_id": "harness-config-quality",
"title": "Detect harness config drift before changing adapters or installs",
"mode": "read_only_prototype",
"objective": "Given a change to MCP, plugin, hook, command, agent, or harness adapter surfaces, retrieve the adapter matrix and validation evidence before promoting a setup recommendation or config change.",
"sources": [
{
"kind": "repo_doc",
"path": "docs/architecture/harness-adapter-compliance.md",
"purpose": "Public adapter matrix that names harness state, install/onramp paths, verification commands, and risk notes"
},
{
"kind": "repo_source",
"path": "scripts/lib/harness-adapter-compliance.js",
"purpose": "Structured source of truth for the adapter compliance matrix"
},
{
"kind": "repo_config",
"path": "hooks/hooks.json",
"purpose": "Claude hook surface that must not be assumed portable without adapter evidence"
},
{
"kind": "repo_config",
"path": "mcp-configs/mcp-servers.json",
"purpose": "Reference MCP config that can drift from harness-specific runtime semantics"
},
{
"kind": "repo_test",
"command": "npm run harness:adapters -- --check",
"purpose": "Adapter matrix consistency gate"
}
],
"retrieval_questions": [
"Which harness or config surface changed: MCP, plugin, hook, command, agent, rule, or adapter?",
"Does the adapter matrix classify this harness as native, adapter-backed, instruction-backed, or reference-only?",
"Which install path, verification command, risk note, owner, and source doc apply?",
"Does the recommendation preserve existing user config rather than overwriting it?",
"Which compatibility regression or harness audit command proves the setup still works?"
],
"forbidden_actions": [
"claiming native support for instruction-backed or reference-only harnesses",
"copying Claude hook semantics into Codex, Gemini, Zed, or OpenCode without adapter evidence",
"silently overwriting existing user MCP, hook, plugin, command, or rule config",
"disabling or enabling live MCP servers from a read-only evaluator run",
"shipping an adapter change without a verification command",
"publishing packages or plugins from this evaluator run"
],
"acceptance_gates": [
"adapter state is retrieved from the matrix",
"install or onramp path is named",
"verification command is named",
"risk note is preserved",
"config-preservation behavior is explicit",
"at least one unsupported parity claim is rejected"
]
}

View File

@@ -0,0 +1,45 @@
{
"schema_version": "ecc.evaluator-rag.trace.v1",
"scenario_id": "harness-config-quality",
"run_id": "2026-05-12-harness-config-quality-prototype",
"read_only": true,
"events": [
{
"phase": "observation",
"summary": "A setup recommendation or PR touches MCP, plugin, hook, command, agent, rule, or adapter surfaces. The evaluator records the surface without editing local or user-level config.",
"evidence": [
"docs/architecture/harness-adapter-compliance.md",
"scripts/lib/harness-adapter-compliance.js"
]
},
{
"phase": "retrieval",
"summary": "Retrieved the adapter state, install/onramp path, verification commands, risk notes, and config-preservation tests for the affected harness.",
"evidence": [
"npm run harness:adapters -- --check",
"npm run harness:audit -- --format json",
"node tests/lib/install-targets.test.js"
]
},
{
"phase": "proposal",
"summary": "Generated two candidate playbooks: adapter-matrix-backed drift check, and unsupported hook parity claim that copies Claude semantics into every harness.",
"candidate_ids": [
"adapter-matrix-backed-drift-check",
"unsupported-hook-parity-claim"
]
},
{
"phase": "verification",
"summary": "Accepted the matrix-backed drift check because it names state, install path, verification, and preservation behavior. Rejected unsupported hook parity because it overclaims portability.",
"evidence": [
"examples/evaluator-rag-prototype/harness-config-quality/verifier-result.json"
]
},
{
"phase": "promotion",
"summary": "Promoted only the read-only harness-config quality playbook. The evaluator does not overwrite configs, toggle MCP servers, publish plugins, or claim native support.",
"promoted_candidate_id": "adapter-matrix-backed-drift-check"
}
]
}

View File

@@ -0,0 +1,35 @@
{
"schema_version": "ecc.evaluator-rag.verifier.v1",
"scenario_id": "harness-config-quality",
"run_id": "2026-05-12-harness-config-quality-prototype",
"read_only": true,
"candidates": [
{
"candidate_id": "adapter-matrix-backed-drift-check",
"decision": "accepted",
"score": 0.92,
"reasons": [
"retrieves adapter state before making a support claim",
"names install or onramp path and verification commands",
"preserves existing user and project config",
"keeps runtime MCP toggles and plugin publication out of the evaluator run",
"requires focused compatibility regression coverage"
],
"rollback": "Revert the future adapter/config PR or restore the prior config merge behavior; no live user config is changed by this read-only playbook."
},
{
"candidate_id": "unsupported-hook-parity-claim",
"decision": "rejected",
"score": 0.16,
"reasons": [
"claims native support without adapter matrix evidence",
"copies Claude hook semantics into instruction-backed harnesses",
"does not name a verification command",
"does not preserve existing MCP or hook config",
"risks publishing or installing unsupported plugin behavior"
],
"rollback": "Do not publish this setup recommendation; restart from adapter state, risk note, and config-preservation evidence."
}
],
"promoted_candidate_id": "adapter-matrix-backed-drift-check"
}

View File

@@ -0,0 +1,35 @@
{
"schema_version": "ecc.evaluator-rag.report.v1",
"scenario_id": "stale-pr-salvage-maintainer-branch",
"run_id": "2026-05-12-cleanup-salvage-prototype",
"result": "prototype_passed",
"read_only": true,
"scores": {
"source_attribution": 1,
"blast_radius_control": 1,
"manual_review_respected": 1,
"validation_specificity": 0.8,
"publication_safety": 1
},
"findings": [
{
"id": "salvage-policy-usable",
"severity": "info",
"summary": "The stale-salvage ledger and maintainer PR examples provide enough evidence to promote a reusable maintainer-owned salvage playbook."
},
{
"id": "translation-tail-blocked",
"severity": "warning",
"summary": "Localization tails remain useful but must stay translator/manual-review only."
},
{
"id": "release-actions-blocked",
"severity": "warning",
"summary": "Release, npm, plugin, billing, and announcement actions remain outside this evaluator run and require separate approval."
}
],
"recommended_next_action": {
"candidate_id": "maintainer-salvage-branch",
"action": "Use the promoted playbook for future stale cleanup batches and add additional evaluator/RAG scenarios for CI failure diagnosis, harness-config drift, billing readiness, and AgentShield policy exceptions."
}
}

View File

@@ -0,0 +1,56 @@
{
"schema_version": "ecc.evaluator-rag.scenario.v1",
"scenario_id": "stale-pr-salvage-maintainer-branch",
"title": "Recover useful stale PR work through maintainer-owned branches",
"mode": "read_only_prototype",
"objective": "Given a closed stale PR batch, identify useful work, reject unsafe bulk imports, and promote only a maintainer-owned salvage playbook with attribution and validation.",
"sources": [
{
"kind": "repo_doc",
"path": "docs/stale-pr-salvage-ledger.md",
"purpose": "Durable source-to-disposition mapping for stale PR cleanup"
},
{
"kind": "repo_doc",
"path": "docs/legacy-artifact-inventory.md",
"purpose": "Import guardrails for legacy and private-context material"
},
{
"kind": "roadmap",
"path": "docs/ECC-2.0-GA-ROADMAP.md",
"purpose": "Operating rule and current execution lane"
},
{
"kind": "github_pr",
"url": "https://github.com/affaan-m/everything-claude-code/pull/1815",
"purpose": "Example maintainer-owned stale salvage PR with attribution"
},
{
"kind": "github_pr",
"url": "https://github.com/affaan-m/everything-claude-code/pull/1818",
"purpose": "Example gap pass classifying already-present and skipped stale work"
}
],
"retrieval_questions": [
"Which closed PRs contain useful work that is not already present?",
"Which files or concepts are unsafe to cherry-pick without manual review?",
"Which current docs, skills, commands, or tests are the correct integration points?",
"Which validation gates are required before the salvage work can merge?"
],
"forbidden_actions": [
"closing, reopening, or commenting on PRs",
"merging PRs",
"creating release tags",
"publishing packages or plugins",
"copying private paths, secrets, or raw personal context",
"blindly cherry-picking bulk localization"
],
"acceptance_gates": [
"source attribution is preserved",
"salvage ledger or equivalent tracker is updated",
"translation/manual-review tails remain blocked",
"candidate action is reversible and maintainer-owned",
"validation commands are named",
"at least one unsafe candidate is rejected"
]
}

View File

@@ -0,0 +1,57 @@
# Skill Quality Evidence Playbook
Candidate id: `evidence-backed-skill-amendment`
Use this playbook when a PR or follow-up proposes adding, rewriting, or
amending a skill, agent, command, or rule guidance surface.
## Accepted Path
1. Name the changed guidance surface and source file.
2. Retrieve the quality contract from `docs/SKILL-DEVELOPMENT-GUIDE.md`.
3. Compare the proposed change to nearby focused examples under `skills/*/SKILL.md`.
4. Record the evidence source that justifies the change:
- observed skill-run failure;
- user feedback;
- repeated review finding;
- reference-set gap;
- failing example or regression test.
5. Keep the scope narrow. One skill should cover one domain, workflow, or
reusable pattern.
6. Add or update examples only when they can be validated.
7. Run the relevant validation gate:
- `node scripts/ci/validate-skills.js`
- `node tests/lib/skill-improvement.test.js`
- `node tests/lib/skill-evolution.test.js`
- `npm run catalog:check`
- language-specific example commands such as `npx tsc --noEmit`,
`python -m py_compile`, or `go build` when examples are touched.
8. Record validation output, source attribution, and rollback notes in the
maintainer PR body or handoff.
## Rejected Path
Do not promote a vague skill rewrite because the prose "sounds better" without
observed failure evidence, examples, or a reference set.
Do not merge multi-domain catch-all skills that duplicate focused skills or make
activation less predictable.
Do not copy private operator context, secrets, tokens, personal paths, customer
data, or unpublished release claims into skills.
Do not update package manifests, plugin manifests, catalogs, release notes, or
publication state from the evaluator run.
## Minimum Validation
- `node scripts/ci/validate-skills.js`
- `npm run catalog:check` when catalog/package-visible skill surfaces change
- Focused skill-improvement or skill-evolution regression test when amendment
behavior changes
- Language-specific compile/lint checks for touched examples
- `git diff --check`
- Markdown lint when docs or playbooks are touched
Preserve source attribution for contributed skill material and include rollback
guidance for the future maintainer PR.

View File

@@ -0,0 +1,35 @@
{
"schema_version": "ecc.evaluator-rag.report.v1",
"scenario_id": "skill-quality-evidence",
"run_id": "2026-05-12-skill-quality-evidence-prototype",
"result": "prototype_passed",
"read_only": true,
"scores": {
"skill_contract_retrieval": 0.94,
"observed_failure_evidence": 0.88,
"example_quality": 0.9,
"validation_specificity": 0.93,
"publication_safety": 1
},
"findings": [
{
"id": "examples-required",
"severity": "warning",
"summary": "Skill-quality changes need working examples or regression evidence; prose-only rewrites are not enough for promotion."
},
{
"id": "observation-source-required",
"severity": "warning",
"summary": "Skill amendments should cite observed failure, user feedback, or a reference-set gap rather than broad style preference."
},
{
"id": "publication-stays-blocked",
"severity": "info",
"summary": "The evaluator can recommend a maintainer PR, but it cannot update package, plugin, catalog, or publication state itself."
}
],
"recommended_next_action": {
"candidate_id": "evidence-backed-skill-amendment",
"action": "Use the promoted skill-quality playbook for PRs that add, rewrite, or amend skills, agents, commands, or rules guidance."
}
}

View File

@@ -0,0 +1,57 @@
{
"schema_version": "ecc.evaluator-rag.scenario.v1",
"scenario_id": "skill-quality-evidence",
"title": "Require examples and validation before promoting skill guidance changes",
"mode": "read_only_prototype",
"objective": "Given a change to skills, agents, commands, or rules guidance, retrieve the skill development contract and observed skill-run evidence before promoting an amendment or new skill-quality recommendation.",
"sources": [
{
"kind": "repo_doc",
"path": "docs/SKILL-DEVELOPMENT-GUIDE.md",
"purpose": "Public skill quality contract for frontmatter, focused scope, examples, testing, and submission evidence"
},
{
"kind": "repo_source",
"path": "scripts/ci/validate-skills.js",
"purpose": "Curated skill structure and frontmatter validation gate"
},
{
"kind": "repo_source",
"path": "scripts/lib/skill-improvement/",
"purpose": "Observation, health, amendment, and evaluation helpers for evidence-backed skill evolution"
},
{
"kind": "repo_test",
"command": "node tests/lib/skill-improvement.test.js",
"purpose": "Regression coverage for observation-backed skill amendment and evaluation scaffolds"
},
{
"kind": "repo_test",
"command": "node scripts/ci/validate-skills.js",
"purpose": "Skill structure validation before catalog or package changes merge"
}
],
"retrieval_questions": [
"Which skill, agent, command, or rule surface changed?",
"Does the change preserve focused scope, clear activation text, and working examples?",
"Which validation command proves frontmatter, catalog, example, or behavior quality?",
"Does observed failure or user feedback justify the amendment?",
"Does the candidate avoid private context, secrets, personal paths, and publication actions?"
],
"forbidden_actions": [
"promoting a skill rewrite without examples, validation, or observed failure evidence",
"adding broad multi-domain skills that duplicate existing focused skills",
"shipping code examples that are uncompiled, untested, or disconnected from the skill guidance",
"copying private operator context, secrets, tokens, or personal paths into skills",
"changing package, plugin, catalog, or publication state from this evaluator run",
"claiming a skill-quality improvement without a reference set or regression command"
],
"acceptance_gates": [
"changed skill or guidance surface is named",
"source evidence includes the skill development guide or current skill examples",
"observed failure, user feedback, or reference-set gap is recorded",
"validation command is named",
"example or regression evidence is attached",
"at least one vague no-evidence rewrite is rejected"
]
}

View File

@@ -0,0 +1,46 @@
{
"schema_version": "ecc.evaluator-rag.trace.v1",
"scenario_id": "skill-quality-evidence",
"run_id": "2026-05-12-skill-quality-evidence-prototype",
"read_only": true,
"events": [
{
"phase": "observation",
"summary": "A skill or guidance PR proposes updated instructions. The evaluator records the changed surface and stays read-only; it does not edit skills, package manifests, catalogs, or publication state.",
"evidence": [
"docs/SKILL-DEVELOPMENT-GUIDE.md",
"scripts/ci/validate-skills.js"
]
},
{
"phase": "retrieval",
"summary": "Retrieved the skill quality contract, existing focused skill examples, observation-backed amendment helpers, and validation commands for skill structure and regression evidence.",
"evidence": [
"node scripts/ci/validate-skills.js",
"node tests/lib/skill-improvement.test.js",
"node tests/lib/skill-evolution.test.js",
"npm run catalog:check"
]
},
{
"phase": "proposal",
"summary": "Generated two candidate playbooks: evidence-backed skill amendment, and broad rewrite with no examples or validation.",
"candidate_ids": [
"evidence-backed-skill-amendment",
"vague-skill-rewrite"
]
},
{
"phase": "verification",
"summary": "Accepted the evidence-backed amendment because it names observed failure evidence, examples, and validation commands. Rejected the vague rewrite because it lacks a reference set and testable examples.",
"evidence": [
"examples/evaluator-rag-prototype/skill-quality-evidence/verifier-result.json"
]
},
{
"phase": "promotion",
"summary": "Promoted only the read-only skill-quality evidence playbook. Future skill edits must move through maintainer PRs with source attribution, validation, and rollback notes.",
"promoted_candidate_id": "evidence-backed-skill-amendment"
}
]
}

View File

@@ -0,0 +1,35 @@
{
"schema_version": "ecc.evaluator-rag.verifier.v1",
"scenario_id": "skill-quality-evidence",
"run_id": "2026-05-12-skill-quality-evidence-prototype",
"read_only": true,
"candidates": [
{
"candidate_id": "evidence-backed-skill-amendment",
"decision": "accepted",
"score": 0.91,
"reasons": [
"retrieves the skill development guide and existing focused skill examples",
"records observed failure, user feedback, or reference-set gap before proposing an amendment",
"names validation commands for skill structure, examples, catalog consistency, and regression behavior",
"keeps package, plugin, catalog, and publication actions out of the evaluator run",
"includes rollback guidance for reverting the future maintainer PR"
],
"rollback": "Revert the future skill-amendment PR and restore the prior SKILL.md content; no installed user skill or publication surface changes in this read-only playbook."
},
{
"candidate_id": "vague-skill-rewrite",
"decision": "rejected",
"score": 0.14,
"reasons": [
"does not name observed failure evidence or user feedback",
"rewrites broad skill guidance without focused scope",
"does not include working examples or a reference set",
"does not name a regression command",
"risks changing catalog or publication state from evaluator output"
],
"rollback": "Do not promote this rewrite; restart from observed skill-run evidence, example validation, and a focused maintainer PR."
}
],
"promoted_candidate_id": "evidence-backed-skill-amendment"
}

View File

@@ -0,0 +1,46 @@
{
"schema_version": "ecc.evaluator-rag.trace.v1",
"scenario_id": "stale-pr-salvage-maintainer-branch",
"run_id": "2026-05-12-cleanup-salvage-prototype",
"read_only": true,
"events": [
{
"phase": "observation",
"summary": "Public PR, issue, and discussion queues are clear; release publication remains approval-gated; stale-salvage ledger has landed, skipped, superseded, and manual-review states.",
"evidence": [
"docs/ECC-2.0-GA-ROADMAP.md",
"docs/stale-pr-salvage-ledger.md"
]
},
{
"phase": "retrieval",
"summary": "Retrieved stale PR source mappings, existing maintainer salvage examples, legacy import rules, and manual-review localization tails.",
"evidence": [
"docs/stale-pr-salvage-ledger.md",
"docs/legacy-artifact-inventory.md",
"https://github.com/affaan-m/everything-claude-code/pull/1815",
"https://github.com/affaan-m/everything-claude-code/pull/1818"
]
},
{
"phase": "proposal",
"summary": "Generated two candidate playbooks: maintainer-owned salvage branch with attribution, and blind cherry-pick of stale translations.",
"candidate_ids": [
"maintainer-salvage-branch",
"blind-cherry-pick-translations"
]
},
{
"phase": "verification",
"summary": "Accepted the maintainer-owned salvage branch and rejected blind translation cherry-picking because it violates manual-review and attribution gates.",
"evidence": [
"examples/evaluator-rag-prototype/verifier-result.json"
]
},
{
"phase": "promotion",
"summary": "Promoted only the maintainer-owned salvage branch playbook as a reusable process. No repository, GitHub, release, billing, or plugin publication action is performed by this prototype.",
"promoted_candidate_id": "maintainer-salvage-branch"
}
]
}

View File

@@ -0,0 +1,35 @@
{
"schema_version": "ecc.evaluator-rag.verifier.v1",
"scenario_id": "stale-pr-salvage-maintainer-branch",
"run_id": "2026-05-12-cleanup-salvage-prototype",
"read_only": true,
"candidates": [
{
"candidate_id": "maintainer-salvage-branch",
"decision": "accepted",
"score": 0.94,
"reasons": [
"preserves source PR attribution",
"keeps work on a fresh maintainer-owned branch",
"updates the salvage ledger",
"names validation gates",
"does not perform release or publication actions"
],
"rollback": "Close the maintainer PR or revert its merge commit; source PR state remains unchanged."
},
{
"candidate_id": "blind-cherry-pick-translations",
"decision": "rejected",
"score": 0.21,
"reasons": [
"bulk localization requires translator/manual review",
"does not preserve enough source attribution",
"could import stale generated docs",
"does not name validation gates",
"risks bypassing current catalog and install architecture"
],
"rollback": "Do not create this branch; keep the localization tail in translator/manual-review state."
}
],
"promoted_candidate_id": "maintainer-salvage-branch"
}

6
package-lock.json generated
View File

@@ -1044,9 +1044,9 @@
"license": "MIT"
},
"node_modules/fast-uri": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.0.tgz",
"integrity": "sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA==",
"version": "3.1.2",
"resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.2.tgz",
"integrity": "sha512-rVjf7ArG3LTk+FS6Yw81V1DLuZl1bRbNrev6Tmd/9RaroeeRRJhAt7jg/6YFxbvAQXUCavSoZhPPj6oOx+5KjQ==",
"funding": [
{
"type": "github",

View File

@@ -270,7 +270,12 @@
"skills/windows-desktop-e2e/",
"skills/workspace-surface-audit/",
"skills/x-api/",
"the-security-guide.md"
"the-security-guide.md",
"!**/__pycache__/**",
"!**/*.pyc",
"!**/*.pyo",
"!**/*.pyd",
"!**/.pytest_cache/**"
],
"bin": {
"ecc": "scripts/ecc.js",

View File

@@ -0,0 +1,417 @@
'use strict';
const assert = require('assert');
const fs = require('fs');
const path = require('path');
const repoRoot = path.resolve(__dirname, '..', '..');
const fixtureRoot = path.join(repoRoot, 'examples', 'evaluator-rag-prototype');
let passed = 0;
let failed = 0;
function test(name, fn) {
try {
fn();
console.log(`${name}`);
passed++;
} catch (error) {
console.log(`${name}`);
console.log(` Error: ${error.message}`);
failed++;
}
}
function read(relativePath) {
return fs.readFileSync(path.join(repoRoot, relativePath), 'utf8');
}
function readJson(fileName) {
return JSON.parse(fs.readFileSync(path.join(fixtureRoot, fileName), 'utf8'));
}
function readFixtureJson(relativePath) {
return JSON.parse(fs.readFileSync(path.join(fixtureRoot, relativePath), 'utf8'));
}
console.log('\n=== Testing evaluator RAG prototype ===\n');
test('architecture doc records the artifact contract and reference pressure', () => {
const source = read('docs/architecture/evaluator-rag-prototype.md');
for (const required of [
'Scenario spec',
'Trace',
'Report',
'Candidate playbook',
'Verifier result',
'Meta-Harness',
'Autocontext',
'Claude HUD',
'Hermes Agent',
'dmux, Orca, Superset, and Ghast',
'ECC Tools'
]) {
assert.ok(source.includes(required), `Missing doc requirement: ${required}`);
}
});
test('fixtures use one scenario id and declare read-only behavior', () => {
const scenario = readJson('scenario.json');
const trace = readJson('trace.json');
const report = readJson('report.json');
const verifier = readJson('verifier-result.json');
assert.strictEqual(scenario.schema_version, 'ecc.evaluator-rag.scenario.v1');
assert.strictEqual(trace.schema_version, 'ecc.evaluator-rag.trace.v1');
assert.strictEqual(report.schema_version, 'ecc.evaluator-rag.report.v1');
assert.strictEqual(verifier.schema_version, 'ecc.evaluator-rag.verifier.v1');
for (const artifact of [trace, report, verifier]) {
assert.strictEqual(artifact.scenario_id, scenario.scenario_id);
assert.strictEqual(artifact.read_only, true);
}
});
test('trace covers the full self-improving harness loop', () => {
const trace = readJson('trace.json');
const phases = trace.events.map(event => event.phase);
for (const phase of ['observation', 'retrieval', 'proposal', 'verification', 'promotion']) {
assert.ok(phases.includes(phase), `Missing trace phase ${phase}`);
}
assert.ok(trace.events.some(event => event.promoted_candidate_id === 'maintainer-salvage-branch'));
});
test('scenario blocks unsafe write actions and release actions', () => {
const scenario = readJson('scenario.json');
const forbidden = scenario.forbidden_actions.join('\n');
for (const blocked of [
'closing, reopening, or commenting on PRs',
'merging PRs',
'creating release tags',
'publishing packages or plugins',
'copying private paths, secrets, or raw personal context',
'blindly cherry-picking bulk localization'
]) {
assert.ok(forbidden.includes(blocked), `Missing forbidden action: ${blocked}`);
}
});
test('verifier accepts maintainer salvage and rejects blind translation imports', () => {
const verifier = readJson('verifier-result.json');
const accepted = verifier.candidates.find(candidate => candidate.candidate_id === 'maintainer-salvage-branch');
const rejected = verifier.candidates.find(candidate => candidate.candidate_id === 'blind-cherry-pick-translations');
assert.ok(accepted, 'Missing accepted maintainer salvage candidate');
assert.ok(rejected, 'Missing rejected blind cherry-pick candidate');
assert.strictEqual(accepted.decision, 'accepted');
assert.strictEqual(rejected.decision, 'rejected');
assert.strictEqual(verifier.promoted_candidate_id, accepted.candidate_id);
assert.ok(accepted.score > rejected.score);
assert.ok(rejected.reasons.join('\n').includes('translator/manual review'));
});
test('candidate playbook preserves stale-salvage operating rules', () => {
const playbook = read('examples/evaluator-rag-prototype/candidate-playbook.md');
for (const required of [
'docs/stale-pr-salvage-ledger.md',
'source PR',
'maintainer-owned branch',
'Preserve attribution',
'translator/manual review',
'private operator context',
'git diff --check'
]) {
assert.ok(playbook.includes(required), `Missing playbook rule: ${required}`);
}
});
test('roadmap points to the evaluator RAG prototype and keeps hosted integration open', () => {
const roadmap = read('docs/ECC-2.0-GA-ROADMAP.md');
assert.ok(roadmap.includes('docs/architecture/evaluator-rag-prototype.md'));
assert.ok(roadmap.includes('examples/evaluator-rag-prototype/'));
assert.ok(roadmap.includes('Local corpus complete; hosted integration remains future'));
});
test('billing readiness scenario rejects launch copy overclaims', () => {
const scenario = readFixtureJson('billing-marketplace-readiness/scenario.json');
const trace = readFixtureJson('billing-marketplace-readiness/trace.json');
const report = readFixtureJson('billing-marketplace-readiness/report.json');
const verifier = readFixtureJson('billing-marketplace-readiness/verifier-result.json');
const playbook = read('examples/evaluator-rag-prototype/billing-marketplace-readiness/candidate-playbook.md');
assert.strictEqual(scenario.scenario_id, 'billing-marketplace-readiness');
assert.strictEqual(trace.scenario_id, scenario.scenario_id);
assert.strictEqual(report.scenario_id, scenario.scenario_id);
assert.strictEqual(verifier.scenario_id, scenario.scenario_id);
assert.strictEqual(trace.read_only, true);
assert.strictEqual(report.read_only, true);
assert.strictEqual(verifier.read_only, true);
for (const blocked of [
'creating or editing GitHub Marketplace listings',
'changing plan limits, subscriptions, seats, or entitlements',
'posting announcement copy',
'claiming live billing readiness from dry-run evidence alone'
]) {
assert.ok(scenario.forbidden_actions.includes(blocked), `Missing billing forbidden action: ${blocked}`);
}
const accepted = verifier.candidates.find(candidate => candidate.candidate_id === 'evidence-backed-billing-check');
const rejected = verifier.candidates.find(candidate => candidate.candidate_id === 'announcement-first-billing-copy');
assert.ok(accepted, 'Missing accepted billing evidence candidate');
assert.ok(rejected, 'Missing rejected announcement-overclaim candidate');
assert.strictEqual(accepted.decision, 'accepted');
assert.strictEqual(rejected.decision, 'rejected');
assert.strictEqual(verifier.promoted_candidate_id, accepted.candidate_id);
assert.ok(rejected.reasons.join('\n').includes('roadmap acceptance criteria'));
assert.ok(playbook.includes('remove-before-publication'));
assert.ok(playbook.includes('https://github.com/marketplace/ecc-tools'));
});
test('ci failure diagnosis scenario rejects rerun-only fixes', () => {
const scenario = readFixtureJson('ci-failure-diagnosis/scenario.json');
const trace = readFixtureJson('ci-failure-diagnosis/trace.json');
const report = readFixtureJson('ci-failure-diagnosis/report.json');
const verifier = readFixtureJson('ci-failure-diagnosis/verifier-result.json');
const playbook = read('examples/evaluator-rag-prototype/ci-failure-diagnosis/candidate-playbook.md');
assert.strictEqual(scenario.scenario_id, 'ci-failure-diagnosis');
assert.strictEqual(trace.scenario_id, scenario.scenario_id);
assert.strictEqual(report.scenario_id, scenario.scenario_id);
assert.strictEqual(verifier.scenario_id, scenario.scenario_id);
assert.strictEqual(trace.read_only, true);
assert.strictEqual(report.read_only, true);
assert.strictEqual(verifier.read_only, true);
for (const blocked of [
'rerunning CI until it passes without diagnosing the failure',
'pushing speculative fixes without a captured failing log excerpt',
'weakening or deleting tests to silence a failure',
'merging or publishing while required checks are red'
]) {
assert.ok(scenario.forbidden_actions.includes(blocked), `Missing CI forbidden action: ${blocked}`);
}
for (const required of [
'failing job and step are named',
'captured log excerpt is linked or summarized',
'changed-file context is compared to the failing step',
'local reproduction or regression command is named'
]) {
assert.ok(scenario.acceptance_gates.includes(required), `Missing CI acceptance gate: ${required}`);
}
const accepted = verifier.candidates.find(candidate => candidate.candidate_id === 'log-backed-minimal-fix');
const rejected = verifier.candidates.find(candidate => candidate.candidate_id === 'rerun-only-green-wait');
assert.ok(accepted, 'Missing accepted log-backed CI candidate');
assert.ok(rejected, 'Missing rejected rerun-only CI candidate');
assert.strictEqual(accepted.decision, 'accepted');
assert.strictEqual(rejected.decision, 'rejected');
assert.strictEqual(verifier.promoted_candidate_id, accepted.candidate_id);
assert.ok(rejected.reasons.join('\n').includes('failing log excerpt'));
assert.ok(playbook.includes('gh run view <run-id> --log-failed'));
assert.ok(playbook.includes('Full required GitHub Actions matrix before merge'));
});
test('harness config quality scenario rejects unsupported parity claims', () => {
const scenario = readFixtureJson('harness-config-quality/scenario.json');
const trace = readFixtureJson('harness-config-quality/trace.json');
const report = readFixtureJson('harness-config-quality/report.json');
const verifier = readFixtureJson('harness-config-quality/verifier-result.json');
const playbook = read('examples/evaluator-rag-prototype/harness-config-quality/candidate-playbook.md');
assert.strictEqual(scenario.scenario_id, 'harness-config-quality');
assert.strictEqual(trace.scenario_id, scenario.scenario_id);
assert.strictEqual(report.scenario_id, scenario.scenario_id);
assert.strictEqual(verifier.scenario_id, scenario.scenario_id);
assert.strictEqual(trace.read_only, true);
assert.strictEqual(report.read_only, true);
assert.strictEqual(verifier.read_only, true);
for (const blocked of [
'claiming native support for instruction-backed or reference-only harnesses',
'copying Claude hook semantics into Codex, Gemini, Zed, or OpenCode without adapter evidence',
'silently overwriting existing user MCP, hook, plugin, command, or rule config',
'publishing packages or plugins from this evaluator run'
]) {
assert.ok(scenario.forbidden_actions.includes(blocked), `Missing harness forbidden action: ${blocked}`);
}
for (const required of [
'adapter state is retrieved from the matrix',
'install or onramp path is named',
'verification command is named',
'config-preservation behavior is explicit'
]) {
assert.ok(scenario.acceptance_gates.includes(required), `Missing harness acceptance gate: ${required}`);
}
const accepted = verifier.candidates.find(candidate => candidate.candidate_id === 'adapter-matrix-backed-drift-check');
const rejected = verifier.candidates.find(candidate => candidate.candidate_id === 'unsupported-hook-parity-claim');
assert.ok(accepted, 'Missing accepted adapter-matrix candidate');
assert.ok(rejected, 'Missing rejected unsupported parity candidate');
assert.strictEqual(accepted.decision, 'accepted');
assert.strictEqual(rejected.decision, 'rejected');
assert.strictEqual(verifier.promoted_candidate_id, accepted.candidate_id);
assert.ok(rejected.reasons.join('\n').includes('native support'));
assert.ok(playbook.includes('npm run harness:adapters -- --check'));
assert.ok(playbook.includes('node tests/docs/mcp-management-docs.test.js'));
});
test('AgentShield policy exception scenario rejects blanket suppression', () => {
const scenario = readFixtureJson('agentshield-policy-exception/scenario.json');
const trace = readFixtureJson('agentshield-policy-exception/trace.json');
const report = readFixtureJson('agentshield-policy-exception/report.json');
const verifier = readFixtureJson('agentshield-policy-exception/verifier-result.json');
const playbook = read('examples/evaluator-rag-prototype/agentshield-policy-exception/candidate-playbook.md');
assert.strictEqual(scenario.scenario_id, 'agentshield-policy-exception');
assert.strictEqual(trace.scenario_id, scenario.scenario_id);
assert.strictEqual(report.scenario_id, scenario.scenario_id);
assert.strictEqual(verifier.scenario_id, scenario.scenario_id);
assert.strictEqual(trace.read_only, true);
assert.strictEqual(report.read_only, true);
assert.strictEqual(verifier.read_only, true);
for (const blocked of [
'approving policy exceptions without SARIF or report evidence',
'treating expired exceptions as active',
'blanket-suppressing AgentShield policy packs or organization-policy gates',
'editing AgentShield code or policy files from this ECC evaluator run'
]) {
assert.ok(scenario.forbidden_actions.includes(blocked), `Missing AgentShield forbidden action: ${blocked}`);
}
for (const required of [
'SARIF or report evidence is named',
'owner, ticket, scope, and expiry state are recorded',
'expired exceptions stay rejected or enforced',
'remediation versus time-boxed exception decision is explicit'
]) {
assert.ok(scenario.acceptance_gates.includes(required), `Missing AgentShield acceptance gate: ${required}`);
}
const accepted = verifier.candidates.find(candidate => candidate.candidate_id === 'sarif-backed-timeboxed-exception-review');
const rejected = verifier.candidates.find(candidate => candidate.candidate_id === 'blanket-policy-suppression');
assert.ok(accepted, 'Missing accepted AgentShield exception candidate');
assert.ok(rejected, 'Missing rejected blanket suppression candidate');
assert.strictEqual(accepted.decision, 'accepted');
assert.strictEqual(rejected.decision, 'rejected');
assert.strictEqual(verifier.promoted_candidate_id, accepted.candidate_id);
assert.ok(rejected.reasons.join('\n').includes('blanket-suppresses'));
assert.ok(playbook.includes('agentshield-policy/*'));
assert.ok(playbook.includes('owner, ticket, scope, expiry'));
assert.ok(playbook.includes('npx ecc-agentshield scan --format json'));
});
test('skill quality evidence scenario rejects vague rewrites', () => {
const scenario = readFixtureJson('skill-quality-evidence/scenario.json');
const trace = readFixtureJson('skill-quality-evidence/trace.json');
const report = readFixtureJson('skill-quality-evidence/report.json');
const verifier = readFixtureJson('skill-quality-evidence/verifier-result.json');
const playbook = read('examples/evaluator-rag-prototype/skill-quality-evidence/candidate-playbook.md');
assert.strictEqual(scenario.scenario_id, 'skill-quality-evidence');
assert.strictEqual(trace.scenario_id, scenario.scenario_id);
assert.strictEqual(report.scenario_id, scenario.scenario_id);
assert.strictEqual(verifier.scenario_id, scenario.scenario_id);
assert.strictEqual(trace.read_only, true);
assert.strictEqual(report.read_only, true);
assert.strictEqual(verifier.read_only, true);
for (const blocked of [
'promoting a skill rewrite without examples, validation, or observed failure evidence',
'adding broad multi-domain skills that duplicate existing focused skills',
'copying private operator context, secrets, tokens, or personal paths into skills',
'claiming a skill-quality improvement without a reference set or regression command'
]) {
assert.ok(scenario.forbidden_actions.includes(blocked), `Missing skill-quality forbidden action: ${blocked}`);
}
for (const required of [
'changed skill or guidance surface is named',
'observed failure, user feedback, or reference-set gap is recorded',
'validation command is named',
'example or regression evidence is attached'
]) {
assert.ok(scenario.acceptance_gates.includes(required), `Missing skill-quality acceptance gate: ${required}`);
}
const accepted = verifier.candidates.find(candidate => candidate.candidate_id === 'evidence-backed-skill-amendment');
const rejected = verifier.candidates.find(candidate => candidate.candidate_id === 'vague-skill-rewrite');
assert.ok(accepted, 'Missing accepted skill-quality candidate');
assert.ok(rejected, 'Missing rejected vague rewrite candidate');
assert.strictEqual(accepted.decision, 'accepted');
assert.strictEqual(rejected.decision, 'rejected');
assert.strictEqual(verifier.promoted_candidate_id, accepted.candidate_id);
assert.ok(rejected.reasons.join('\n').includes('does not include working examples'));
assert.ok(playbook.includes('docs/SKILL-DEVELOPMENT-GUIDE.md'));
assert.ok(playbook.includes('node scripts/ci/validate-skills.js'));
assert.ok(playbook.includes('observed skill-run failure'));
});
test('deep analyzer evidence scenario rejects no-corpus analyzer changes', () => {
const scenario = readFixtureJson('deep-analyzer-evidence/scenario.json');
const trace = readFixtureJson('deep-analyzer-evidence/trace.json');
const report = readFixtureJson('deep-analyzer-evidence/report.json');
const verifier = readFixtureJson('deep-analyzer-evidence/verifier-result.json');
const playbook = read('examples/evaluator-rag-prototype/deep-analyzer-evidence/candidate-playbook.md');
assert.strictEqual(scenario.scenario_id, 'deep-analyzer-evidence');
assert.strictEqual(trace.scenario_id, scenario.scenario_id);
assert.strictEqual(report.scenario_id, scenario.scenario_id);
assert.strictEqual(verifier.scenario_id, scenario.scenario_id);
assert.strictEqual(trace.read_only, true);
assert.strictEqual(report.read_only, true);
assert.strictEqual(verifier.read_only, true);
for (const blocked of [
'promoting repository, commit, architecture, or deep-analysis changes without analyzer corpus evidence',
'suppressing the Deep Analyzer Evidence risk bucket without co-located corpus, snapshot, fixture, or benchmark evidence',
'changing analyzer thresholds or classifications without expected-output comparison',
'posting PR comments, check runs, or Linear sync updates from this read-only evaluator run'
]) {
assert.ok(scenario.forbidden_actions.includes(blocked), `Missing deep-analyzer forbidden action: ${blocked}`);
}
for (const required of [
'changed analyzer surface is named',
'maintained corpus or reference-set path is included',
'expected analyzer outputs are compared',
'representative repository shape or commit history is described',
'regression command is named'
]) {
assert.ok(scenario.acceptance_gates.includes(required), `Missing deep-analyzer acceptance gate: ${required}`);
}
const accepted = verifier.candidates.find(candidate => candidate.candidate_id === 'corpus-backed-analyzer-change');
const rejected = verifier.candidates.find(candidate => candidate.candidate_id === 'threshold-only-analyzer-rewrite');
assert.ok(accepted, 'Missing accepted deep-analyzer candidate');
assert.ok(rejected, 'Missing rejected threshold-only analyzer candidate');
assert.strictEqual(accepted.decision, 'accepted');
assert.strictEqual(rejected.decision, 'rejected');
assert.strictEqual(verifier.promoted_candidate_id, accepted.candidate_id);
assert.ok(rejected.reasons.join('\n').includes('does not compare expected outputs'));
assert.ok(playbook.includes('../ECC-Tools/src/analyzers/fixtures/deep-analyzer-corpus.ts'));
assert.ok(playbook.includes('npm test -- src/analyzers/deep-analyzer-corpus.test.ts src/lib/analyzer.compare.test.ts'));
assert.ok(playbook.includes('Deep Analyzer Evidence'));
});
if (failed > 0) {
console.log(`\nFailed: ${failed}`);
process.exit(1);
}
console.log(`\nPassed: ${passed}`);

View File

@@ -71,9 +71,16 @@ function buildExpectedPublishPaths(repoRoot) {
"agent.yaml",
"VERSION",
]
const exclusionPaths = [
"!**/__pycache__/**",
"!**/*.pyc",
"!**/*.pyo",
"!**/*.pyd",
"!**/.pytest_cache/**",
]
const combined = new Set(
[...modules.flatMap((module) => module.paths || []), ...extraPaths].map(normalizePublishPath)
[...modules.flatMap((module) => module.paths || []), ...extraPaths, ...exclusionPaths].map(normalizePublishPath)
)
return [...combined]
@@ -139,6 +146,17 @@ function main() {
`npm pack should not include ${excludedPath}`
)
}
for (const packagedPath of packagedPaths) {
assert.ok(
!packagedPath.includes("__pycache__/"),
`npm pack should not include Python bytecode cache path ${packagedPath}`
)
assert.ok(
!/\.py[cod]$/.test(packagedPath),
`npm pack should not include Python bytecode file ${packagedPath}`
)
}
}],
]