mirror of
https://github.com/affaan-m/everything-claude-code.git
synced 2026-03-30 13:43:26 +08:00
* feat(skills): add skill-comply — automated behavioral compliance measurement Automated compliance measurement for skills, rules, and agent definitions. Generates behavioral specs, runs scenarios at 3 strictness levels, classifies tool calls via LLM, and produces self-contained reports. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix(skill-comply): address bot review feedback - AGENTS.md: fix stale skill count (115 → 117) in project structure - run.py: replace remaining print() with logger, add zero-division guard, create parent dirs for --output path - runner.py: add returncode check for claude subprocess, clarify relative_to path traversal validation - parser.py: use is_file() instead of exists(), catch KeyError for missing trace fields, add file check in parse_spec - classifier.py: log warnings on malformed classification output, guard against non-dict JSON responses - grader.py: filter negative indices from LLM classification Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
171 lines
6.3 KiB
Python
171 lines
6.3 KiB
Python
"""Generate Markdown compliance reports."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
from scripts.grader import ComplianceResult
|
|
from scripts.parser import ComplianceSpec, ObservationEvent
|
|
from scripts.scenario_generator import Scenario
|
|
|
|
|
|
def generate_report(
    skill_path: Path,
    spec: ComplianceSpec,
    results: list[tuple[str, ComplianceResult, list[ObservationEvent]]],
    scenarios: list[Scenario] | None = None,
) -> str:
    """Generate a Markdown compliance report.

    The report contains, in order: a summary table, the expected behavioral
    sequence from the spec, a per-scenario compliance table, the scenario
    prompts (only when ``scenarios`` is given), hook promotion
    recommendations (only when some required step falls below the spec
    threshold), and a per-scenario detail section with a tool call timeline.

    Args:
        skill_path: Path to the skill file that was tested.
        spec: The compliance spec used for grading.
        results: List of (scenario_level_name, ComplianceResult, observations) tuples.
        scenarios: Original scenario definitions with prompts.

    Returns:
        The complete report as one newline-joined Markdown string.
    """
    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    overall = _overall_compliance(results)
    threshold = spec.threshold_promote_to_hook

    # Built once: every table row below needs to know whether a step id is
    # required, and rescanning spec.steps per row is loop-invariant work.
    required_ids = {sp.id for sp in spec.steps if sp.required}

    lines: list[str] = []
    lines.append(f"# skill-comply Report: {skill_path.name}")
    lines.append(f"Generated: {now}")
    lines.append("")

    # Summary
    lines.append("## Summary")
    lines.append("")
    lines.append("| Metric | Value |")
    lines.append("|--------|-------|")
    lines.append(f"| Skill | `{skill_path}` |")
    lines.append(f"| Spec | {spec.id} |")
    lines.append(f"| Scenarios | {len(results)} |")
    lines.append(f"| Overall Compliance | {overall:.0%} |")
    lines.append(f"| Threshold | {threshold:.0%} |")

    promote_steps = _steps_to_promote(spec, results, threshold)
    if promote_steps:
        step_names = ", ".join(promote_steps)
        lines.append(f"| Recommendation | **Promote {step_names} to hooks** |")
    else:
        lines.append("| Recommendation | All steps above threshold — no hook promotion needed |")
    lines.append("")

    # Expected Behavioral Sequence
    lines.append("## Expected Behavioral Sequence")
    lines.append("")
    lines.append("| # | Step | Required | Description |")
    lines.append("|---|------|----------|-------------|")
    for i, step in enumerate(spec.steps, 1):
        req = "Yes" if step.required else "No"
        # Descriptions are free text; an unescaped pipe or newline would
        # split the row and corrupt the table.
        description = _md_table_cell(step.detector.description)
        lines.append(f"| {i} | {step.id} | {req} | {description} |")
    lines.append("")

    # Scenario Results
    lines.append("## Scenario Results")
    lines.append("")
    lines.append("| Scenario | Compliance | Failed Steps |")
    lines.append("|----------|-----------|----------------|")
    for level_name, result, _obs in results:
        # Only required steps count as failures in this overview.
        failed = [
            s.step_id
            for s in result.steps
            if not s.detected and s.step_id in required_ids
        ]
        failed_str = ", ".join(failed) if failed else "—"
        lines.append(f"| {level_name} | {result.compliance_rate:.0%} | {failed_str} |")
    lines.append("")

    # Scenario Prompts
    if scenarios:
        lines.append("## Scenario Prompts")
        lines.append("")
        for s in scenarios:
            lines.append(f"### {s.level_name} (Level {s.level})")
            lines.append("")
            # Render the prompt as a Markdown block quote, line by line.
            for prompt_line in s.prompt.splitlines():
                lines.append(f"> {prompt_line}")
            lines.append("")

    # Hook Promotion Recommendations (optional/advanced)
    if promote_steps:
        lines.append("## Advanced: Hook Promotion Recommendations (optional)")
        lines.append("")
        for step_id in promote_steps:
            rate = _step_compliance_rate(step_id, results)
            step = next(s for s in spec.steps if s.id == step_id)
            lines.append(
                f"- **{step_id}** (compliance {rate:.0%}): {step.description}"
            )
        lines.append("")

    # Per-scenario details with timeline
    lines.append("## Detail")
    lines.append("")
    for level_name, result, observations in results:
        lines.append(f"### {level_name} (Compliance: {result.compliance_rate:.0%})")
        lines.append("")
        lines.append("| Step | Required | Detected | Reason |")
        lines.append("|------|----------|----------|--------|")
        for sr in result.steps:
            req = "Yes" if sr.step_id in required_ids else "No"
            det = "YES" if sr.detected else "NO"
            # Failure reasons are free text, so escape them like the
            # timeline cells below.
            reason = _md_table_cell(sr.failure_reason or "—")
            lines.append(f"| {sr.step_id} | {req} | {det} | {reason} |")
        lines.append("")

        # Timeline: show what the agent actually did
        if observations:
            # Build reverse index: event_index → step_id
            # (if several steps claim the same event, the last one wins).
            index_to_step: dict[int, str] = {}
            for step_id, indices in result.classification.items():
                for idx in indices:
                    index_to_step[idx] = step_id

            lines.append(f"**Tool Call Timeline ({len(observations)} calls)**")
            lines.append("")
            lines.append("| # | Tool | Input | Output | Classified As |")
            lines.append("|---|------|-------|--------|------|")
            for i, obs in enumerate(observations):
                step_label = index_to_step.get(i, "—")
                # Truncate first, then escape, so the cut never lands
                # between a backslash and the pipe it escapes.
                input_summary = _md_table_cell(obs.input[:100])
                output_summary = _md_table_cell(obs.output[:50])
                lines.append(
                    f"| {i} | {obs.tool} | {input_summary} | {output_summary} | {step_label} |"
                )
            lines.append("")

    return "\n".join(lines)


def _md_table_cell(text: object) -> str:
    """Return *text* made safe for use inside one Markdown table cell.

    Pipes are backslash-escaped and newlines become spaces, because either
    one would otherwise end the cell or the row early.
    """
    return str(text).replace("|", "\\|").replace("\n", " ")
|
|
|
|
|
|
def _overall_compliance(results: list[tuple[str, ComplianceResult, list[ObservationEvent]]]) -> float:
    """Return the mean ``compliance_rate`` over all scenario results.

    An empty ``results`` list yields ``0.0`` rather than dividing by zero.
    """
    if results:
        rates = [outcome.compliance_rate for _level, outcome, _events in results]
        return sum(rates) / len(rates)
    return 0.0
|
|
|
|
|
|
def _step_compliance_rate(
    step_id: str,
    results: list[tuple[str, ComplianceResult, list[ObservationEvent]]],
) -> float:
    """Return how often ``step_id`` was detected, per scenario result.

    Counts every step entry across all results whose ``step_id`` matches and
    whose ``detected`` flag is truthy, then divides by the number of results.
    Returns ``0.0`` when ``results`` is empty.
    """
    if not results:
        return 0.0

    hits = 0
    for _level, outcome, _events in results:
        for step_result in outcome.steps:
            if step_result.step_id == step_id and step_result.detected:
                hits += 1
    return hits / len(results)
|
|
|
|
|
|
def _steps_to_promote(
    spec: ComplianceSpec,
    results: list[tuple[str, ComplianceResult, list[ObservationEvent]]],
    threshold: float,
) -> list[str]:
    """Return ids of required steps whose compliance rate is below ``threshold``.

    Non-required steps are never returned. The comparison is strict, so a
    step sitting exactly at the threshold is not included. Ids come back in
    the order the steps appear in ``spec.steps``.
    """
    return [
        candidate.id
        for candidate in spec.steps
        if candidate.required
        and _step_compliance_rate(candidate.id, results) < threshold
    ]
|