Files
everything-claude-code/skills/skill-comply/scripts/report.py
Shimo a2e465c74d feat(skills): add skill-comply — automated behavioral compliance measurement (#724)
* feat(skills): add skill-comply — automated behavioral compliance measurement

Automated compliance measurement for skills, rules, and agent definitions.
Generates behavioral specs, runs scenarios at 3 strictness levels,
classifies tool calls via LLM, and produces self-contained reports.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix(skill-comply): address bot review feedback

- AGENTS.md: fix stale skill count (115 → 117) in project structure
- run.py: replace remaining print() with logger, add zero-division guard,
  create parent dirs for --output path
- runner.py: add returncode check for claude subprocess, clarify
  relative_to path traversal validation
- parser.py: use is_file() instead of exists(), catch KeyError for
  missing trace fields, add file check in parse_spec
- classifier.py: log warnings on malformed classification output,
  guard against non-dict JSON responses
- grader.py: filter negative indices from LLM classification

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-22 21:51:49 -07:00

171 lines
6.3 KiB
Python

"""Generate Markdown compliance reports."""
from __future__ import annotations
from datetime import datetime, timezone
from pathlib import Path
from scripts.grader import ComplianceResult
from scripts.parser import ComplianceSpec, ObservationEvent
from scripts.scenario_generator import Scenario
def _md_cell(text: object) -> str:
    """Make *text* safe to place inside a single Markdown table cell.

    Pipes are backslash-escaped and newlines are replaced by spaces so the
    value cannot terminate the cell or the row early.
    """
    return str(text).replace("|", "\\|").replace("\n", " ")


def generate_report(
    skill_path: Path,
    spec: ComplianceSpec,
    results: list[tuple[str, ComplianceResult, list[ObservationEvent]]],
    scenarios: list[Scenario] | None = None,
) -> str:
    """Generate a Markdown compliance report.

    Sections are emitted in this order: title with a UTC timestamp, summary
    table, expected behavioral sequence, scenario results, scenario prompts
    (only when ``scenarios`` is given and non-empty), hook promotion
    recommendations (only when at least one step is returned by
    ``_steps_to_promote``), and a per-scenario detail section that includes
    the tool call timeline when observations exist.

    Args:
        skill_path: Path to the skill file that was tested.
        spec: The compliance spec used for grading.
        results: List of (scenario_level_name, ComplianceResult, observations)
            tuples.
        scenarios: Original scenario definitions with prompts.

    Returns:
        The report lines joined with newline characters.
    """
    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    overall = _overall_compliance(results)
    threshold = spec.threshold_promote_to_hook
    promote_steps = _steps_to_promote(spec, results, threshold)
    # Built once so every "is this step required?" check is a set lookup
    # instead of a scan over spec.steps.
    required_ids = {sp.id for sp in spec.steps if sp.required}

    lines: list[str] = [
        f"# skill-comply Report: {skill_path.name}",
        f"Generated: {now}",
        "",
    ]

    # Summary
    lines.extend([
        "## Summary",
        "",
        "| Metric | Value |",
        "|--------|-------|",
        f"| Skill | `{skill_path}` |",
        f"| Spec | {spec.id} |",
        f"| Scenarios | {len(results)} |",
        f"| Overall Compliance | {overall:.0%} |",
        f"| Threshold | {threshold:.0%} |",
    ])
    if promote_steps:
        step_names = ", ".join(promote_steps)
        lines.append(f"| Recommendation | **Promote {step_names} to hooks** |")
    else:
        lines.append(
            "| Recommendation | All steps above threshold — no hook promotion needed |"
        )
    lines.append("")

    # Expected Behavioral Sequence
    lines.extend([
        "## Expected Behavioral Sequence",
        "",
        "| # | Step | Required | Description |",
        "|---|------|----------|-------------|",
    ])
    for i, step in enumerate(spec.steps, 1):
        req = "Yes" if step.required else "No"
        # Descriptions are free text; escape them so they stay in one cell.
        description = _md_cell(step.detector.description)
        lines.append(f"| {i} | {step.id} | {req} | {description} |")
    lines.append("")

    # Scenario Results
    lines.extend([
        "## Scenario Results",
        "",
        "| Scenario | Compliance | Failed Steps |",
        "|----------|-----------|----------------|",
    ])
    for level_name, result, _obs in results:
        # Only required steps that were not detected count as failures here.
        failed = [
            s.step_id
            for s in result.steps
            if not s.detected and s.step_id in required_ids
        ]
        failed_str = ", ".join(failed)
        lines.append(f"| {level_name} | {result.compliance_rate:.0%} | {failed_str} |")
    lines.append("")

    # Scenario Prompts
    if scenarios:
        lines.append("## Scenario Prompts")
        lines.append("")
        for s in scenarios:
            lines.append(f"### {s.level_name} (Level {s.level})")
            lines.append("")
            # Quote every prompt line so multi-line prompts stay in one blockquote.
            for prompt_line in s.prompt.splitlines():
                lines.append(f"> {prompt_line}")
            lines.append("")

    # Hook Promotion Recommendations (optional/advanced)
    if promote_steps:
        lines.append("## Advanced: Hook Promotion Recommendations (optional)")
        lines.append("")
        for step_id in promote_steps:
            rate = _step_compliance_rate(step_id, results)
            step = next(s for s in spec.steps if s.id == step_id)
            lines.append(
                f"- **{step_id}** (compliance {rate:.0%}): {step.description}"
            )
        lines.append("")

    # Per-scenario details with timeline
    lines.append("## Detail")
    lines.append("")
    for level_name, result, observations in results:
        lines.append(f"### {level_name} (Compliance: {result.compliance_rate:.0%})")
        lines.append("")
        lines.append("| Step | Required | Detected | Reason |")
        lines.append("|------|----------|----------|--------|")
        for sr in result.steps:
            req = "Yes" if sr.step_id in required_ids else "No"
            det = "YES" if sr.detected else "NO"
            # Failure reasons are free text and may contain pipes or newlines.
            reason = _md_cell(sr.failure_reason or "")
            lines.append(f"| {sr.step_id} | {req} | {det} | {reason} |")
        lines.append("")

        # Timeline: show what the agent actually did
        if observations:
            # Build reverse index: event_index → step_id
            index_to_step: dict[int, str] = {}
            for step_id, indices in result.classification.items():
                for idx in indices:
                    index_to_step[idx] = step_id

            lines.append(f"**Tool Call Timeline ({len(observations)} calls)**")
            lines.append("")
            lines.append("| # | Tool | Input | Output | Classified As |")
            lines.append("|---|------|-------|--------|------|")
            for i, obs in enumerate(observations):
                step_label = index_to_step.get(i, "")
                # Truncate first (100 / 50 characters), then escape.
                input_summary = _md_cell(obs.input[:100])
                output_summary = _md_cell(obs.output[:50])
                lines.append(
                    f"| {i} | {obs.tool} | {input_summary} | {output_summary} | {step_label} |"
                )
            lines.append("")

    return "\n".join(lines)
def _overall_compliance(results: list[tuple[str, ComplianceResult, list[ObservationEvent]]]) -> float:
    """Return the mean ``compliance_rate`` across all scenario results.

    An empty ``results`` list yields ``0.0`` instead of dividing by zero.
    """
    if not results:
        return 0.0

    total = 0
    for _level, outcome, _events in results:
        total += outcome.compliance_rate
    return total / len(results)
def _step_compliance_rate(
    step_id: str,
    results: list[tuple[str, ComplianceResult, list[ObservationEvent]]],
) -> float:
    """Return the fraction of scenarios in which ``step_id`` was detected.

    Each scenario counts at most once, so the rate stays within ``[0.0, 1.0]``
    even if a result lists the same step more than once.

    Args:
        step_id: Identifier of the step to look up in each result's ``steps``.
        results: List of (scenario_level_name, ComplianceResult, observations)
            tuples.

    Returns:
        Number of scenarios with a detected entry for ``step_id`` divided by
        the number of scenarios, or ``0.0`` when ``results`` is empty.
    """
    if not results:
        return 0.0

    detected_scenarios = sum(
        1
        for _level, result, _obs in results
        # any() collapses duplicate entries for the same step into one hit.
        if any(s.step_id == step_id and s.detected for s in result.steps)
    )
    return detected_scenarios / len(results)
def _steps_to_promote(
    spec: ComplianceSpec,
    results: list[tuple[str, ComplianceResult, list[ObservationEvent]]],
    threshold: float,
) -> list[str]:
    """List the ids of required steps whose compliance rate is below ``threshold``.

    Steps are visited in ``spec.steps`` order; non-required steps are skipped
    without computing a rate.
    """
    return [
        candidate.id
        for candidate in spec.steps
        if candidate.required
        and _step_compliance_rate(candidate.id, results) < threshold
    ]