Files
Shimo a2e465c74d feat(skills): add skill-comply — automated behavioral compliance measurement (#724)
* feat(skills): add skill-comply — automated behavioral compliance measurement

Automated compliance measurement for skills, rules, and agent definitions.
Generates behavioral specs, runs scenarios at 3 strictness levels,
classifies tool calls via LLM, and produces self-contained reports.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix(skill-comply): address bot review feedback

- AGENTS.md: fix stale skill count (115 → 117) in project structure
- run.py: replace remaining print() with logger, add zero-division guard,
  create parent dirs for --output path
- runner.py: add returncode check for claude subprocess, clarify
  relative_to path traversal validation
- parser.py: use is_file() instead of exists(), catch KeyError for
  missing trace fields, add file check in parse_spec
- classifier.py: log warnings on malformed classification output,
  guard against non-dict JSON responses
- grader.py: filter negative indices from LLM classification

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-22 21:51:49 -07:00

128 lines
4.2 KiB
Python

"""CLI entry point for skill-comply."""
from __future__ import annotations
import argparse
import logging
import sys
from pathlib import Path
from typing import Any
import yaml
from scripts.grader import grade
from scripts.report import generate_report
from scripts.runner import run_scenario
from scripts.scenario_generator import generate_scenarios
from scripts.spec_generator import generate_spec
logger = logging.getLogger(__name__)
def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line argument parser for skill-comply."""
    parser = argparse.ArgumentParser(
        description="skill-comply: Measure skill compliance rates",
    )
    parser.add_argument(
        "skill",
        type=Path,
        help="Path to skill/rule file to test",
    )
    parser.add_argument(
        "--model",
        default="sonnet",
        help="Model for scenario execution (default: sonnet)",
    )
    parser.add_argument(
        "--gen-model",
        default="haiku",
        help="Model for spec/scenario generation (default: haiku)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Generate spec and scenarios without executing",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=None,
        help="Output report path (default: results/<skill-name>.md)",
    )
    return parser


def main() -> None:
    """Run the skill-comply pipeline: spec -> scenarios -> execution -> report.

    Steps:
        1. Generate a compliance spec from the skill file.
        2. Generate scenarios at three prompt-strictness levels.
        3. Execute each scenario and grade the observations against the spec.
        4. Write a Markdown report and log an overall compliance summary.

    Exits with status 1 if the skill file does not exist. With ``--dry-run``,
    stops after step 2 and prints the spec and scenarios instead.
    """
    logging.basicConfig(level=logging.INFO, format="%(message)s")
    args = _build_parser().parse_args()

    if not args.skill.is_file():
        logger.error("Error: Skill file not found: %s", args.skill)
        sys.exit(1)

    # Default report location: <repo>/results next to this script's package.
    results_dir = Path(__file__).parent.parent / "results"
    results_dir.mkdir(exist_ok=True)

    # Step 1: Generate compliance spec
    logger.info("[1/4] Generating compliance spec from %s...", args.skill.name)
    spec = generate_spec(args.skill, model=args.gen_model)
    logger.info(" %d steps extracted", len(spec.steps))

    # Step 2: Generate scenarios. The spec is serialized to YAML so the
    # generation model receives a compact, structured view of the steps.
    spec_yaml = yaml.dump({
        "steps": [
            {"id": s.id, "description": s.description, "required": s.required}
            for s in spec.steps
        ]
    })
    logger.info("[2/4] Generating scenarios (3 prompt strictness levels)...")
    scenarios = generate_scenarios(args.skill, spec_yaml, model=args.gen_model)
    logger.info(" %d scenarios generated", len(scenarios))
    for s in scenarios:
        logger.info(" - %s: %s", s.level_name, s.description[:60])

    if args.dry_run:
        logger.info("\n[dry-run] Spec and scenarios generated. Skipping execution.")
        logger.info("\nSpec: %s (%d steps)", spec.id, len(spec.steps))
        for step in spec.steps:
            marker = "*" if step.required else " "
            logger.info(" [%s] %s: %s", marker, step.id, step.description)
        return

    # Step 3: Execute scenarios and grade each run against the spec.
    logger.info("[3/4] Executing scenarios (model=%s)...", args.model)
    graded_results: list[tuple[str, Any, list[Any]]] = []
    for scenario in scenarios:
        logger.info(" Running %s...", scenario.level_name)
        run = run_scenario(scenario, model=args.model)
        result = grade(spec, list(run.observations))
        graded_results.append((scenario.level_name, result, list(run.observations)))
        logger.info(" %s: %.0f%%", scenario.level_name, result.compliance_rate * 100)

    # Step 4: Generate report. SKILL.md files are named after their parent
    # directory; any other file is named after its stem.
    skill_name = args.skill.parent.name if args.skill.stem == "SKILL" else args.skill.stem
    output_path = args.output or results_dir / f"{skill_name}.md"
    logger.info("[4/4] Generating report...")
    report = generate_report(args.skill, spec, graded_results, scenarios=scenarios)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Pin the encoding so reports are UTF-8 regardless of the host locale.
    output_path.write_text(report, encoding="utf-8")
    logger.info(" Report saved to %s", output_path)

    # Summary — guard against zero executed scenarios to avoid ZeroDivisionError.
    if not graded_results:
        logger.warning("No scenarios were executed.")
        return
    overall = sum(r.compliance_rate for _, r, _obs in graded_results) / len(graded_results)
    logger.info("\n%s", "=" * 50)
    logger.info("Overall Compliance: %.0f%%", overall * 100)
    if overall < spec.threshold_promote_to_hook:
        logger.info(
            "Recommendation: Some steps have low compliance. "
            "Consider promoting them to hooks. See the report for details."
        )
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()