mirror of
https://github.com/affaan-m/everything-claude-code.git
synced 2026-04-09 10:53:34 +08:00
feat(skills): add skill-comply — automated behavioral compliance measurement (#724)
* feat(skills): add skill-comply — automated behavioral compliance measurement Automated compliance measurement for skills, rules, and agent definitions. Generates behavioral specs, runs scenarios at 3 strictness levels, classifies tool calls via LLM, and produces self-contained reports. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix(skill-comply): address bot review feedback - AGENTS.md: fix stale skill count (115 → 117) in project structure - run.py: replace remaining print() with logger, add zero-division guard, create parent dirs for --output path - runner.py: add returncode check for claude subprocess, clarify relative_to path traversal validation - parser.py: use is_file() instead of exists(), catch KeyError for missing trace fields, add file check in parse_spec - classifier.py: log warnings on malformed classification output, guard against non-dict JSON responses - grader.py: filter negative indices from LLM classification Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
70
skills/skill-comply/scripts/scenario_generator.py
Normal file
70
skills/skill-comply/scripts/scenario_generator.py
Normal file
@@ -0,0 +1,70 @@
|
||||
"""Generate pressure scenarios from skill + spec using LLM."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
from scripts.utils import extract_yaml
|
||||
|
||||
# Directory containing the LLM prompt templates, resolved relative to this
# file: scripts/ -> skill-comply/ -> prompts/.
PROMPTS_DIR = Path(__file__).parent.parent / "prompts"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Scenario:
    """One pressure scenario produced by the LLM scenario generator.

    Instances are immutable and built from the generator's YAML output in
    :func:`generate_scenarios`.
    """

    # Unique scenario identifier from the generator's YAML.
    id: str
    # Numeric strictness level; scenarios are returned sorted ascending by it.
    level: int
    # Human-readable name for the strictness level.
    level_name: str
    # Short description of what the scenario exercises.
    description: str
    # Prompt text sent to the agent (stripped of surrounding whitespace).
    prompt: str
    # Shell commands to run before the scenario; may be empty.
    setup_commands: tuple[str, ...]
|
||||
|
||||
|
||||
def generate_scenarios(
    skill_path: Path,
    spec_yaml: str,
    model: str = "haiku",
) -> list[Scenario]:
    """Generate 3 scenarios with decreasing prompt strictness.

    Calls ``claude -p`` with the scenario_generator prompt and parses the
    YAML the model emits.

    Args:
        skill_path: Path to the skill file whose content is embedded in
            the prompt.
        spec_yaml: Behavioral spec YAML embedded in the prompt.
        model: Model alias passed to ``claude --model``.

    Returns:
        Scenarios sorted ascending by strictness level.

    Raises:
        RuntimeError: If the claude CLI exits non-zero, returns empty
            output, or emits YAML without a valid ``scenarios`` list.
        subprocess.TimeoutExpired: If the CLI exceeds the 120s timeout.
    """
    # Explicit encoding: read_text() without it depends on the locale.
    skill_content = skill_path.read_text(encoding="utf-8")
    prompt_template = (PROMPTS_DIR / "scenario_generator.md").read_text(encoding="utf-8")
    prompt = (
        prompt_template
        .replace("{skill_content}", skill_content)
        .replace("{spec_yaml}", spec_yaml)
    )

    result = subprocess.run(
        ["claude", "-p", prompt, "--model", model, "--output-format", "text"],
        capture_output=True,
        text=True,
        timeout=120,
    )

    if result.returncode != 0:
        raise RuntimeError(f"claude -p failed: {result.stderr}")

    if not result.stdout.strip():
        raise RuntimeError("claude -p returned empty output")

    raw_yaml = extract_yaml(result.stdout)
    parsed = yaml.safe_load(raw_yaml)

    # Guard against malformed LLM output: safe_load can yield None or a
    # non-dict, and the dict may lack a "scenarios" list. Without this
    # check the subscript below raises an opaque TypeError/KeyError.
    if not isinstance(parsed, dict) or not isinstance(parsed.get("scenarios"), list):
        raise RuntimeError("scenario generator output missing 'scenarios' list")

    scenarios: list[Scenario] = []
    for s in parsed["scenarios"]:
        try:
            scenarios.append(Scenario(
                id=s["id"],
                level=s["level"],
                level_name=s["level_name"],
                description=s["description"],
                prompt=s["prompt"].strip(),
                setup_commands=tuple(s.get("setup_commands", [])),
            ))
        except (KeyError, TypeError, AttributeError) as err:
            # Surface which entry was malformed instead of a bare traceback.
            raise RuntimeError(f"malformed scenario entry: {s!r}") from err

    return sorted(scenarios, key=lambda sc: sc.level)
|
||||
Reference in New Issue
Block a user