feat(skills): add skill-comply — automated behavioral compliance measurement (#724)

* feat(skills): add skill-comply — automated behavioral compliance measurement Automated compliance measurement for skills, rules, and agent definitions. Generates behavioral specs, runs scenarios at 3 strictness levels, classifies tool calls via LLM, and produces self-contained reports. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix(skill-comply): address bot review feedback - AGENTS.md: fix stale skill count (115 → 117) in project structure - run.py: replace remaining print() with logger, add zero-division guard, create parent dirs for --output path - runner.py: add returncode check for claude subprocess, clarify relative_to path traversal validation - parser.py: use is_file() instead of exists(), catch KeyError for missing trace fields, add file check in parse_spec - classifier.py: log warnings on malformed classification output, guard against non-dict JSON responses - grader.py: filter negative indices from LLM classification Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-19 23:33:07 +08:00 · 2026-03-23 13:51:49 +09:00
parent 0f22cb4450
commit a2e465c74d
23 changed files with 1418 additions and 4 deletions
--- a/skills/skill-comply/scripts/parser.py
+++ b/skills/skill-comply/scripts/parser.py
@@ -0,0 +1,107 @@
+"""Parse observation traces (JSONL) and compliance specs (YAML)."""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from pathlib import Path
+
+import yaml
+
+
+@dataclass(frozen=True)
+class ObservationEvent:
+    timestamp: str
+    event: str
+    tool: str
+    session: str
+    input: str
+    output: str
+
+
+@dataclass(frozen=True)
+class Detector:
+    description: str
+    after_step: str | None = None
+    before_step: str | None = None
+
+
+@dataclass(frozen=True)
+class Step:
+    id: str
+    description: str
+    required: bool
+    detector: Detector
+
+
+@dataclass(frozen=True)
+class ComplianceSpec:
+    id: str
+    name: str
+    source_rule: str
+    version: str
+    steps: tuple[Step, ...]
+    threshold_promote_to_hook: float
+
+
+def parse_trace(path: Path) -> list[ObservationEvent]:
+    """Parse a JSONL observation trace file into sorted events."""
+    if not path.is_file():
+        raise FileNotFoundError(f"Trace file not found: {path}")
+
+    text = path.read_text().strip()
+    if not text:
+        return []
+
+    events: list[ObservationEvent] = []
+    for i, line in enumerate(text.splitlines(), 1):
+        try:
+            raw = json.loads(line)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON at line {i}: {e}") from e
+        try:
+            events.append(ObservationEvent(
+                timestamp=raw["timestamp"],
+                event=raw["event"],
+                tool=raw["tool"],
+                session=raw["session"],
+                input=raw.get("input", ""),
+                output=raw.get("output", ""),
+            ))
+        except KeyError as e:
+            raise ValueError(f"Missing required field {e} at line {i}") from e
+
+    return sorted(events, key=lambda e: e.timestamp)
+
+
+def parse_spec(path: Path) -> ComplianceSpec:
+    """Parse a YAML compliance spec file."""
+    if not path.is_file():
+        raise FileNotFoundError(f"Spec file not found: {path}")
+    raw = yaml.safe_load(path.read_text())
+
+    steps: list[Step] = []
+    for s in raw["steps"]:
+        d = s["detector"]
+        steps.append(Step(
+            id=s["id"],
+            description=s["description"],
+            required=s["required"],
+            detector=Detector(
+                description=d["description"],
+                after_step=d.get("after_step"),
+                before_step=d.get("before_step"),
+            ),
+        ))
+
+    if "scoring" not in raw:
+        raise KeyError("Missing 'scoring' section in compliance spec")
+
+    return ComplianceSpec(
+        id=raw["id"],
+        name=raw["name"],
+        source_rule=raw["source_rule"],
+        version=raw["version"],
+        steps=tuple(steps),
+        threshold_promote_to_hook=raw["scoring"]["threshold_promote_to_hook"],
+    )