Files
everything-claude-code/skills/skill-comply/tests/test_grader.py
Shimo a2e465c74d feat(skills): add skill-comply — automated behavioral compliance measurement (#724)
* feat(skills): add skill-comply — automated behavioral compliance measurement

Automated compliance measurement for skills, rules, and agent definitions.
Generates behavioral specs, runs scenarios at 3 strictness levels,
classifies tool calls via LLM, and produces self-contained reports.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix(skill-comply): address bot review feedback

- AGENTS.md: fix stale skill count (115 → 117) in project structure
- run.py: replace remaining print() with logger, add zero-division guard,
  create parent dirs for --output path
- runner.py: add returncode check for claude subprocess, clarify
  relative_to path traversal validation
- parser.py: use is_file() instead of exists(), catch KeyError for
  missing trace fields, add file check in parse_spec
- classifier.py: log warnings on malformed classification output,
  guard against non-dict JSON responses
- grader.py: filter negative indices from LLM classification

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-22 21:51:49 -07:00

138 lines
5.9 KiB
Python

"""Tests for grader module — compliance scoring with LLM classification."""
from pathlib import Path
from unittest.mock import patch
import pytest
from scripts.grader import ComplianceResult, StepResult, grade
from scripts.parser import parse_spec, parse_trace
# Fixture files live in a "fixtures" directory that sits next to this test
# module's own directory (one level above the directory containing this file).
FIXTURES = Path(__file__).parent.parent.joinpath("fixtures")
@pytest.fixture
def tdd_spec():
    """Provide the spec obtained by parsing the ``tdd_spec.yaml`` fixture file."""
    spec_file = FIXTURES / "tdd_spec.yaml"
    return parse_spec(spec_file)
@pytest.fixture
def compliant_trace():
    """Provide the trace obtained by parsing ``compliant_trace.jsonl``."""
    trace_file = FIXTURES / "compliant_trace.jsonl"
    return parse_trace(trace_file)
@pytest.fixture
def noncompliant_trace():
    """Provide the trace obtained by parsing ``noncompliant_trace.jsonl``."""
    trace_file = FIXTURES / "noncompliant_trace.jsonl"
    return parse_trace(trace_file)
def _mock_compliant_classification(spec, trace, model="haiku"): # noqa: ARG001
"""Simulate LLM correctly classifying a compliant trace."""
return {
"write_test": [0],
"run_test_red": [1],
"write_impl": [2],
"run_test_green": [3],
"refactor": [4],
}
def _mock_noncompliant_classification(spec, trace, model="haiku"):
"""Simulate LLM classifying a noncompliant trace (impl before test)."""
return {
"write_impl": [0], # src/fib.py written first
"write_test": [1], # test written second
"run_test_green": [2], # only a passing test run
}
def _mock_empty_classification(spec, trace, model="haiku"):
return {}
class TestGradeCompliant:
    """Grade a compliant trace with a canned, fully compliant classification.

    Every test patches ``scripts.grader.classify_events`` with
    ``_mock_compliant_classification`` so the classification is fixed and
    deterministic.
    """

    @patch("scripts.grader.classify_events", side_effect=_mock_compliant_classification)
    def test_returns_compliance_result(self, mock_cls, tdd_spec, compliant_trace) -> None:
        """``grade`` returns a ``ComplianceResult`` instance."""
        result = grade(tdd_spec, compliant_trace)
        assert isinstance(result, ComplianceResult)

    @patch("scripts.grader.classify_events", side_effect=_mock_compliant_classification)
    def test_full_compliance(self, mock_cls, tdd_spec, compliant_trace) -> None:
        """A compliant classification yields a compliance rate of exactly 1.0."""
        result = grade(tdd_spec, compliant_trace)
        assert result.compliance_rate == 1.0

    @patch("scripts.grader.classify_events", side_effect=_mock_compliant_classification)
    def test_all_required_steps_detected(self, mock_cls, tdd_spec, compliant_trace) -> None:
        """Each of the four required step ids is reported and marked detected."""
        result = grade(tdd_spec, compliant_trace)
        required_ids = {"write_test", "run_test_red", "write_impl", "run_test_green"}
        required_results = [s for s in result.steps if s.step_id in required_ids]
        # all() over an empty (or partial) list would pass vacuously, so first
        # make sure every required step actually appears in the result.
        assert {s.step_id for s in required_results} == required_ids
        assert all(s.detected for s in required_results)

    @patch("scripts.grader.classify_events", side_effect=_mock_compliant_classification)
    def test_optional_step_detected(self, mock_cls, tdd_spec, compliant_trace) -> None:
        """The ``refactor`` step is marked detected when it is classified."""
        result = grade(tdd_spec, compliant_trace)
        refactor = next(s for s in result.steps if s.step_id == "refactor")
        assert refactor.detected is True

    @patch("scripts.grader.classify_events", side_effect=_mock_compliant_classification)
    def test_no_hook_promotion_recommended(self, mock_cls, tdd_spec, compliant_trace) -> None:
        """Hook promotion is not recommended for a compliant run."""
        result = grade(tdd_spec, compliant_trace)
        assert result.recommend_hook_promotion is False

    @patch("scripts.grader.classify_events", side_effect=_mock_compliant_classification)
    def test_step_evidence_not_empty(self, mock_cls, tdd_spec, compliant_trace) -> None:
        """Every detected step carries at least one piece of evidence."""
        result = grade(tdd_spec, compliant_trace)
        detected_steps = [step for step in result.steps if step.detected]
        # Without this guard the loop below would assert nothing if no step
        # were detected at all.
        assert detected_steps
        for step in detected_steps:
            assert len(step.evidence) > 0
class TestGradeNoncompliant:
    """Grade a noncompliant trace with a canned impl-before-test classification.

    Every test patches ``scripts.grader.classify_events`` with
    ``_mock_noncompliant_classification``, which reports ``write_impl`` before
    ``write_test`` and has no ``run_test_red`` entry.
    """

    @patch("scripts.grader.classify_events", side_effect=_mock_noncompliant_classification)
    def test_low_compliance(self, mock_cls, tdd_spec, noncompliant_trace) -> None:
        """The compliance rate is strictly below 1.0."""
        result = grade(tdd_spec, noncompliant_trace)
        assert result.compliance_rate < 1.0

    @patch("scripts.grader.classify_events", side_effect=_mock_noncompliant_classification)
    def test_write_test_fails_ordering(self, mock_cls, tdd_spec, noncompliant_trace) -> None:
        """write_test has before_step=write_impl, but test is written AFTER impl."""
        result = grade(tdd_spec, noncompliant_trace)
        write_test = next(s for s in result.steps if s.step_id == "write_test")
        assert write_test.detected is False

    @patch("scripts.grader.classify_events", side_effect=_mock_noncompliant_classification)
    def test_run_test_red_not_detected(self, mock_cls, tdd_spec, noncompliant_trace) -> None:
        """``run_test_red`` is absent from the classification, so it is not detected."""
        result = grade(tdd_spec, noncompliant_trace)
        run_red = next(s for s in result.steps if s.step_id == "run_test_red")
        assert run_red.detected is False

    @patch("scripts.grader.classify_events", side_effect=_mock_noncompliant_classification)
    def test_hook_promotion_recommended(self, mock_cls, tdd_spec, noncompliant_trace) -> None:
        """Hook promotion is recommended for a noncompliant run."""
        result = grade(tdd_spec, noncompliant_trace)
        assert result.recommend_hook_promotion is True

    @patch("scripts.grader.classify_events", side_effect=_mock_noncompliant_classification)
    def test_failure_reasons_present(self, mock_cls, tdd_spec, noncompliant_trace) -> None:
        """Every undetected step other than ``refactor`` explains why it failed."""
        result = grade(tdd_spec, noncompliant_trace)
        # "refactor" is excluded here -- presumably because it is the optional
        # step and its absence is not a failure; confirm against the spec.
        failed_steps = [s for s in result.steps if not s.detected and s.step_id != "refactor"]
        # The sibling tests establish that write_test and run_test_red are
        # undetected, so an empty list means the loop below checks nothing.
        assert failed_steps
        for step in failed_steps:
            assert step.failure_reason is not None
class TestGradeEdgeCases:
    """Edge cases for ``grade``: empty input, optional steps, result metadata.

    ``scripts.grader.classify_events`` is patched in every test so the
    classification is fixed and deterministic.
    """

    @patch("scripts.grader.classify_events", side_effect=_mock_empty_classification)
    def test_empty_trace(self, mock_cls, tdd_spec) -> None:
        """An empty trace with no classified steps scores 0.0 and recommends a hook."""
        result = grade(tdd_spec, [])
        assert result.compliance_rate == 0.0
        assert result.recommend_hook_promotion is True

    @patch("scripts.grader.classify_events")
    def test_compliance_rate_is_ratio_of_required_only(self, mock_cls, tdd_spec, compliant_trace) -> None:
        """Missing the optional ``refactor`` step does not lower the rate.

        With every step detected, a required-only ratio and an all-steps ratio
        both equal 1.0, so that scenario cannot tell them apart. Dropping the
        optional step from the classification makes the two differ.
        """
        mock_cls.return_value = {
            step_id: indices
            for step_id, indices in _mock_compliant_classification(tdd_spec, compliant_trace).items()
            if step_id != "refactor"
        }
        result = grade(tdd_spec, compliant_trace)
        refactor = next(s for s in result.steps if s.step_id == "refactor")
        # Confirm the scenario is really exercised before checking the rate.
        assert refactor.detected is False
        assert result.compliance_rate == 1.0

    @patch("scripts.grader.classify_events", side_effect=_mock_compliant_classification)
    def test_spec_id_in_result(self, mock_cls, tdd_spec, compliant_trace) -> None:
        """The result carries the spec id ``"tdd-workflow"``."""
        result = grade(tdd_spec, compliant_trace)
        assert result.spec_id == "tdd-workflow"