From 219bd1ff88e860a2c504a98a84d7f748b0dbcd8b Mon Sep 17 00:00:00 2001
From: Tatsuya Shimomoto <shimo4228@gmail.com>
Date: Sat, 14 Feb 2026 12:16:05 +0900
Subject: [PATCH] feat(skills): add cost-aware-llm-pipeline skill

Cost optimization patterns for LLM API usage combining model routing,
budget tracking, retry logic, and prompt caching.
---
 skills/cost-aware-llm-pipeline/SKILL.md | 182 ++++++++++++++++++++++++
 1 file changed, 182 insertions(+)
 create mode 100644 skills/cost-aware-llm-pipeline/SKILL.md

diff --git a/skills/cost-aware-llm-pipeline/SKILL.md b/skills/cost-aware-llm-pipeline/SKILL.md
new file mode 100644
index 00000000..25ec549c
--- /dev/null
+++ b/skills/cost-aware-llm-pipeline/SKILL.md
@@ -0,0 +1,182 @@
+---
+name: cost-aware-llm-pipeline
+description: Cost optimization patterns for LLM API usage — model routing by task complexity, budget tracking, retry logic, and prompt caching.
+---
+
+# Cost-Aware LLM Pipeline
+
+Patterns for controlling LLM API costs while maintaining quality. Combines model routing, budget tracking, retry logic, and prompt caching into a composable pipeline.
+
+## When to Activate
+
+- Building applications that call LLM APIs (Claude, GPT, etc.)
+- Processing batches of items with varying complexity
+- Need to stay within a budget for API spend
+- Optimizing cost without sacrificing quality on complex tasks
+
+## Core Concepts
+
+### 1. Model Routing by Task Complexity
+
+Automatically select cheaper models for simple tasks, reserving expensive models for complex ones.
+
+```python
+MODEL_SONNET = "claude-sonnet-4-5-20250929"  # returned for complex tasks
+MODEL_HAIKU = "claude-haiku-4-5-20251001"  # returned for simple tasks
+
+_SONNET_TEXT_THRESHOLD = 10_000  # chars; at or above this, route to Sonnet
+_SONNET_ITEM_THRESHOLD = 30     # items; at or above this, route to Sonnet
+
+def select_model(
+    text_length: int,
+    item_count: int,
+    force_model: str | None = None,
+) -> str:
+    """Select model based on task complexity."""
+    if force_model is not None:
+        return force_model
+    if text_length >= _SONNET_TEXT_THRESHOLD or item_count >= _SONNET_ITEM_THRESHOLD:
+        return MODEL_SONNET  # Complex task
+    return MODEL_HAIKU  # Simple task (3-4x cheaper)
+```
+
+### 2. Immutable Cost Tracking
+
+Track cumulative spend with frozen dataclasses. Recording a call returns a new tracker — existing trackers are never mutated.
+
+```python
+from dataclasses import dataclass
+
+@dataclass(frozen=True, slots=True)  # immutable; one record per API call
+class CostRecord:
+    model: str  # model name the call was made with
+    input_tokens: int
+    output_tokens: int
+    cost_usd: float  # summed by CostTracker.total_cost
+
+@dataclass(frozen=True, slots=True)
+class CostTracker:
+    budget_limit: float = 1.00
+    records: tuple[CostRecord, ...] = ()
+
+    def add(self, record: CostRecord) -> "CostTracker":
+        """Return new tracker with added record (never mutates self)."""
+        return CostTracker(
+            budget_limit=self.budget_limit,
+            records=(*self.records, record),
+        )
+
+    @property
+    def total_cost(self) -> float:
+        return sum(r.cost_usd for r in self.records)
+
+    @property
+    def over_budget(self) -> bool:
+        return self.total_cost > self.budget_limit
+```
+
+### 3. Narrow Retry Logic
+
+Retry only on transient errors. Fail fast on authentication or bad request errors.
+
+```python
+from anthropic import (
+    APIConnectionError,
+    InternalServerError,
+    RateLimitError,
+)
+
+_RETRYABLE_ERRORS = (APIConnectionError, RateLimitError, InternalServerError)
+_MAX_RETRIES = 3  # total attempts (first call included), not extra retries
+
+def call_with_retry(func, *, max_retries: int = _MAX_RETRIES):
+    """Retry only on transient errors, fail fast on others."""
+    for attempt in range(max_retries):
+        try:
+            return func()
+        except _RETRYABLE_ERRORS:
+            if attempt == max_retries - 1:
+                raise
+            time.sleep(2 ** attempt)  # Exponential backoff
+    # AuthenticationError, BadRequestError etc. → raise immediately
+```
+
+### 4. Prompt Caching
+
+Mark long, stable system prompts as cacheable so repeated requests reuse the cached prefix instead of reprocessing it at full input price.
+
+```python
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "text",
+                "text": system_prompt,  # Stable part (same on every request)
+                "cache_control": {"type": "ephemeral"},  # Cache this
+            },
+            {
+                "type": "text",
+                "text": user_input,  # Variable part, placed after the cached block
+            },
+        ],
+    }
+]
+```
+
+## Composition
+
+Combine all four techniques in a single pipeline function:
+
+```python
+def process(text: str, config: Config, tracker: CostTracker) -> tuple[Result, CostTracker]:
+    # 1. Route model
+    model = select_model(len(text), estimated_items, config.force_model)
+
+    # 2. Check budget
+    if tracker.over_budget:
+        raise BudgetExceededError(tracker.total_cost, tracker.budget_limit)
+
+    # 3. Call with retry + caching
+    response = call_with_retry(lambda: client.messages.create(
+        model=model,
+        messages=build_cached_messages(system_prompt, text),
+    ))
+
+    # 4. Track cost (immutable)
+    record = CostRecord(model=model, input_tokens=..., output_tokens=..., cost_usd=...)
+    tracker = tracker.add(record)
+
+    return parse_result(response), tracker
+```
+
+## Pricing Reference (2025-2026)
+
+| Model | Input ($/1M tokens) | Output ($/1M tokens) | Relative Cost |
+|-------|---------------------|----------------------|---------------|
+| Haiku 4.5 | $1.00 | $5.00 | 1x |
+| Sonnet 4.5 | $3.00 | $15.00 | 3x |
+| Opus 4.5 | $5.00 | $25.00 | 5x |
+
+## Best Practices
+
+- **Start with the cheapest model** and only route to expensive models when complexity thresholds are met
+- **Set explicit budget limits** before processing batches — fail early rather than overspend
+- **Log model selection decisions** so you can tune thresholds based on real data
+- **Use prompt caching** for long system prompts (the minimum cacheable length is model-dependent, 1024 tokens or more) — saves both cost and latency
+- **Never retry on authentication or validation errors** — only transient failures (network, rate limit, server error)
+
+## Anti-Patterns to Avoid
+
+- Using the most expensive model for all requests regardless of complexity
+- Retrying on all errors (wastes budget on permanent failures)
+- Mutating cost tracking state (makes debugging and auditing difficult)
+- Hardcoding model names throughout the codebase (use constants or config)
+- Ignoring prompt caching for repetitive system prompts
+
+## When to Use
+
+- Any application calling Claude, OpenAI, or similar LLM APIs
+- Batch processing pipelines where cost adds up quickly
+- Multi-model architectures that need intelligent routing
+- Production systems that need budget guardrails