From 219bd1ff88e860a2c504a98a84d7f748b0dbcd8b Mon Sep 17 00:00:00 2001 From: Tatsuya Shimomoto Date: Sat, 14 Feb 2026 12:16:05 +0900 Subject: [PATCH] feat(skills): add cost-aware-llm-pipeline skill Cost optimization patterns for LLM API usage combining model routing, budget tracking, retry logic, and prompt caching. --- skills/cost-aware-llm-pipeline/SKILL.md | 182 ++++++++++++++++++++++++ 1 file changed, 182 insertions(+) create mode 100644 skills/cost-aware-llm-pipeline/SKILL.md diff --git a/skills/cost-aware-llm-pipeline/SKILL.md b/skills/cost-aware-llm-pipeline/SKILL.md new file mode 100644 index 00000000..25ec549c --- /dev/null +++ b/skills/cost-aware-llm-pipeline/SKILL.md @@ -0,0 +1,182 @@ +--- +name: cost-aware-llm-pipeline +description: Cost optimization patterns for LLM API usage — model routing by task complexity, budget tracking, retry logic, and prompt caching. +--- + +# Cost-Aware LLM Pipeline + +Patterns for controlling LLM API costs while maintaining quality. Combines model routing, budget tracking, retry logic, and prompt caching into a composable pipeline. + +## When to Activate + +- Building applications that call LLM APIs (Claude, GPT, etc.) +- Processing batches of items with varying complexity +- Need to stay within a budget for API spend +- Optimizing cost without sacrificing quality on complex tasks + +## Core Concepts + +### 1. Model Routing by Task Complexity + +Automatically select cheaper models for simple tasks, reserving expensive models for complex ones. 
+ +```python +MODEL_SONNET = "claude-sonnet-4-5-20250929" +MODEL_HAIKU = "claude-haiku-4-5-20251001" + +_SONNET_TEXT_THRESHOLD = 10_000 # chars +_SONNET_ITEM_THRESHOLD = 30 # items + +def select_model( + text_length: int, + item_count: int, + force_model: str | None = None, +) -> str: + """Select model based on task complexity.""" + # An explicit override always wins over threshold-based routing. + if force_model is not None: + return force_model + # Reaching either threshold alone routes to MODEL_SONNET. + if text_length >= _SONNET_TEXT_THRESHOLD or item_count >= _SONNET_ITEM_THRESHOLD: + return MODEL_SONNET # Complex task + return MODEL_HAIKU # Simple task (3-4x cheaper) +``` + +### 2. Immutable Cost Tracking + +Track cumulative spend with frozen dataclasses. Each API call returns a new tracker — never mutates state. + +```python +from dataclasses import dataclass + +@dataclass(frozen=True, slots=True) +class CostRecord: + """Immutable record of one API call's model, token counts, and cost.""" + model: str + input_tokens: int + output_tokens: int + cost_usd: float + +@dataclass(frozen=True, slots=True) +class CostTracker: + """Immutable ledger of cost records checked against a budget limit.""" + budget_limit: float = 1.00 + records: tuple[CostRecord, ...] = () + + def add(self, record: CostRecord) -> "CostTracker": + """Return new tracker with added record (never mutates self).""" + return CostTracker( + budget_limit=self.budget_limit, + records=(*self.records, record), + ) + + @property + def total_cost(self) -> float: + """Sum of cost_usd across all records.""" + return sum(r.cost_usd for r in self.records) + + @property + def over_budget(self) -> bool: + """True only when total_cost strictly exceeds budget_limit.""" + return self.total_cost > self.budget_limit +``` + +### 3. Narrow Retry Logic + +Retry only on transient errors. Fail fast on authentication or bad request errors.
+ +```python +import time + +from anthropic import ( + APIConnectionError, + InternalServerError, + RateLimitError, +) + +_RETRYABLE_ERRORS = (APIConnectionError, RateLimitError, InternalServerError) +_MAX_RETRIES = 3 + +def call_with_retry(func, *, max_retries: int = _MAX_RETRIES): + """Call func(), retrying only transient errors with exponential backoff. + + max_retries is the total number of attempts; the last failure is re-raised. + """ + # With fewer than one attempt the loop would never call func and would return None. + if max_retries < 1: + raise ValueError("max_retries must be at least 1") + for attempt in range(max_retries): + try: + return func() + except _RETRYABLE_ERRORS: + if attempt == max_retries - 1: + raise + time.sleep(2 ** attempt) # Exponential backoff: 1s, 2s, 4s, ... + # AuthenticationError, BadRequestError etc. → raise immediately +``` + +### 4. Prompt Caching + +Mark long, stable system prompts as cacheable so repeat requests reuse the cached prefix instead of reprocessing it. The prompt is still sent with every request; cache hits are billed at a reduced rate. + +```python +messages = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": system_prompt, + "cache_control": {"type": "ephemeral"}, # Cache this + }, + { + "type": "text", + "text": user_input, # Variable part + }, + ], + } +] +``` + +## Composition + +Combine all four techniques in a single pipeline function: + +```python +def process(text: str, config: Config, tracker: CostTracker) -> tuple[Result, CostTracker]: + # 1. Route model + model = select_model(len(text), estimated_items, config.force_model) + + # 2. Check budget + if tracker.over_budget: + raise BudgetExceededError(tracker.total_cost, tracker.budget_limit) + + # 3. Call with retry + caching + response = call_with_retry(lambda: client.messages.create( + model=model, + max_tokens=1024, # Required by the Messages API; size to the expected output + messages=build_cached_messages(system_prompt, text), + )) + + # 4. Track cost (immutable) + record = CostRecord(model=model, input_tokens=..., output_tokens=..., cost_usd=...)
+ tracker = tracker.add(record) + + return parse_result(response), tracker +``` + +## Pricing Reference (2025-2026) + +| Model | Input ($/1M tokens) | Output ($/1M tokens) | Relative Cost | +|-------|---------------------|----------------------|---------------| +| Haiku 4.5 | $1.00 | $5.00 | 1x | +| Sonnet 4.5 | $3.00 | $15.00 | 3x | +| Opus 4.5 | $5.00 | $25.00 | 5x | + +List prices change; confirm against the provider's current pricing page before relying on these figures. + +## Best Practices + +- **Start with the cheapest model** and only route to expensive models when complexity thresholds are met +- **Set explicit budget limits** before processing batches — fail early rather than overspend +- **Log model selection decisions** so you can tune thresholds based on real data +- **Use prompt caching** for system prompts over 1024 tokens (the minimum cacheable length is higher on some models, so check the provider docs) — saves both cost and latency +- **Never retry on authentication or validation errors** — only transient failures (network, rate limit, server error) + +## Anti-Patterns to Avoid + +- Using the most expensive model for all requests regardless of complexity +- Retrying on all errors (wastes budget on permanent failures) +- Mutating cost tracking state (makes debugging and auditing difficult) +- Hardcoding model names throughout the codebase (use constants or config) +- Ignoring prompt caching for repetitive system prompts + +## When to Use + +- Any application calling Claude, OpenAI, or similar LLM APIs +- Batch processing pipelines where cost adds up quickly +- Multi-model architectures that need intelligent routing +- Production systems that need budget guardrails