From b0cbd3df1533b396d281a6886d5132f623393a9c Mon Sep 17 00:00:00 2001 From: zack-anthropic Date: Fri, 6 Mar 2026 12:06:23 -0800 Subject: [PATCH] skill-creator: drop ANTHROPIC_API_KEY requirement from description optimizer (#547) improve_description.py now calls `claude -p` as a subprocess instead of the Anthropic SDK, so users no longer need a separate ANTHROPIC_API_KEY to run the description optimization loop. Same auth pattern run_eval.py already used for the triggering eval. Prompts go over stdin (they embed the full SKILL.md body). Strips CLAUDECODE env var to allow nesting inside a Claude Code session. The over-1024-char retry is now a fresh single-turn call that inlines the too-long version rather than a multi-turn followup. SKILL.md: dropped the stale "extended thinking" reference to match. --- skills/skill-creator/SKILL.md | 10 +- .../scripts/improve_description.py | 103 +++++++++--------- skills/skill-creator/scripts/run_loop.py | 4 - 3 files changed, 59 insertions(+), 58 deletions(-) diff --git a/skills/skill-creator/SKILL.md b/skills/skill-creator/SKILL.md index 942bfe89..65b3a402 100644 --- a/skills/skill-creator/SKILL.md +++ b/skills/skill-creator/SKILL.md @@ -1,6 +1,6 @@ --- name: skill-creator -description: Create new skills, modify and improve existing skills, and measure skill performance. Use when users want to create a skill from scratch, update or optimize an existing skill, run evals to test a skill, benchmark skill performance with variance analysis, or optimize a skill's description for better triggering accuracy. +description: Create new skills, modify and improve existing skills, and measure skill performance. Use when users want to create a skill from scratch, edit, or optimize an existing skill, run evals to test a skill, benchmark skill performance with variance analysis, or optimize a skill's description for better triggering accuracy. 
--- # Skill Creator @@ -391,7 +391,7 @@ Use the model ID from your system prompt (the one powering the current session) While it runs, periodically tail the output to give the user updates on which iteration it's on and what the scores look like. -This handles the full optimization loop automatically. It splits the eval set into 60% train and 40% held-out test, evaluates the current description (running each query 3 times to get a reliable trigger rate), then calls Claude with extended thinking to propose improvements based on what failed. It re-evaluates each new description on both train and test, iterating up to 5 times. When it's done, it opens an HTML report in the browser showing the results per iteration and returns JSON with `best_description` — selected by test score rather than train score to avoid overfitting. +This handles the full optimization loop automatically. It splits the eval set into 60% train and 40% held-out test, evaluates the current description (running each query 3 times to get a reliable trigger rate), then calls Claude to propose improvements based on what failed. It re-evaluates each new description on both train and test, iterating up to 5 times. When it's done, it opens an HTML report in the browser showing the results per iteration and returns JSON with `best_description` — selected by test score rather than train score to avoid overfitting. ### How skill triggering works @@ -435,6 +435,11 @@ In Claude.ai, the core workflow is the same (draft → test → review → impro **Packaging**: The `package_skill.py` script works anywhere with Python and a filesystem. On Claude.ai, you can run it and the user can download the resulting `.skill` file. +**Updating an existing skill**: The user might be asking you to update an existing skill, not create a new one. In this case: +- **Preserve the original name.** Note the skill's directory name and `name` frontmatter field -- use them unchanged. 
E.g., if the installed skill is `research-helper`, output `research-helper.skill` (not `research-helper-v2`). +- **Copy to a writeable location before editing.** The installed skill path may be read-only. Copy to `/tmp/skill-name/`, edit there, and package from the copy. +- **If packaging manually, stage in `/tmp/` first**, then copy to the output directory -- direct writes may fail due to permissions. + --- ## Cowork-Specific Instructions @@ -447,6 +452,7 @@ If you're in Cowork, the main things to know are: - Feedback works differently: since there's no running server, the viewer's "Submit All Reviews" button will download `feedback.json` as a file. You can then read it from there (you may have to request access first). - Packaging works — `package_skill.py` just needs Python and a filesystem. - Description optimization (`run_loop.py` / `run_eval.py`) should work in Cowork just fine since it uses `claude -p` via subprocess, not a browser, but please save it until you've fully finished making the skill and the user agrees it's in good shape. +- **Updating an existing skill**: The user might be asking you to update an existing skill, not create a new one. Follow the update guidance in the claude.ai section above. --- diff --git a/skills/skill-creator/scripts/improve_description.py b/skills/skill-creator/scripts/improve_description.py index a270777b..06bcec76 100755 --- a/skills/skill-creator/scripts/improve_description.py +++ b/skills/skill-creator/scripts/improve_description.py @@ -2,22 +2,52 @@ """Improve a skill description based on eval results. Takes eval results (from run_eval.py) and generates an improved description -using Claude with extended thinking. +by calling `claude -p` as a subprocess (same auth pattern as run_eval.py — +uses the session's Claude Code auth, no separate ANTHROPIC_API_KEY needed). 
""" import argparse import json +import os import re +import subprocess import sys from pathlib import Path -import anthropic - from scripts.utils import parse_skill_md +def _call_claude(prompt: str, model: str | None, timeout: int = 300) -> str: + """Run `claude -p` with the prompt on stdin and return the text response. + + Prompt goes over stdin (not argv) because it embeds the full SKILL.md + body and can easily exceed comfortable argv length. + """ + cmd = ["claude", "-p", "--output-format", "text"] + if model: + cmd.extend(["--model", model]) + + # Remove CLAUDECODE env var to allow nesting claude -p inside a + # Claude Code session. The guard is for interactive terminal conflicts; + # programmatic subprocess usage is safe. Same pattern as run_eval.py. + env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + + result = subprocess.run( + cmd, + input=prompt, + capture_output=True, + text=True, + env=env, + timeout=timeout, + ) + if result.returncode != 0: + raise RuntimeError( + f"claude -p exited {result.returncode}\nstderr: {result.stderr}" + ) + return result.stdout + + def improve_description( - client: anthropic.Anthropic, skill_name: str, skill_content: str, current_description: str, @@ -99,7 +129,7 @@ Based on the failures, write a new and improved description that is more likely 1. Avoid overfitting 2. The list might get loooong and it's injected into ALL queries and there might be a lot of skills, so we don't want to blow too much space on any given description. -Concretely, your description should not be more than about 100-200 words, even if that comes at the cost of accuracy. +Concretely, your description should not be more than about 100-200 words, even if that comes at the cost of accuracy. There is a hard limit of 1024 characters — descriptions over that will be truncated, so stay comfortably under it. 
Here are some tips that we've found to work well in writing these descriptions: - The skill should be phrased in the imperative -- "Use this skill for" rather than "this skill does" @@ -111,70 +141,41 @@ I'd encourage you to be creative and mix up the style in different iterations si Please respond with only the new description text in <description> tags, nothing else.""" -    response = client.messages.create( -        model=model, -        max_tokens=16000, -        thinking={ -            "type": "enabled", -            "budget_tokens": 10000, -        }, -        messages=[{"role": "user", "content": prompt}], -    ) +    text = _call_claude(prompt, model) -    # Extract thinking and text from response -    thinking_text = "" -    text = "" -    for block in response.content: -        if block.type == "thinking": -            thinking_text = block.thinking -        elif block.type == "text": -            text = block.text - -    # Parse out the <description> tags match = re.search(r"<description>(.*?)</description>", text, re.DOTALL) description = match.group(1).strip().strip('"') if match else text.strip().strip('"') -    # Log the transcript transcript: dict = { "iteration": iteration, "prompt": prompt, -        "thinking": thinking_text, "response": text, "parsed_description": description, "char_count": len(description), "over_limit": len(description) > 1024, } -    # If over 1024 chars, ask the model to shorten it +    # Safety net: the prompt already states the 1024-char hard limit, but if +    # the model blew past it anyway, make one fresh single-turn call that +    # quotes the too-long version and asks for a shorter rewrite. (The old +    # SDK path did this as a true multi-turn; `claude -p` is one-shot, so we +    # inline the prior output into the new prompt instead.) if len(description) > 1024: -        shorten_prompt = f"Your description is {len(description)} characters, which exceeds the hard 1024 character limit. Please rewrite it to be under 1024 characters while preserving the most important trigger words and intent coverage. Respond with only the new description in <description> tags."
-        shorten_response = client.messages.create( -            model=model, -            max_tokens=16000, -            thinking={ -                "type": "enabled", -                "budget_tokens": 10000, -            }, -            messages=[ -                {"role": "user", "content": prompt}, -                {"role": "assistant", "content": text}, -                {"role": "user", "content": shorten_prompt}, -            ], +        shorten_prompt = ( +            f"{prompt}\n\n" +            f"---\n\n" +            f"A previous attempt produced this description, which at " +            f"{len(description)} characters is over the 1024-character hard limit:\n\n" +            f'"{description}"\n\n' +            f"Rewrite it to be under 1024 characters while keeping the most " +            f"important trigger words and intent coverage. Respond with only " +            f"the new description in <description> tags." ) - -        shorten_thinking = "" -        shorten_text = "" -        for block in shorten_response.content: -            if block.type == "thinking": -                shorten_thinking = block.thinking -            elif block.type == "text": -                shorten_text = block.text - +        shorten_text = _call_claude(shorten_prompt, model) match = re.search(r"<description>(.*?)</description>", shorten_text, re.DOTALL) shortened = match.group(1).strip().strip('"') if match else shorten_text.strip().strip('"') transcript["rewrite_prompt"] = shorten_prompt -        transcript["rewrite_thinking"] = shorten_thinking transcript["rewrite_response"] = shorten_text transcript["rewrite_description"] = shortened transcript["rewrite_char_count"] = len(shortened) @@ -216,9 +217,7 @@ def main(): print(f"Current: {current_description}", file=sys.stderr) print(f"Score: {eval_results['summary']['passed']}/{eval_results['summary']['total']}", file=sys.stderr) -    client = anthropic.Anthropic() new_description = improve_description( -        client=client, skill_name=name, skill_content=content, current_description=current_description, diff --git a/skills/skill-creator/scripts/run_loop.py b/skills/skill-creator/scripts/run_loop.py index 36f9b4e0..30a263d6 100755 --- a/skills/skill-creator/scripts/run_loop.py +++ b/skills/skill-creator/scripts/run_loop.py @@ -15,8 +15,6 @@ import time import webbrowser from pathlib import Path -import 
anthropic - from scripts.generate_report import generate_html from scripts.improve_description import improve_description from scripts.run_eval import find_project_root, run_eval @@ -75,7 +73,6 @@ def run_loop( train_set = eval_set test_set = [] - client = anthropic.Anthropic() history = [] exit_reason = "unknown" @@ -200,7 +197,6 @@ def run_loop( for h in history ] new_description = improve_description( - client=client, skill_name=name, skill_content=content, current_description=current_description,