From c9962bf83e38ac8ffcc174dc768ff002b8b731b2 Mon Sep 17 00:00:00 2001 From: Affaan Mustafa Date: Thu, 30 Apr 2026 09:20:23 -0400 Subject: [PATCH] fix: harden agent instruction surfaces --- .agents/skills/frontend-patterns/SKILL.md | 6 ++ .codex/AGENTS.md | 6 ++ .kiro/skills/search-first/SKILL.md | 6 ++ skills/autonomous-agent-harness/SKILL.md | 6 ++ skills/defi-amm-security/SKILL.md | 6 ++ tests/ci/agent-instruction-safety.test.js | 98 +++++++++++++++++++++++ 6 files changed, 128 insertions(+) create mode 100644 tests/ci/agent-instruction-safety.test.js diff --git a/.agents/skills/frontend-patterns/SKILL.md b/.agents/skills/frontend-patterns/SKILL.md index 51ed4d36..1caf1e40 100644 --- a/.agents/skills/frontend-patterns/SKILL.md +++ b/.agents/skills/frontend-patterns/SKILL.md @@ -17,6 +17,12 @@ Modern frontend patterns for React, Next.js, and performant user interfaces. - Handling client-side routing and navigation - Building accessible, responsive UI patterns +## Privacy and Data Boundaries + +Frontend examples should use synthetic or domain-generic data. Do not collect, log, persist, or display credentials, access tokens, SSNs, health data, payment details, private emails, phone numbers, or other sensitive personal data unless the user explicitly requests a scoped implementation with appropriate validation, redaction, and access controls. + +Avoid adding analytics, tracking pixels, third-party scripts, or external data sinks without explicit approval. When handling user data, prefer least-privilege APIs, client-side redaction before logging, and server-side validation for every boundary. 
+ ## Component Patterns ### Composition Over Inheritance diff --git a/.codex/AGENTS.md b/.codex/AGENTS.md index 52301662..7c6cfa75 100644 --- a/.codex/AGENTS.md +++ b/.codex/AGENTS.md @@ -60,6 +60,12 @@ The sync script (`scripts/sync-ecc-to-codex.sh`) uses a Node-based TOML parser t - **`--update-mcp`** — explicitly replaces all ECC-managed servers with the latest recommended config (safely removes subtables like `[mcp_servers.supabase.env]`). - **User config is always preserved** — custom servers, args, env vars, and credentials outside ECC-managed sections are never touched. +## External Action Boundaries + +Treat networked tools as read-only by default. Search, inspect, and draft freely within the user's requested scope, but require explicit user approval before posting, publishing, pushing, merging, opening paid jobs, dispatching remote agents, changing third-party resources, or modifying credentials. + +When approval is ambiguous, produce a local plan or draft artifact instead of taking the external action. Preserve user config and private state unless the user specifically asks for a scoped change. + ## Multi-Agent Support Codex now supports multi-agent workflows behind the experimental `features.multi_agent` flag. diff --git a/.kiro/skills/search-first/SKILL.md b/.kiro/skills/search-first/SKILL.md index e6af22af..d9f0c2d2 100644 --- a/.kiro/skills/search-first/SKILL.md +++ b/.kiro/skills/search-first/SKILL.md @@ -21,6 +21,12 @@ Use this skill when: - The user asks "add X functionality" and you're about to write code - Before creating a new utility, helper, or abstraction +## Scope and Approval Rules + +Default to read-only research: inspect the repo, package metadata, docs, and public examples before recommending a dependency or integration. Do not install packages, configure MCP servers, publish artifacts, open PRs, or make external write actions from this skill unless the user has explicitly approved that action in the current task. 
+ +When a candidate requires credentials, paid services, network writes, or project-wide config changes, return a recommendation and approval checkpoint instead of applying it directly. + ## Workflow ``` diff --git a/skills/autonomous-agent-harness/SKILL.md b/skills/autonomous-agent-harness/SKILL.md index c614cd39..3a8cba27 100644 --- a/skills/autonomous-agent-harness/SKILL.md +++ b/skills/autonomous-agent-harness/SKILL.md @@ -8,6 +8,12 @@ origin: ECC Turn Claude Code into a persistent, self-directing agent system using only native features and MCP servers. +## Consent and Safety Boundaries + +Autonomous operation must be explicitly requested and scoped by the user. Do not create schedules, dispatch remote agents, write persistent memory, use computer control, post externally, modify third-party resources, or act on private communications unless the user has approved that capability and the target workspace for the current setup. + +Prefer dry-run plans and local queue files before enabling recurring or event-driven actions. Keep credentials, private workspace exports, personal datasets, and account-specific automations out of reusable ECC artifacts. + ## When to Activate - User wants an agent that runs continuously or on a schedule diff --git a/skills/defi-amm-security/SKILL.md b/skills/defi-amm-security/SKILL.md index faf8aca7..d41fb36b 100644 --- a/skills/defi-amm-security/SKILL.md +++ b/skills/defi-amm-security/SKILL.md @@ -20,6 +20,12 @@ Critical vulnerability patterns and hardened implementations for Solidity AMM co Use this as a checklist-plus-pattern library. Review every user entrypoint against the categories below and prefer the hardened examples over hand-rolled variants. +## Execution Safety + +The shell commands in this skill are local audit examples. Run them only in a trusted checkout or disposable sandbox, and do not splice untrusted contract names, paths, RPC URLs, private keys, or user-supplied flags into shell commands. 
Ask before installing tools or running long fuzzing/static-analysis jobs that may consume significant local or paid resources.
+
+Never include secrets, private keys, seed phrases, API tokens, or mainnet signing credentials in command examples, logs, or reports.
+
 ## Examples
 
 ### Reentrancy: enforce CEI order
diff --git a/tests/ci/agent-instruction-safety.test.js b/tests/ci/agent-instruction-safety.test.js
new file mode 100644
index 00000000..1e0f465a
--- /dev/null
+++ b/tests/ci/agent-instruction-safety.test.js
@@ -0,0 +1,152 @@
+#!/usr/bin/env node
+/**
+ * Validate safety guardrails on agent-facing instruction artifacts.
+ *
+ * Each guardrail names a Markdown file, the heading line of its safety section,
+ * and phrases that must appear inside that section. Phrases are matched against
+ * the section body only, so text elsewhere in the file cannot satisfy a check.
+ */
+
+const assert = require('assert');
+const fs = require('fs');
+const path = require('path');
+
+const repoRoot = path.resolve(__dirname, '..', '..');
+
+const guardrails = [
+  {
+    path: '.codex/AGENTS.md',
+    heading: '## External Action Boundaries',
+    requiredPatterns: [
+      /read-only by default/i,
+      /explicit user approval/i,
+      /posting, publishing, pushing, merging/i,
+    ],
+  },
+  {
+    path: '.kiro/skills/search-first/SKILL.md',
+    heading: '## Scope and Approval Rules',
+    requiredPatterns: [
+      /Default to read-only research/i,
+      /Do not install packages/i,
+      /approval checkpoint/i,
+    ],
+  },
+  {
+    path: 'skills/autonomous-agent-harness/SKILL.md',
+    heading: '## Consent and Safety Boundaries',
+    requiredPatterns: [
+      /explicitly requested and scoped/i,
+      /Do not create schedules/i,
+      /Prefer dry-run plans/i,
+    ],
+  },
+  {
+    path: 'skills/defi-amm-security/SKILL.md',
+    heading: '## Execution Safety',
+    requiredPatterns: [
+      /local audit examples/i,
+      /trusted checkout or disposable sandbox/i,
+      /private keys, seed phrases/i,
+    ],
+  },
+  {
+    path: '.agents/skills/frontend-patterns/SKILL.md',
+    heading: '## Privacy and Data Boundaries',
+    requiredPatterns: [
+      /synthetic or domain-generic data/i,
+      /Do not collect, log, persist, or display/i,
+      /analytics, tracking pixels/i,
+    ],
+  },
+];
+
+/**
+ * Run one named check and print a pass/fail line for it.
+ *
+ * @param {string} name - Label printed next to the result marker.
+ * @param {Function} fn - Check body; a thrown error marks the check as failed.
+ * @returns {boolean} True when fn returned without throwing.
+ */
+function test(name, fn) {
+  try {
+    fn();
+    console.log(` ✓ ${name}`);
+    return true;
+  } catch (error) {
+    console.log(` ✗ ${name}`);
+    console.log(` Error: ${error.message}`);
+    return false;
+  }
+}
+
+/**
+ * Read a repository file as UTF-8 text.
+ *
+ * @param {string} relativePath - Path relative to the repository root.
+ * @returns {string} File contents.
+ */
+function read(relativePath) {
+  return fs.readFileSync(path.join(repoRoot, relativePath), 'utf8');
+}
+
+/**
+ * Extract the body of the Markdown section introduced by `heading`.
+ *
+ * The heading must match a whole line, so `### Foo` does not satisfy `## Foo`.
+ * The section ends at the next heading of the same or a higher level; `#` lines
+ * inside fenced code blocks are not treated as headings.
+ *
+ * @param {string} source - Full Markdown text.
+ * @param {string} heading - Heading line including its `#` markers.
+ * @returns {string|null} Section body without the heading, or null when absent.
+ */
+function extractSection(source, heading) {
+  const marker = /^(#{1,6})\s/.exec(heading);
+  assert.ok(marker, `invalid guardrail heading: ${heading}`);
+  const level = marker[1].length;
+
+  const lines = source.split(/\r?\n/);
+  const start = lines.findIndex((line) => line.trimEnd() === heading);
+  if (start === -1) return null;
+
+  const body = [];
+  let inFence = false;
+  for (const line of lines.slice(start + 1)) {
+    if (/^\s*(```|~~~)/.test(line)) inFence = !inFence;
+    const next = inFence ? null : /^(#{1,6})\s/.exec(line);
+    if (next && next[1].length <= level) break;
+    body.push(line);
+  }
+  return body.join('\n');
+}
+
+/**
+ * Check every guardrail as its own test, print a summary, and exit non-zero
+ * when any guardrail fails.
+ */
+function run() {
+  console.log('\n=== Testing agent instruction safety guardrails ===\n');
+
+  let passed = 0;
+  let failed = 0;
+
+  // One test per artifact so a failure in one file does not hide the others.
+  for (const guardrail of guardrails) {
+    const ok = test(`${guardrail.path} keeps scoped safety section`, () => {
+      const section = extractSection(read(guardrail.path), guardrail.heading);
+      assert.ok(section !== null, `${guardrail.path} missing ${guardrail.heading}`);
+      for (const pattern of guardrail.requiredPatterns) {
+        assert.ok(pattern.test(section), `${guardrail.path} missing ${pattern} under ${guardrail.heading}`);
+      }
+    });
+    if (ok) passed++; else failed++;
+  }
+
+  console.log(`\nPassed: ${passed}`);
+  console.log(`Failed: ${failed}`);
+
+  process.exit(failed > 0 ? 1 : 0);
+}
+
+run();