mirror of
https://github.com/affaan-m/everything-claude-code.git
synced 2026-03-30 21:53:28 +08:00
feat: add MCP health-check hook (#711)
This commit is contained in:
588
scripts/hooks/mcp-health-check.js
Normal file
588
scripts/hooks/mcp-health-check.js
Normal file
@@ -0,0 +1,588 @@
|
||||
#!/usr/bin/env node
|
||||
'use strict';
|
||||
|
||||
/**
|
||||
* MCP health-check hook.
|
||||
*
|
||||
* Compatible with Claude Code's existing hook events:
|
||||
* - PreToolUse: probe MCP server health before MCP tool execution
|
||||
* - PostToolUseFailure: mark unhealthy servers, attempt reconnect, and re-probe
|
||||
*
|
||||
* The hook persists health state outside the conversation context so it
|
||||
* survives compaction and later turns.
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const os = require('os');
|
||||
const path = require('path');
|
||||
const http = require('http');
|
||||
const https = require('https');
|
||||
const { spawn, spawnSync } = require('child_process');
|
||||
|
||||
// Maximum number of characters buffered from stdin before further input is dropped.
const MAX_STDIN = 1024 * 1024;

// Default lifetime of a cached "healthy" verdict (overridable via ECC_MCP_HEALTH_TTL_MS).
const DEFAULT_TTL_MS = 2 * 60 * 1000;

// Default timeout for health probes and reconnect commands.
const DEFAULT_TIMEOUT_MS = 5000;

// Base delay for the exponential retry backoff, and the ceiling it is clamped to.
const DEFAULT_BACKOFF_MS = 30 * 1000;
const MAX_BACKOFF_MS = 10 * 60 * 1000;

// HTTP statuses that count as "server is reachable" for a probe.
const HEALTHY_HTTP_CODES = new Set([
  200, 201, 202, 204,
  301, 302, 303, 304, 307, 308,
  405
]);

// HTTP statuses that are recorded as a failure code and trigger a reconnect attempt.
const RECONNECT_STATUS_CODES = new Set([401, 403, 429, 503]);

// Ordered list of failure signatures; the first pattern that matches decides the code.
const FAILURE_PATTERNS = [
  {
    code: 401,
    pattern: /\b401\b|unauthori[sz]ed|auth(?:entication)?\s+(?:failed|expired|invalid)/i
  },
  {
    code: 403,
    pattern: /\b403\b|forbidden|permission denied/i
  },
  {
    code: 429,
    pattern: /\b429\b|rate limit|too many requests/i
  },
  {
    code: 503,
    pattern: /\b503\b|service unavailable|overloaded|temporarily unavailable/i
  },
  {
    code: 'transport',
    pattern: /ECONNREFUSED|ENOTFOUND|EAI_AGAIN|timed? out|socket hang up|connection (?:failed|lost|reset|closed)/i
  }
];
|
||||
|
||||
/**
 * Read a finite, non-negative number from an environment variable.
 *
 * @param {string} name - Name of the environment variable.
 * @param {number} fallback - Value returned when the variable is unset,
 *   blank, not numeric, or negative.
 * @returns {number} The parsed value, or `fallback`.
 */
function envNumber(name, fallback) {
  const raw = process.env[name];

  // Number('') and Number('   ') are both 0, so a blank variable would
  // otherwise silently become a zero timeout/TTL instead of the default.
  if (typeof raw !== 'string' || raw.trim() === '') {
    return fallback;
  }

  const value = Number(raw);
  return Number.isFinite(value) && value >= 0 ? value : fallback;
}
|
||||
|
||||
/**
 * Locate the file that stores MCP health state.
 *
 * @returns {string} The resolved ECC_MCP_HEALTH_STATE_PATH when set, otherwise
 *   `<home>/.claude/mcp-health-cache.json`.
 */
function stateFilePath() {
  const override = process.env.ECC_MCP_HEALTH_STATE_PATH;

  return override
    ? path.resolve(override)
    : path.join(os.homedir(), '.claude', 'mcp-health-cache.json');
}
|
||||
|
||||
/**
 * List the config files to search for MCP server definitions, in priority order.
 *
 * When ECC_MCP_CONFIG_PATH is set it is split on the platform path delimiter,
 * blank entries are dropped and the rest are resolved to absolute paths.
 * Otherwise the project-level files (current working directory) come first,
 * followed by the user-level files (home directory).
 *
 * @returns {string[]} Candidate config file paths.
 */
function configPaths() {
  const override = process.env.ECC_MCP_CONFIG_PATH;

  if (!override) {
    const roots = [process.cwd(), os.homedir()];
    return roots.flatMap(root => [
      path.join(root, '.claude.json'),
      path.join(root, '.claude', 'settings.json')
    ]);
  }

  const resolved = [];
  for (const piece of override.split(path.delimiter)) {
    const trimmed = piece.trim();
    if (trimmed) {
      resolved.push(path.resolve(trimmed));
    }
  }
  return resolved;
}
|
||||
|
||||
/**
 * Read and parse a JSON file without ever throwing.
 *
 * @param {string} filePath - File to read as UTF-8.
 * @returns {*} The parsed JSON value, or `null` when the file cannot be read
 *   or does not contain valid JSON.
 */
function readJsonFile(filePath) {
  let text;
  try {
    text = fs.readFileSync(filePath, 'utf8');
  } catch {
    return null;
  }

  try {
    return JSON.parse(text);
  } catch {
    return null;
  }
}
|
||||
|
||||
/**
 * Load the persisted health state, repairing anything malformed.
 *
 * @param {string} filePath - Location of the state file.
 * @returns {object} The stored state object with a guaranteed `servers`
 *   object, or a fresh `{ version: 1, servers: {} }` when the file is missing,
 *   unparsable, or not a plain object.
 */
function loadState(filePath) {
  const isPlainObject = value => Boolean(value) && typeof value === 'object' && !Array.isArray(value);

  const loaded = readJsonFile(filePath);
  if (!isPlainObject(loaded)) {
    return { version: 1, servers: {} };
  }

  // Keep whatever else the file holds, but make sure `servers` is usable.
  if (!isPlainObject(loaded.servers)) {
    loaded.servers = {};
  }

  return loaded;
}
|
||||
|
||||
/**
 * Persist the health state as pretty-printed JSON (best effort).
 *
 * The parent directory is created when missing. Any failure is swallowed on
 * purpose so that a persistence problem never blocks the hook.
 *
 * @param {string} filePath - Destination of the state file.
 * @param {object} state - State object to serialise.
 * @returns {void}
 */
function saveState(filePath, state) {
  try {
    const parentDir = path.dirname(filePath);
    fs.mkdirSync(parentDir, { recursive: true });

    const serialized = JSON.stringify(state, null, 2);
    fs.writeFileSync(filePath, serialized);
  } catch {
    // Never block the hook on state persistence errors.
  }
}
|
||||
|
||||
/**
 * Collect stdin as UTF-8 text, keeping at most MAX_STDIN characters.
 *
 * Input beyond the cap is discarded. The promise never rejects: a stream
 * error resolves with whatever was buffered so far.
 *
 * @returns {Promise<string>} The (possibly truncated) raw stdin contents.
 */
function readRawStdin() {
  return new Promise(resolve => {
    const { stdin } = process;
    let buffered = '';
    const done = () => resolve(buffered);

    stdin.setEncoding('utf8');

    stdin.on('data', chunk => {
      const room = MAX_STDIN - buffered.length;
      if (room > 0) {
        buffered += chunk.substring(0, room);
      }
    });

    stdin.on('end', done);
    stdin.on('error', done);
  });
}
|
||||
|
||||
/**
 * Parse the hook payload, always yielding a plain object.
 *
 * @param {string} raw - Raw JSON text received on stdin.
 * @returns {object} The parsed object, or `{}` when `raw` is blank, is not
 *   valid JSON, or decodes to something other than a plain object.
 */
function safeParse(raw) {
  if (typeof raw !== 'string' || !raw.trim()) {
    return {};
  }

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch {
    return {};
  }

  // Valid JSON is not necessarily an object: `null`, numbers, strings and
  // arrays all parse fine. Callers read properties straight off the result,
  // which throws for `null`, so normalise every non-object payload to `{}`.
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    return {};
  }

  return parsed;
}
|
||||
|
||||
/**
 * Work out which MCP server (and tool) a hook payload refers to.
 *
 * An explicitly named server (`server`, `mcp_server`, or the matching
 * `tool_input` fields including `connector`) takes precedence. Otherwise the
 * tool name must look like `mcp__<server>__<tool>`.
 *
 * @param {object} input - Parsed hook payload.
 * @returns {{server: string, tool: string} | null} The target, or `null` when
 *   the payload does not refer to an MCP tool.
 */
function extractMcpTarget(input) {
  const firstTruthy = (...candidates) => candidates.find(Boolean) || null;
  const toolInput = input.tool_input;
  const toolName = String(input.tool_name || input.name || '');

  const namedServer = firstTruthy(
    input.server,
    input.mcp_server,
    toolInput?.server,
    toolInput?.mcp_server,
    toolInput?.connector
  );

  if (namedServer) {
    const namedTool = firstTruthy(
      input.tool,
      input.mcp_tool,
      toolInput?.tool,
      toolInput?.mcp_tool
    );

    return {
      server: String(namedServer),
      tool: namedTool ? String(namedTool) : toolName
    };
  }

  const PREFIX = 'mcp__';
  if (!toolName.startsWith(PREFIX)) {
    return null;
  }

  // "mcp__server__tool__with__parts" -> server "server", tool "tool__with__parts".
  const [serverPart, ...toolParts] = toolName.slice(PREFIX.length).split('__');
  if (toolParts.length === 0 || !serverPart) {
    return null;
  }

  return {
    server: serverPart,
    tool: toolParts.join('__')
  };
}
|
||||
|
||||
/**
 * Find the configuration entry for an MCP server.
 *
 * Config files are searched in the order given by `configPaths()`; within a
 * file `mcpServers` is consulted before `mcp_servers`. Only plain-object
 * entries are accepted.
 *
 * @param {string} serverName - Name of the MCP server.
 * @returns {{config: object, source: string} | null} The entry plus the file
 *   it came from, or `null` when no file defines the server.
 */
function resolveServerConfig(serverName) {
  const SECTION_KEYS = ['mcpServers', 'mcp_servers'];

  for (const source of configPaths()) {
    const parsed = readJsonFile(source);
    const candidate = SECTION_KEYS
      .map(section => parsed?.[section]?.[serverName])
      .find(Boolean) || null;

    const usable = Boolean(candidate) && typeof candidate === 'object' && !Array.isArray(candidate);
    if (!usable) {
      continue;
    }

    return { config: candidate, source };
  }

  return null;
}
|
||||
|
||||
/**
 * Record a server as healthy in the in-memory state.
 *
 * The verdict expires after ECC_MCP_HEALTH_TTL_MS (default DEFAULT_TTL_MS),
 * and all failure bookkeeping is reset.
 *
 * @param {object} state - State object; `state.servers` is mutated.
 * @param {string} serverName - Name of the MCP server.
 * @param {number} now - Current time in epoch milliseconds.
 * @param {object} [details={}] - Extra fields merged over the base record.
 * @returns {void}
 */
function markHealthy(state, serverName, now, details = {}) {
  const ttlMs = envNumber('ECC_MCP_HEALTH_TTL_MS', DEFAULT_TTL_MS);

  const baseRecord = {
    status: 'healthy',
    checkedAt: now,
    expiresAt: now + ttlMs,
    failureCount: 0,
    lastError: null,
    lastFailureCode: null,
    nextRetryAt: now,
    lastRestoredAt: now
  };

  // Caller-supplied details win over the base fields.
  state.servers[serverName] = { ...baseRecord, ...details };
}
|
||||
|
||||
/**
 * Record a server as unhealthy and schedule the next retry.
 *
 * The retry delay doubles with every consecutive failure, starting from
 * ECC_MCP_HEALTH_BACKOFF_MS (default DEFAULT_BACKOFF_MS) and capped at
 * MAX_BACKOFF_MS.
 *
 * @param {object} state - State object; `state.servers` is mutated.
 * @param {string} serverName - Name of the MCP server.
 * @param {number} now - Current time in epoch milliseconds.
 * @param {number|string|null} failureCode - Detected failure code, if any.
 * @param {string} [errorMessage] - Description of the failure.
 * @returns {void}
 */
function markUnhealthy(state, serverName, now, failureCode, errorMessage) {
  const prior = state.servers[serverName] || {};
  const failures = Number(prior.failureCount || 0) + 1;

  const baseDelay = envNumber('ECC_MCP_HEALTH_BACKOFF_MS', DEFAULT_BACKOFF_MS);
  const exponent = Math.max(failures - 1, 0);
  const retryDelay = Math.min(baseDelay * (2 ** exponent), MAX_BACKOFF_MS);

  const record = {
    status: 'unhealthy',
    checkedAt: now,
    // Already expired: an unhealthy verdict is never served from the cache.
    expiresAt: now,
    failureCount: failures,
    lastError: errorMessage || null,
    lastFailureCode: failureCode || null,
    nextRetryAt: now + retryDelay,
    lastRestoredAt: prior.lastRestoredAt || null
  };

  state.servers[serverName] = record;
}
|
||||
|
||||
/**
 * Gather every textual error hint from a failure payload.
 *
 * @param {object} input - Parsed hook payload.
 * @returns {string} The non-empty string fields, in a fixed order, joined by
 *   newlines; an empty string when there are none.
 */
function failureSummary(input) {
  const toolOutput = input.tool_output;

  const candidates = [
    input.error,
    input.message,
    input.tool_response,
    toolOutput,
    toolOutput?.output,
    toolOutput?.stderr,
    input.tool_input?.error
  ];

  // Only non-empty strings contribute; objects and other types are ignored.
  return candidates
    .filter(value => typeof value === 'string' && value !== '')
    .join('\n');
}
|
||||
|
||||
/**
 * Classify failure text against FAILURE_PATTERNS.
 *
 * @param {*} text - Failure text; falsy values are treated as an empty string.
 * @returns {number|string|null} The code of the first matching pattern, or
 *   `null` when nothing matches.
 */
function detectFailureCode(text) {
  const haystack = String(text || '');
  const hit = FAILURE_PATTERNS.find(({ pattern }) => pattern.test(haystack));

  return hit ? hit.code : null;
}
|
||||
|
||||
/**
 * Probe an HTTP(S) endpoint with a single GET request.
 *
 * The returned promise never rejects. A missing or malformed URL, an
 * unsupported protocol or invalid headers are reported as a failed probe
 * instead of escaping as an exception.
 *
 * @param {string} urlString - Absolute URL of the MCP server.
 * @param {object} headers - Request headers to send.
 * @param {number} timeoutMs - Socket inactivity timeout in milliseconds.
 * @returns {Promise<{ok: boolean, statusCode: number|null, reason: string}>}
 *   `ok` is true when the status is in HEALTHY_HTTP_CODES; `statusCode` is
 *   `null` when no response was received.
 */
function requestHttp(urlString, headers, timeoutMs) {
  return new Promise(resolve => {
    let settled = false;
    let timedOut = false;

    function settle(result) {
      if (settled) return;
      settled = true;
      resolve(result);
    }

    let req;
    try {
      // Both `new URL()` and `client.request()` throw synchronously on bad
      // input. Inside a promise executor that would reject the promise and
      // abort the whole hook, so it is converted into a failed probe result.
      const url = new URL(urlString);
      const client = url.protocol === 'https:' ? https : http;

      req = client.request(
        url,
        {
          method: 'GET',
          headers
        },
        res => {
          // Only the status matters; drain the body so the socket is released.
          res.resume();
          settle({
            ok: HEALTHY_HTTP_CODES.has(res.statusCode),
            statusCode: res.statusCode,
            reason: `HTTP ${res.statusCode}`
          });
        }
      );
    } catch (error) {
      settle({
        ok: false,
        statusCode: null,
        reason: error.message
      });
      return;
    }

    req.setTimeout(timeoutMs, () => {
      timedOut = true;
      // Destroying the request surfaces through the 'error' handler below.
      req.destroy(new Error('timeout'));
    });

    req.on('error', error => {
      settle({
        ok: false,
        statusCode: null,
        reason: timedOut ? 'request timed out' : error.message
      });
    });

    req.end();
  });
}
|
||||
|
||||
/**
 * Probe a stdio (command-based) MCP server by spawning it.
 *
 * The server counts as healthy when the process is still running after the
 * probe timeout (ECC_MCP_HEALTH_TIMEOUT_MS, default DEFAULT_TIMEOUT_MS); the
 * probe process is then terminated. A spawn error or an exit before the
 * timeout is reported as a failure, using captured stderr as the reason when
 * there is any. The promise never rejects.
 *
 * @param {string} serverName - Name of the MCP server (used in messages).
 * @param {object} config - Server config providing `command` and optionally
 *   `args` and `env`.
 * @returns {Promise<{ok: boolean, statusCode: number|null, reason: string}>}
 *   `statusCode` carries the process exit code when the process exited.
 */
function probeCommandServer(serverName, config) {
  return new Promise(resolve => {
    const command = config.command;
    const args = Array.isArray(config.args) ? config.args.map(arg => String(arg)) : [];
    const timeoutMs = envNumber('ECC_MCP_HEALTH_TIMEOUT_MS', DEFAULT_TIMEOUT_MS);
    const mergedEnv = {
      ...process.env,
      ...(config.env && typeof config.env === 'object' && !Array.isArray(config.env) ? config.env : {})
    };

    let stderr = '';
    let done = false;
    let timer = null;

    function finish(result) {
      if (done) return;
      done = true;
      // Cancel the pending timeout so it cannot fire against a process that
      // has already exited or failed to start.
      if (timer !== null) {
        clearTimeout(timer);
      }
      resolve(result);
    }

    let child;
    try {
      child = spawn(command, args, {
        env: mergedEnv,
        cwd: process.cwd(),
        stdio: ['pipe', 'ignore', 'pipe']
      });
    } catch (error) {
      finish({
        ok: false,
        statusCode: null,
        reason: error.message
      });
      return;
    }

    child.stderr.on('data', chunk => {
      // Keep at most 4000 characters of stderr for the failure reason.
      if (stderr.length < 4000) {
        const remaining = 4000 - stderr.length;
        stderr += String(chunk).slice(0, remaining);
      }
    });

    child.on('error', error => {
      finish({
        ok: false,
        statusCode: null,
        reason: error.message
      });
    });

    child.on('exit', (code, signal) => {
      finish({
        ok: false,
        statusCode: code,
        // `??` rather than `||` so that a clean exit (code 0) is reported as
        // "0" instead of "unknown".
        reason: stderr.trim() || `process exited before handshake (${signal ?? code ?? 'unknown'})`
      });
    });

    timer = setTimeout(() => {
      try {
        child.kill('SIGTERM');
      } catch {
        // ignore
      }

      // Escalate shortly afterwards in case SIGTERM was not enough.
      setTimeout(() => {
        try {
          child.kill('SIGKILL');
        } catch {
          // ignore
        }
      }, 200).unref?.();

      finish({
        ok: true,
        statusCode: null,
        reason: `${serverName} accepted a new stdio process`
      });
    }, timeoutMs);

    if (typeof timer.unref === 'function') {
      timer.unref();
    }
  });
}
|
||||
|
||||
/**
 * Probe a server using the transport its config describes.
 *
 * Configs with `type: 'http'` or a `url` are probed over HTTP, configs with a
 * `command` are probed by spawning the process, and anything else is
 * reported as unsupported.
 *
 * @param {string} serverName - Name of the MCP server.
 * @param {{config: object, source: string}} resolvedConfig - Result of
 *   `resolveServerConfig`.
 * @returns {Promise<{ok: boolean, failureCode: number|null, reason: string, source: string}>}
 *   `failureCode` is set only when the probe status is one of
 *   RECONNECT_STATUS_CODES.
 */
async function probeServer(serverName, resolvedConfig) {
  const { config, source } = resolvedConfig;

  let outcome;
  if (config.type === 'http' || config.url) {
    const timeoutMs = envNumber('ECC_MCP_HEALTH_TIMEOUT_MS', DEFAULT_TIMEOUT_MS);
    outcome = await requestHttp(config.url, config.headers || {}, timeoutMs);
  } else if (config.command) {
    outcome = await probeCommandServer(serverName, config);
  } else {
    return {
      ok: false,
      failureCode: null,
      reason: 'unsupported MCP server config',
      source
    };
  }

  const { ok, statusCode, reason } = outcome;
  return {
    ok,
    failureCode: RECONNECT_STATUS_CODES.has(statusCode) ? statusCode : null,
    reason,
    source
  };
}
|
||||
|
||||
/**
 * Build the shell command used to reconnect an MCP server.
 *
 * A per-server variable `ECC_MCP_RECONNECT_<NAME>` (name upper-cased, every
 * character outside A-Z/0-9 replaced by `_`) takes precedence over the
 * generic `ECC_MCP_RECONNECT_COMMAND`. A `{server}` placeholder in the
 * command is replaced with the server name.
 *
 * @param {string} serverName - Name of the MCP server.
 * @returns {string|null} The command to run, or `null` when none is
 *   configured or when the server name is not safe to place in a shell
 *   command.
 */
function reconnectCommand(serverName) {
  const name = String(serverName);
  const key = `ECC_MCP_RECONNECT_${name.toUpperCase().replace(/[^A-Z0-9]/g, '_')}`;
  const command = process.env[key] || process.env.ECC_MCP_RECONNECT_COMMAND || '';
  if (!command.trim()) {
    return null;
  }

  if (!command.includes('{server}')) {
    return command;
  }

  // The resulting string is executed through a shell by attemptReconnect, and
  // the server name can originate from hook input. Refuse to interpolate
  // names containing anything a shell could interpret (spaces, quotes, `;`,
  // `$`, backticks, ...) rather than risk command injection.
  if (!/^[A-Za-z0-9._@:-]+$/.test(name)) {
    return null;
  }

  // A replacer function inserts the name literally; a replacement string
  // would expand `$&`, `$1`, etc.
  return command.replace(/\{server\}/g, () => name);
}
|
||||
|
||||
/**
 * Run the configured reconnect command for a server, synchronously.
 *
 * The command runs through a shell with the current environment and working
 * directory, bounded by ECC_MCP_RECONNECT_TIMEOUT_MS (default
 * DEFAULT_TIMEOUT_MS).
 *
 * @param {string} serverName - Name of the MCP server.
 * @returns {{attempted: boolean, success: boolean, reason: string}}
 *   `attempted` is false when no reconnect command is configured.
 */
function attemptReconnect(serverName) {
  const command = reconnectCommand(serverName);
  if (!command) {
    return { attempted: false, success: false, reason: 'no reconnect command configured' };
  }

  const timeout = envNumber('ECC_MCP_RECONNECT_TIMEOUT_MS', DEFAULT_TIMEOUT_MS);
  const { error, status, stderr, stdout } = spawnSync(command, {
    shell: true,
    env: process.env,
    cwd: process.cwd(),
    encoding: 'utf8',
    timeout
  });

  // The command could not be started or was terminated (e.g. by the timeout).
  if (error) {
    return { attempted: true, success: false, reason: error.message };
  }

  if (status === 0) {
    return { attempted: true, success: true, reason: 'reconnect command completed' };
  }

  const detail = stderr || stdout || `reconnect exited ${status}`;
  return { attempted: true, success: false, reason: detail.trim() };
}
|
||||
|
||||
/**
 * Tell whether an unhealthy server should still let tool calls through.
 *
 * @returns {boolean} True when ECC_MCP_HEALTH_FAIL_OPEN is `1`, `true` or
 *   `yes` (case-insensitive).
 */
function shouldFailOpen() {
  const flag = process.env.ECC_MCP_HEALTH_FAIL_OPEN || '';
  const truthyPattern = /^(1|true|yes)$/i;

  return truthyPattern.test(String(flag));
}
|
||||
|
||||
/**
 * Write each log entry to stderr on its own line.
 *
 * @param {Iterable<string>} logs - Messages to emit, in order.
 * @returns {void}
 */
function emitLogs(logs) {
  for (const entry of logs) {
    const rendered = `${entry}\n`;
    process.stderr.write(rendered);
  }
}
|
||||
|
||||
/**
 * PreToolUse handler: decide whether an MCP tool call may proceed.
 *
 * Flow:
 *   1. A cached, unexpired "healthy" verdict lets the call through.
 *   2. An "unhealthy" verdict still inside its backoff window blocks the call
 *      without probing.
 *   3. Otherwise the server is probed. On failure a reconnect is attempted
 *      when the probe produced a failure code or the server was already
 *      unhealthy, followed by a re-probe if the reconnect succeeded.
 *
 * A blocked call exits with 2, or with 0 when fail-open is enabled.
 *
 * @param {string} rawInput - Raw stdin payload, echoed back unchanged.
 * @param {object} input - Parsed payload (not used by this handler).
 * @param {{server: string, tool: string}} target - MCP target of the call.
 * @param {string} statePathValue - Path of the health state file.
 * @param {number} now - Current time in epoch milliseconds.
 * @returns {Promise<{rawInput: string, exitCode: number, logs: string[]}>}
 */
async function handlePreToolUse(rawInput, input, target, statePathValue, now) {
  const logs = [];
  const { server, tool } = target;
  const toolLabel = tool || 'tool';
  const respond = exitCode => ({ rawInput, exitCode, logs });
  const blockedExitCode = () => (shouldFailOpen() ? 0 : 2);

  const state = loadState(statePathValue);
  const previous = state.servers[server] || {};
  const wasUnhealthy = previous.status === 'unhealthy';

  // 1. Fresh healthy verdict: nothing to do.
  if (previous.status === 'healthy' && Number(previous.expiresAt || 0) > now) {
    return respond(0);
  }

  // 2. Still backing off after an earlier failure: do not probe again yet.
  if (wasUnhealthy && Number(previous.nextRetryAt || 0) > now) {
    const retryAt = new Date(previous.nextRetryAt).toISOString();
    logs.push(
      `[MCPHealthCheck] ${server} is marked unhealthy until ${retryAt}; skipping ${toolLabel}`
    );
    return respond(blockedExitCode());
  }

  const resolvedConfig = resolveServerConfig(server);
  if (!resolvedConfig) {
    logs.push(`[MCPHealthCheck] No MCP config found for ${server}; skipping preflight probe`);
    return respond(0);
  }

  // 3. Probe the server.
  const probe = await probeServer(server, resolvedConfig);
  if (probe.ok) {
    markHealthy(state, server, now, { source: resolvedConfig.source });
    saveState(statePathValue, state);

    if (wasUnhealthy) {
      logs.push(`[MCPHealthCheck] ${server} connection restored`);
    }

    return respond(0);
  }

  let reconnect = { attempted: false, success: false, reason: 'probe failed' };
  const shouldReconnect = Boolean(probe.failureCode) || wasUnhealthy;
  if (shouldReconnect) {
    reconnect = attemptReconnect(server);

    if (reconnect.success) {
      const reprobe = await probeServer(server, resolvedConfig);

      if (reprobe.ok) {
        markHealthy(state, server, now, {
          source: resolvedConfig.source,
          restoredBy: 'reconnect-command'
        });
        saveState(statePathValue, state);
        logs.push(`[MCPHealthCheck] ${server} connection restored after reconnect`);
        return respond(0);
      }

      probe.reason = `${probe.reason}; reconnect reprobe failed: ${reprobe.reason}`;
    }
  }

  markUnhealthy(state, server, now, probe.failureCode, probe.reason);
  saveState(statePathValue, state);

  let reconnectSuffix = '';
  if (reconnect.attempted) {
    const verdict = reconnect.success ? 'ok' : reconnect.reason;
    reconnectSuffix = ` Reconnect attempt: ${verdict}.`;
  }

  logs.push(
    `[MCPHealthCheck] ${server} is unavailable (${probe.reason}). Blocking ${toolLabel} so Claude can fall back to non-MCP tools.${reconnectSuffix}`
  );

  return respond(blockedExitCode());
}
|
||||
|
||||
/**
 * PostToolUseFailure handler: react to a failed MCP tool call.
 *
 * When the failure text matches a known pattern the server is marked
 * unhealthy and a reconnect is attempted. If the reconnect succeeds and a
 * follow-up probe passes, the server is marked healthy again. This handler
 * always exits with 0.
 *
 * @param {string} rawInput - Raw stdin payload, echoed back unchanged.
 * @param {object} input - Parsed hook payload describing the failure.
 * @param {{server: string, tool: string}} target - MCP target of the call.
 * @param {string} statePathValue - Path of the health state file.
 * @param {number} now - Current time in epoch milliseconds.
 * @returns {Promise<{rawInput: string, exitCode: number, logs: string[]}>}
 */
async function handlePostToolUseFailure(rawInput, input, target, statePathValue, now) {
  const logs = [];
  const { server } = target;
  const passThrough = () => ({ rawInput, exitCode: 0, logs });

  const summary = failureSummary(input);
  const failureCode = detectFailureCode(summary);
  if (!failureCode) {
    // Not a failure this hook recognises; leave the health state untouched.
    return passThrough();
  }

  const state = loadState(statePathValue);
  markUnhealthy(state, server, now, failureCode, summary.slice(0, 500));
  saveState(statePathValue, state);
  logs.push(`[MCPHealthCheck] ${server} reported ${failureCode}; marking server unhealthy and attempting reconnect`);

  const reconnect = attemptReconnect(server);
  if (!reconnect.attempted) {
    logs.push(`[MCPHealthCheck] ${server} reconnect skipped: ${reconnect.reason}`);
    return passThrough();
  }
  if (!reconnect.success) {
    logs.push(`[MCPHealthCheck] ${server} reconnect failed: ${reconnect.reason}`);
    return passThrough();
  }

  const resolvedConfig = resolveServerConfig(server);
  if (!resolvedConfig) {
    logs.push(`[MCPHealthCheck] ${server} reconnect completed but no config was available for a follow-up probe`);
    return passThrough();
  }

  const reprobe = await probeServer(server, resolvedConfig);
  if (!reprobe.ok) {
    logs.push(`[MCPHealthCheck] ${server} reconnect command ran, but health probe still failed: ${reprobe.reason}`);
    return passThrough();
  }

  // Re-read the state file before recording the recovery.
  const refreshed = loadState(statePathValue);
  markHealthy(refreshed, server, now, {
    source: resolvedConfig.source,
    restoredBy: 'post-failure-reconnect'
  });
  saveState(statePathValue, refreshed);
  logs.push(`[MCPHealthCheck] ${server} connection restored`);

  return passThrough();
}
|
||||
|
||||
/**
 * Hook entry point.
 *
 * Reads the payload from stdin, echoes it back unchanged on stdout, and
 * exits with the handler's exit code. Payloads that do not target an MCP
 * server pass straight through with exit code 0. The handler is chosen from
 * CLAUDE_HOOK_EVENT_NAME (defaults to 'PreToolUse').
 *
 * @returns {Promise<void>}
 */
async function main() {
  // process.exit() does not wait for pending asynchronous stdio writes, so
  // output could be cut short where stdio is asynchronous. Exit only after
  // stdout, and then the stderr log lines queued before it, have been handed
  // off.
  const exitAfterFlush = (output, exitCode) => {
    process.stdout.write(output, () => {
      process.stderr.write('', () => process.exit(exitCode));
    });
  };

  const rawInput = await readRawStdin();
  const input = safeParse(rawInput);
  const target = extractMcpTarget(input);

  if (!target) {
    exitAfterFlush(rawInput, 0);
    return;
  }

  const eventName = process.env.CLAUDE_HOOK_EVENT_NAME || 'PreToolUse';
  const now = Date.now();
  const statePathValue = stateFilePath();

  const result = eventName === 'PostToolUseFailure'
    ? await handlePostToolUseFailure(rawInput, input, target, statePathValue, now)
    : await handlePreToolUse(rawInput, input, target, statePathValue, now);

  emitLogs(result.logs);
  exitAfterFlush(result.rawInput, result.exitCode);
}
|
||||
|
||||
// Last-resort guard: an unexpected failure is reported on stderr, and the
// hook still exits with 0 so that it never blocks the tool call.
main().catch(function reportUnexpectedError(error) {
  const notice = `[MCPHealthCheck] Unexpected error: ${error.message}\n`;
  process.stderr.write(notice);
  process.exit(0);
});
|
||||
Reference in New Issue
Block a user