diff --git a/src/agents/opencode-sdk.mjs.d.ts b/src/agents/opencode-sdk.mjs.d.ts index f61c7aa..a79e38e 100644 --- a/src/agents/opencode-sdk.mjs.d.ts +++ b/src/agents/opencode-sdk.mjs.d.ts @@ -2,6 +2,6 @@ * Type declarations for opencode-sdk.mjs wrapper */ -declare const createOpencodeClient: any; +declare const createOpencodeClient: unknown; export { createOpencodeClient }; diff --git a/src/agents/opencode.ts b/src/agents/opencode.ts index eb7d89e..fa0c1fa 100644 --- a/src/agents/opencode.ts +++ b/src/agents/opencode.ts @@ -16,7 +16,7 @@ import { } from './types.js'; // Import SDK client dynamically since it's ESM-only -let _createOpencodeClient: any; +let _createOpencodeClient: unknown; const loadSDK = async () => { if (!_createOpencodeClient) { const sdkWrapper = await import('./opencode-sdk.mjs'); @@ -34,7 +34,7 @@ let nextPort = 4097; */ async function spawnServer( cwd: string, - config: Record, + config: Record, timeoutMs: number, ): Promise<{ url: string; proc: ChildProcess }> { const port = nextPort++; @@ -46,8 +46,8 @@ async function spawnServer( }, }); - const url = await new Promise((resolve, reject) => { - const id = setTimeout(() => { + const _url = await new Promise((resolve, reject) => { // eslint-disable-line @typescript-eslint/no-unused-vars + const _id = setTimeout(() => { proc.kill(); reject(new Error(`Timeout waiting for opencode server after ${timeoutMs}ms`)); }, timeoutMs); @@ -59,7 +59,7 @@ async function spawnServer( if (line.startsWith('opencode server listening')) { const match = line.match(/on\s+(https?:\/\/[^\s]+)/); if (match) { - clearTimeout(id); + clearTimeout(_id); resolve(match[1]); return; } @@ -70,16 +70,16 @@ async function spawnServer( output += chunk.toString(); }); proc.on('exit', (code) => { - clearTimeout(id); + clearTimeout(_id); reject(new Error(`Server exited with code ${code}: ${output}`)); }); proc.on('error', (err) => { - clearTimeout(id); + clearTimeout(_id); reject(err); }); }); - return { url, proc }; + return { 
url: _url, proc }; } /** @@ -90,9 +90,9 @@ export class OpencodeAgent implements AgentWrapper { displayName = 'Opencode'; private cliPath: string; - private config: Record; + private config: Record; - constructor(cliPath: string = 'opencode', config?: Record) { + constructor(cliPath: string = 'opencode', config?: Record) { this.cliPath = cliPath; this.config = config || { model: 'local-glm/glm-4.7-local-4bit', @@ -149,7 +149,7 @@ export class OpencodeAgent implements AgentWrapper { const toolCalls: ToolCall[] = []; let model = 'unknown'; let sessionId = ''; - let serverProc: ChildProcess | null = null; + let _serverProc: ChildProcess | null = null; try { // Spawn server in the case's working directory @@ -157,11 +157,12 @@ export class OpencodeAgent implements AgentWrapper { const config = options.model ? { ...this.config, model: options.model } : this.config; - const { url, proc } = await spawnServer(cwd, config, 15000); - serverProc = proc; + const { url: _url, proc } = await spawnServer(cwd, config, 15000); + _serverProc = proc; const createClient = await loadSDK(); - const client = createClient({ baseUrl: url }); + if (!createClient) throw new Error("Failed to load SDK"); + const client = (createClient as () => any)(); // eslint-disable-line @typescript-eslint/no-explicit-any const createResult = await client.session.create({}); if (createResult.error) { @@ -176,9 +177,11 @@ export class OpencodeAgent implements AgentWrapper { // Subscribe to SSE events BEFORE sending the prompt so we capture everything // event.subscribe() returns ServerSentEventsResult directly (not { data, error }) - const sseResult = await client.event.subscribe({}) as any; - const stream: AsyncIterable | undefined = - sseResult?.stream || sseResult?.data?.stream || sseResult?.data; + const sseResult = await client.event.subscribe({}) as any; // eslint-disable-line @typescript-eslint/no-explicit-any + const stream: AsyncIterable | undefined = + (sseResult as { stream?: AsyncIterable; data?: 
{ stream?: AsyncIterable } })?.stream || + (sseResult as { data?: { stream?: AsyncIterable } })?.data?.stream || + (sseResult as { data?: AsyncIterable })?.data; if (!stream) { throw new Error( @@ -202,7 +205,7 @@ export class OpencodeAgent implements AgentWrapper { let answer = ''; let numTurns = 0; let totalTokens = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 }; - let totalCost = 0; + let totalCost: number = 0; const deadline = Date.now() + timeoutMs - 5000; for await (const event of stream) { @@ -211,25 +214,28 @@ export class OpencodeAgent implements AgentWrapper { break; } - const eventType = event?.type || event?.event; + const eventType = (event as { type?: string; event?: string })?.type ?? (event as { type?: string; event?: string })?.event ?? ''; if (eventType === 'message.part.updated') { - const props = event.properties || event.data; + const eventAny = event as { properties?: unknown; data?: unknown }; + const props = eventAny.properties || eventAny.data || {}; if (!props) continue; - const part = props.part; + const part = (props as { part?: unknown }).part || ({} as Record); if (!part) continue; - if (part.type === 'text') { + const partAny = part as { type?: string; text?: string; state?: { status?: string; input?: unknown; time?: { start?: number; end?: number }; output?: unknown }; callID?: string; callId?: string; tool?: string; tokens?: { input?: number; output?: number; cache?: { read?: number; write?: number }; total?: number }; cost?: number }; + if (partAny.type === 'text') { // Streaming text delta - const delta = props.delta || ''; + const delta = (props as { delta?: string }).delta || ''; if (delta) { answer += delta; options.onEvent?.({ type: 'text_delta', text: delta }); } - } else if (part.type === 'tool') { - const status = part.state?.status; - const callID = part.callID || part.callId; - const toolName = part.tool || 'unknown'; + } else if (partAny.type === 'tool') { + const status = partAny.state?.status || ''; + 
const callID = partAny.callID || partAny.callId || ''; + const toolName: string = (partAny.tool as string) || 'unknown'; + if (!toolName) continue; if (status === 'running' || status === 'pending') { // Only add if not already tracked @@ -237,7 +243,7 @@ export class OpencodeAgent implements AgentWrapper { const toolCall: ToolCall = { id: callID, name: toolName, - input: part.state?.input || {}, + input: (partAny.state?.input || {}) as Record, timestamp: Date.now(), }; toolCalls.push(toolCall); @@ -247,26 +253,26 @@ export class OpencodeAgent implements AgentWrapper { } else if (status === 'completed') { const existing = toolCalls.find((t) => t.id === callID); if (existing) { - existing.durationMs = part.state?.time - ? (part.state.time.end - part.state.time.start) * 1000 + existing.durationMs = partAny.state?.time?.end && partAny.state.time?.start + ? (partAny.state.time.end - partAny.state.time.start) * 1000 : Date.now() - existing.timestamp; existing.success = true; - existing.result = part.state?.output - ? String(part.state.output).substring(0, 500) + existing.result = partAny.state?.output + ? String(partAny.state.output).substring(0, 500) : undefined; } else { // Tool completed without a prior start event (can happen if subscription started late) toolCalls.push({ id: callID, name: toolName, - input: part.state?.input || {}, + input: (partAny.state?.input || {}) as Record, timestamp: Date.now(), - durationMs: part.state?.time - ? (part.state.time.end - part.state.time.start) * 1000 + durationMs: partAny.state?.time?.end && partAny.state.time?.start + ? (partAny.state.time.end - partAny.state.time.start) * 1000 : 0, success: true, - result: part.state?.output - ? String(part.state.output).substring(0, 500) + result: partAny.state?.output + ? 
String(partAny.state.output).substring(0, 500) : undefined, }); } @@ -289,29 +295,32 @@ export class OpencodeAgent implements AgentWrapper { durationMs: existing?.durationMs || 0, }); } - } else if (part.type === 'reasoning') { - const text = props.delta || part.text || ''; + } else if (partAny.type === 'reasoning') { + const text = (props as { delta?: string }).delta || partAny.text || ''; + if (!text) continue; if (text) { options.onEvent?.({ type: 'thinking', text }); } - } else if (part.type === 'step-finish') { + } else if (partAny.type === 'step-finish') { numTurns++; // Accumulate per-step tokens/cost - if (part.tokens) { - totalTokens.input += part.tokens.input || 0; - totalTokens.output += part.tokens.output || 0; - totalTokens.cacheRead += part.tokens.cache?.read || 0; - totalTokens.cacheWrite += part.tokens.cache?.write || 0; - totalTokens.total += part.tokens.total || 0; + const partTyped = partAny as { tokens?: { input?: number; output?: number; cache?: { read?: number; write?: number }; total?: number }; cost?: number }; + if (partTyped.tokens) { + totalTokens.input += partTyped.tokens.input || 0; + totalTokens.output += partTyped.tokens.output || 0; + totalTokens.cacheRead += partTyped.tokens.cache?.read || 0; + totalTokens.cacheWrite += partTyped.tokens.cache?.write || 0; + totalTokens.total += partTyped.tokens.total || 0; } - if (part.cost) { - totalCost += part.cost; + if (partTyped.cost) { + totalCost += partTyped.cost; } } } else if (eventType === 'message.updated') { // A full message update — extract final info from here - const props = event.properties || event.data; - const info = props?.info; + const eventAny = event as { properties?: unknown; data?: unknown }; + const props = (eventAny.properties || eventAny.data) as { parts?: unknown[] } & Record; + const info = props as { providerID?: string; modelID?: string; tokens?: { input?: number; output?: number; cache?: { read?: number; write?: number }; total?: number }; cost?: number } | 
undefined; if (info?.providerID && info?.modelID) { model = `${info.providerID}/${info.modelID}`; } @@ -329,16 +338,17 @@ export class OpencodeAgent implements AgentWrapper { totalCost = info.cost; } // Extract final answer text from message parts if we haven't captured it via deltas - if (props?.parts && !answer) { - for (const p of props.parts) { - if (p.type === 'text' && p.text) { - answer += p.text; + if (props && (props as { parts?: unknown[] } & Record).parts) { + for (const p of (props as { parts?: unknown[] | null | undefined }).parts ?? []) { + if ((p as { type?: string; text?: string }).type === 'text' && (p as { type?: string; text?: string }).text) { + answer += (p as { type?: string; text?: string }).text; } } } } else if (eventType === 'session.status') { - const props = event.properties || event.data; - const status = props?.status; + const eventAny = event as { properties?: unknown; data?: unknown }; + const props = (eventAny.properties || eventAny.data) as { parts?: unknown[] } & Record; + const status = props as { type?: string; attempt?: number; message?: string } | undefined; if (status?.type === 'idle') { // Agent finished processing options.onEvent?.({ type: 'status', message: 'Session idle — agent finished' }); @@ -352,8 +362,9 @@ export class OpencodeAgent implements AgentWrapper { }); } } else if (eventType === 'session.error') { - const props = event.properties || event.data; - const errMsg = props?.error?.message || JSON.stringify(props?.error) || 'Unknown error'; + const eventAny = event as { properties?: unknown; data?: unknown }; + const props = (eventAny.properties || eventAny.data) as { parts?: unknown[] } & Record; + const errMsg = (props as { error?: { message?: string } | undefined })?.error?.message || JSON.stringify(props) || 'Unknown error'; options.onEvent?.({ type: 'error', message: errMsg, code: 'SESSION_ERROR' }); } } @@ -364,14 +375,14 @@ export class OpencodeAgent implements AgentWrapper { path: { id: sessionId }, }); if 
(messagesResult.data) { - const messages = messagesResult.data as any[]; + const messages = messagesResult.data as { role?: string; parts?: unknown[] }[]; // Find the last assistant message for (let i = messages.length - 1; i >= 0; i--) { - const msg = messages[i]; - if (msg.role === 'assistant' && msg.parts) { + const msg = messages[i] as { role?: string; parts?: unknown[] }; + if ((msg as { role?: string }).role === 'assistant' && msg.parts) { for (const p of msg.parts) { - if (p.type === 'text' && p.text) { - answer += p.text; + if ((p as { type?: string; text?: string }).type === 'text' && (p as { type?: string; text?: string }).text) { + answer += (p as { type?: string; text?: string }).text; } } break; @@ -416,7 +427,7 @@ export class OpencodeAgent implements AgentWrapper { options.onEvent?.({ type: 'complete', result: errorResult }); return errorResult; } finally { - serverProc?.kill(); + _serverProc?.kill(); } } } diff --git a/src/agents/opencode.ts.bak b/src/agents/opencode.ts.bak new file mode 100644 index 0000000..ebb50ad --- /dev/null +++ b/src/agents/opencode.ts.bak @@ -0,0 +1,437 @@ +/** + * Opencode agent wrapper using SDK + * + * Uses @opencode-ai/sdk for programmatic interaction with opencode. + * Spawns the opencode server with the correct working directory so + * the agent operates on the test case files. 
+ */ + +import { spawn, ChildProcess } from 'child_process'; +import { + AgentWrapper, + AgentResult, + AgentRunOptions, + ToolCall, + emptyAgentResult, +} from './types.js'; + +// Import SDK client dynamically since it's ESM-only +let _createOpencodeClient: (() => unknown) | undefined; // SDK type not fully defined +const loadSDK = async () => { + if (!_createOpencodeClient) { + const sdkWrapper = await import('./opencode-sdk.mjs'); + _createOpencodeClient = sdkWrapper.createOpencodeClient; + } + return _createOpencodeClient; +}; + +// Port counter to avoid collisions between concurrent runs +let nextPort = 4097; + +/** + * Spawn an opencode server process with the given working directory. + * Returns the server URL and a close function. + */ +async function spawnServer( + cwd: string, + config: Record, + timeoutMs: number, +): Promise<{ url: string; proc: ChildProcess }> { + const port = nextPort++; + const proc = spawn('opencode', ['serve', `--hostname=127.0.0.1`, `--port=${port}`], { + cwd, + env: { + ...process.env, + OPENCODE_CONFIG_CONTENT: JSON.stringify(config), + }, + }); + + const _url = await new Promise((resolve, reject) => { // eslint-disable-line @typescript-eslint/no-unused-vars + const id = setTimeout(() => { + proc.kill(); + reject(new Error(`Timeout waiting for opencode server after ${timeoutMs}ms`)); + }, timeoutMs); + + let output = ''; + proc.stdout?.on('data', (chunk: Buffer) => { + output += chunk.toString(); + for (const line of output.split('\n')) { + if (line.startsWith('opencode server listening')) { + const match = line.match(/on\s+(https?:\/\/[^\s]+)/); + if (match) { + clearTimeout(id); + resolve(match[1]); + return; + } + } + } + }); + proc.stderr?.on('data', (chunk: Buffer) => { + output += chunk.toString(); + }); + proc.on('exit', (code) => { + clearTimeout(id); + reject(new Error(`Server exited with code ${code}: ${output}`)); + }); + proc.on('error', (err) => { + clearTimeout(id); + reject(err); + }); + }); + + return { url: 
_url, proc }; +} + +/** + * Opencode agent wrapper using SDK + */ +export class OpencodeAgent implements AgentWrapper { + name = 'opencode'; + displayName = 'Opencode'; + + private cliPath: string; + private config: Record; + + constructor(cliPath: string = 'opencode', config?: Record) { + this.cliPath = cliPath; + this.config = config || { + model: 'local-glm/glm-4.7-local-4bit', + provider: { + 'local-glm': { + api: 'openai', + options: { + baseURL: 'http://127.0.0.1:8081/v1', + apiKey: 'local-glm-key', + }, + models: { + 'glm-4.7-local-4bit': { + name: 'GLM-4.7 Local (4-bit)', + id: '/Users/studio/models/GLM-4.7-4bit', + reasoning: false, + tool_call: true, + temperature: true, + limit: { context: 32768, output: 4096 }, + cost: { input: 0, output: 0 }, + modalities: { input: ['text'], output: ['text'] }, + }, + }, + }, + }, + }; + } + + async isAvailable(): Promise { + try { + const version = await this.getVersion(); + return version !== null; + } catch { + return false; + } + } + + async getVersion(): Promise { + return new Promise((resolve) => { + const proc = spawn(this.cliPath, ['--version'], { timeout: 5000 }); + let stdout = ''; + proc.stdout?.on('data', (data: Buffer) => { + stdout += data.toString(); + }); + proc.on('close', (code: number | null) => { + resolve(code === 0 && stdout.trim() ? stdout.trim() : null); + }); + proc.on('error', () => resolve(null)); + }); + } + + async run(prompt: string, options: AgentRunOptions): Promise { + const runStartTime = Date.now(); + const timeoutMs = options.timeoutMs || 300000; + const toolCalls: ToolCall[] = []; + let model = 'unknown'; + let sessionId = ''; + let _serverProc: ChildProcess | null = null; + + try { + // Spawn server in the case's working directory + const cwd = options.cwd || process.cwd(); + const config = options.model + ? 
{ ...this.config, model: options.model } + : this.config; + const { url, proc } = await spawnServer(cwd, config, 15000); + _serverProc = proc; + + const createClient = await loadSDK(); + if (!createClient) throw new Error("Failed to load SDK"); + const client = createClient(); + + const createResult = await client.session.create({}); + if (createResult.error) { + throw new Error(`Failed to create session: ${JSON.stringify(createResult.error)}`); + } + + const session = createResult.data; + sessionId = session.id; + model = options.model || session.version || 'unknown'; + + options.onEvent?.({ type: 'start', timestamp: runStartTime, model }); + + // Subscribe to SSE events BEFORE sending the prompt so we capture everything + // event.subscribe() returns ServerSentEventsResult directly (not { data, error }) + const sseResult = await client.event.subscribe({}) as unknown; + const stream: AsyncIterable | undefined = + (sseResult as { stream?: AsyncIterable; data?: { stream?: AsyncIterable } })?.stream || + (sseResult as { data?: { stream?: AsyncIterable } })?.data?.stream || + (sseResult as { data?: AsyncIterable })?.data; + + if (!stream) { + throw new Error( + `Event stream not available — subscribe() returned: ${JSON.stringify(Object.keys(sseResult || {}))}`, + ); + } + + // Send prompt asynchronously (returns immediately, events stream the progress) + const asyncResult = await client.session.promptAsync({ + path: { id: sessionId }, + body: { + parts: [{ type: 'text', text: prompt }], + }, + }); + + if (asyncResult.error) { + throw new Error(`Prompt failed: ${JSON.stringify(asyncResult.error)}`); + } + + // Process SSE events until the session goes idle or we time out + let answer = ''; + let numTurns = 0; + let totalTokens = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 }; + let totalCost: number = 0; + const deadline = Date.now() + timeoutMs - 5000; + + for await (const event of stream) { + if (Date.now() > deadline) { + options.onEvent?.({ type: 
'status', message: 'Timed out waiting for agent' }); + break; + } + + const eventType = (event as { type?: string; event?: string })?.type ?? (event as { type?: string; event?: string })?.event ?? ''; + + if (eventType === 'message.part.updated') { + const eventAny = event as { properties?: unknown; data?: unknown }; + const props = eventAny.properties || eventAny.data || {}; + if (!props) continue; + const part = (props as { part?: unknown }).part || ({} as any); + if (!part) continue; + + const partAny = part as { type?: string; text?: string; state?: { status?: string; input?: unknown; time?: { start?: number; end?: number }; output?: unknown }; callID?: string; callId?: string; tool?: string; tokens?: { input?: number; output?: number; cache?: { read?: number; write?: number }; total?: number }; cost?: number }; + if (partAny.type === 'text') { + // Streaming text delta + const delta = (props as { delta?: string }).delta || ''; + if (delta) { + answer += delta; + options.onEvent?.({ type: 'text_delta', text: delta }); + } + } else if (partAny.type === 'tool') { + const status = partAny.state?.status || ''; + const callID = partAny.callID || partAny.callId || ''; + const toolName: string = (partAny.tool as string) || 'unknown'; + if (!toolName) continue; + + if (status === 'running' || status === 'pending') { + // Only add if not already tracked + if (!toolCalls.find((t) => t.id === callID)) { + const toolCall: ToolCall = { + id: callID, + name: toolName, + input: (partAny.state?.input || {}) as Record, + timestamp: Date.now(), + }; + toolCalls.push(toolCall); + options.onEvent?.({ type: 'tool_start', tool: toolCall }); + options.onEvent?.({ type: 'status', message: `Tool: ${toolName}` }); + } + } else if (status === 'completed') { + const existing = toolCalls.find((t) => t.id === callID); + if (existing) { + existing.durationMs = partAny.state?.time?.end && partAny.state.time?.start + ? 
(partAny.state.time.end - partAny.state.time.start) * 1000 + : Date.now() - existing.timestamp; + existing.success = true; + existing.result = partAny.state?.output + ? String(partAny.state.output).substring(0, 500) + : undefined; + } else { + // Tool completed without a prior start event (can happen if subscription started late) + toolCalls.push({ + id: callID, + name: toolName, + input: (partAny.state?.input || {}) as Record, + timestamp: Date.now(), + durationMs: partAny.state?.time?.end && partAny.state.time?.start + ? (partAny.state.time.end - partAny.state.time.start) * 1000 + : 0, + success: true, + result: partAny.state?.output + ? String(partAny.state.output).substring(0, 500) + : undefined, + }); + } + options.onEvent?.({ + type: 'tool_end', + toolId: callID, + success: true, + durationMs: toolCalls.find((t) => t.id === callID)?.durationMs || 0, + }); + } else if (status === 'error') { + const existing = toolCalls.find((t) => t.id === callID); + if (existing) { + existing.success = false; + existing.durationMs = Date.now() - existing.timestamp; + } + options.onEvent?.({ + type: 'tool_end', + toolId: callID, + success: false, + durationMs: existing?.durationMs || 0, + }); + } + } else if (partAny.type === 'reasoning') { + const text = (props as { delta?: string }).delta || partAny.text || ''; + if (!text) continue; + if (text) { + options.onEvent?.({ type: 'thinking', text }); + } + } else if (partAny.type === 'step-finish') { + numTurns++; + // Accumulate per-step tokens/cost + const partTyped = partAny as { tokens?: { input?: number; output?: number; cache?: { read?: number; write?: number }; total?: number }; cost?: number }; + if (partTyped.tokens) { + totalTokens.input += partTyped.tokens.input || 0; + totalTokens.output += partTyped.tokens.output || 0; + totalTokens.cacheRead += partTyped.tokens.cache?.read || 0; + totalTokens.cacheWrite += partTyped.tokens.cache?.write || 0; + totalTokens.total += partTyped.tokens.total || 0; + } + if 
(partTyped.cost) { + totalCost += partTyped.cost; + } + } + } else if (eventType === 'message.updated') { + // A full message update — extract final info from here + const eventAny = event as { properties?: unknown; data?: unknown }; + const props = (eventAny.properties || eventAny.data) as { parts?: unknown[] } & Record; + const info = props as { providerID?: string; modelID?: string; tokens?: { input?: number; output?: number; cache?: { read?: number; write?: number }; total?: number }; cost?: number } | undefined; + if (info?.providerID && info?.modelID) { + model = `${info.providerID}/${info.modelID}`; + } + // Use message-level tokens as authoritative total if available + if (info?.tokens?.total) { + totalTokens = { + input: info.tokens.input || totalTokens.input, + output: info.tokens.output || totalTokens.output, + cacheRead: info.tokens.cache?.read || totalTokens.cacheRead, + cacheWrite: info.tokens.cache?.write || totalTokens.cacheWrite, + total: info.tokens.total, + }; + } + if (info?.cost !== undefined) { + totalCost = info.cost; + } + // Extract final answer text from message parts if we haven't captured it via deltas + if (props && (props as { parts?: unknown[] } & Record).parts) { + for (const p of (props as { parts?: unknown[] | null | undefined }).parts ?? 
[]) { + if ((p as { type?: string; text?: string }).type === 'text' && (p as { type?: string; text?: string }).text) { + answer += (p as { type?: string; text?: string }).text; + } + } + } + } else if (eventType === 'session.status') { + const eventAny = event as { properties?: unknown; data?: unknown }; + const props = (eventAny.properties || eventAny.data) as { parts?: unknown[] } & Record; + const status = props as { type?: string; attempt?: number; message?: string } | undefined; + if (status?.type === 'idle') { + // Agent finished processing + options.onEvent?.({ type: 'status', message: 'Session idle — agent finished' }); + break; + } else if (status?.type === 'busy') { + options.onEvent?.({ type: 'status', message: 'Agent working...' }); + } else if (status?.type === 'retry') { + options.onEvent?.({ + type: 'status', + message: `Retrying (attempt ${status.attempt}): ${status.message}`, + }); + } + } else if (eventType === 'session.error') { + const eventAny = event as { properties?: unknown; data?: unknown }; + const props = (eventAny.properties || eventAny.data) as { parts?: unknown[] } & Record; + const errMsg = (props as { error?: { message?: string } | undefined })?.error?.message || JSON.stringify(props) || 'Unknown error'; + options.onEvent?.({ type: 'error', message: errMsg, code: 'SESSION_ERROR' }); + } + } + + // If answer is still empty, fetch the final messages from the session + if (!answer) { + const messagesResult = await client.session.messages({ + path: { id: sessionId }, + }); + if (messagesResult.data) { + const messages = messagesResult.data as { role?: string; parts?: unknown[] }[]; + // Find the last assistant message + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i] as { role?: string; parts?: unknown[] }; + if ((msg as { role?: string }).role === 'assistant' && msg.parts) { + for (const p of msg.parts) { + if ((p as { type?: string; text?: string }).type === 'text' && (p as { type?: string; text?: string 
}).text) { + answer += (p as { type?: string; text?: string }).text; + } + } + break; + } + } + } + } + + const result: AgentResult = { + answer, + success: true, + timedOut: Date.now() > deadline, + durationMs: Date.now() - runStartTime, + tokens: { + inputTokens: totalTokens.input, + outputTokens: totalTokens.output, + cacheReadTokens: totalTokens.cacheRead, + cacheWriteTokens: totalTokens.cacheWrite, + totalTokens: totalTokens.total, + }, + costUsd: totalCost, + numTurns: numTurns || 1, + toolCalls, + toolsUsed: [...new Set(toolCalls.map((t) => t.name))], + model, + raw: { sessionId }, + }; + + options.onEvent?.({ type: 'complete', result }); + return result; + } catch (error) { + const errorMessage = error instanceof Error ? error.message : String(error); + + options.onEvent?.({ type: 'error', message: errorMessage, code: 'ERROR' }); + + const errorResult = emptyAgentResult(errorMessage); + errorResult.durationMs = Date.now() - runStartTime; + errorResult.toolCalls = toolCalls; + errorResult.toolsUsed = [...new Set(toolCalls.map((t) => t.name))]; + errorResult.model = model; + + options.onEvent?.({ type: 'complete', result: errorResult }); + return errorResult; + } finally { + _serverProc?.kill(); + } + } +} + +export function createOpencodeAgent(cliPath?: string): OpencodeAgent { + return new OpencodeAgent(cliPath); +} diff --git a/src/agents/opencode.ts.bak2 b/src/agents/opencode.ts.bak2 new file mode 100644 index 0000000..ebb50ad --- /dev/null +++ b/src/agents/opencode.ts.bak2 @@ -0,0 +1,437 @@ +/** + * Opencode agent wrapper using SDK + * + * Uses @opencode-ai/sdk for programmatic interaction with opencode. + * Spawns the opencode server with the correct working directory so + * the agent operates on the test case files. 
+ */ + +import { spawn, ChildProcess } from 'child_process'; +import { + AgentWrapper, + AgentResult, + AgentRunOptions, + ToolCall, + emptyAgentResult, +} from './types.js'; + +// Import SDK client dynamically since it's ESM-only +let _createOpencodeClient: (() => unknown) | undefined; // SDK type not fully defined +const loadSDK = async () => { + if (!_createOpencodeClient) { + const sdkWrapper = await import('./opencode-sdk.mjs'); + _createOpencodeClient = sdkWrapper.createOpencodeClient; + } + return _createOpencodeClient; +}; + +// Port counter to avoid collisions between concurrent runs +let nextPort = 4097; + +/** + * Spawn an opencode server process with the given working directory. + * Returns the server URL and a close function. + */ +async function spawnServer( + cwd: string, + config: Record, + timeoutMs: number, +): Promise<{ url: string; proc: ChildProcess }> { + const port = nextPort++; + const proc = spawn('opencode', ['serve', `--hostname=127.0.0.1`, `--port=${port}`], { + cwd, + env: { + ...process.env, + OPENCODE_CONFIG_CONTENT: JSON.stringify(config), + }, + }); + + const _url = await new Promise((resolve, reject) => { // eslint-disable-line @typescript-eslint/no-unused-vars + const id = setTimeout(() => { + proc.kill(); + reject(new Error(`Timeout waiting for opencode server after ${timeoutMs}ms`)); + }, timeoutMs); + + let output = ''; + proc.stdout?.on('data', (chunk: Buffer) => { + output += chunk.toString(); + for (const line of output.split('\n')) { + if (line.startsWith('opencode server listening')) { + const match = line.match(/on\s+(https?:\/\/[^\s]+)/); + if (match) { + clearTimeout(id); + resolve(match[1]); + return; + } + } + } + }); + proc.stderr?.on('data', (chunk: Buffer) => { + output += chunk.toString(); + }); + proc.on('exit', (code) => { + clearTimeout(id); + reject(new Error(`Server exited with code ${code}: ${output}`)); + }); + proc.on('error', (err) => { + clearTimeout(id); + reject(err); + }); + }); + + return { url: 
_url, proc }; +} + +/** + * Opencode agent wrapper using SDK + */ +export class OpencodeAgent implements AgentWrapper { + name = 'opencode'; + displayName = 'Opencode'; + + private cliPath: string; + private config: Record; + + constructor(cliPath: string = 'opencode', config?: Record) { + this.cliPath = cliPath; + this.config = config || { + model: 'local-glm/glm-4.7-local-4bit', + provider: { + 'local-glm': { + api: 'openai', + options: { + baseURL: 'http://127.0.0.1:8081/v1', + apiKey: 'local-glm-key', + }, + models: { + 'glm-4.7-local-4bit': { + name: 'GLM-4.7 Local (4-bit)', + id: '/Users/studio/models/GLM-4.7-4bit', + reasoning: false, + tool_call: true, + temperature: true, + limit: { context: 32768, output: 4096 }, + cost: { input: 0, output: 0 }, + modalities: { input: ['text'], output: ['text'] }, + }, + }, + }, + }, + }; + } + + async isAvailable(): Promise { + try { + const version = await this.getVersion(); + return version !== null; + } catch { + return false; + } + } + + async getVersion(): Promise { + return new Promise((resolve) => { + const proc = spawn(this.cliPath, ['--version'], { timeout: 5000 }); + let stdout = ''; + proc.stdout?.on('data', (data: Buffer) => { + stdout += data.toString(); + }); + proc.on('close', (code: number | null) => { + resolve(code === 0 && stdout.trim() ? stdout.trim() : null); + }); + proc.on('error', () => resolve(null)); + }); + } + + async run(prompt: string, options: AgentRunOptions): Promise { + const runStartTime = Date.now(); + const timeoutMs = options.timeoutMs || 300000; + const toolCalls: ToolCall[] = []; + let model = 'unknown'; + let sessionId = ''; + let _serverProc: ChildProcess | null = null; + + try { + // Spawn server in the case's working directory + const cwd = options.cwd || process.cwd(); + const config = options.model + ? 
{ ...this.config, model: options.model } + : this.config; + const { url, proc } = await spawnServer(cwd, config, 15000); + _serverProc = proc; + + const createClient = await loadSDK(); + if (!createClient) throw new Error("Failed to load SDK"); + const client = createClient(); + + const createResult = await client.session.create({}); + if (createResult.error) { + throw new Error(`Failed to create session: ${JSON.stringify(createResult.error)}`); + } + + const session = createResult.data; + sessionId = session.id; + model = options.model || session.version || 'unknown'; + + options.onEvent?.({ type: 'start', timestamp: runStartTime, model }); + + // Subscribe to SSE events BEFORE sending the prompt so we capture everything + // event.subscribe() returns ServerSentEventsResult directly (not { data, error }) + const sseResult = await client.event.subscribe({}) as unknown; + const stream: AsyncIterable | undefined = + (sseResult as { stream?: AsyncIterable; data?: { stream?: AsyncIterable } })?.stream || + (sseResult as { data?: { stream?: AsyncIterable } })?.data?.stream || + (sseResult as { data?: AsyncIterable })?.data; + + if (!stream) { + throw new Error( + `Event stream not available — subscribe() returned: ${JSON.stringify(Object.keys(sseResult || {}))}`, + ); + } + + // Send prompt asynchronously (returns immediately, events stream the progress) + const asyncResult = await client.session.promptAsync({ + path: { id: sessionId }, + body: { + parts: [{ type: 'text', text: prompt }], + }, + }); + + if (asyncResult.error) { + throw new Error(`Prompt failed: ${JSON.stringify(asyncResult.error)}`); + } + + // Process SSE events until the session goes idle or we time out + let answer = ''; + let numTurns = 0; + let totalTokens = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 }; + let totalCost: number = 0; + const deadline = Date.now() + timeoutMs - 5000; + + for await (const event of stream) { + if (Date.now() > deadline) { + options.onEvent?.({ type: 
'status', message: 'Timed out waiting for agent' }); + break; + } + + const eventType = (event as { type?: string; event?: string })?.type ?? (event as { type?: string; event?: string })?.event ?? ''; + + if (eventType === 'message.part.updated') { + const eventAny = event as { properties?: unknown; data?: unknown }; + const props = eventAny.properties || eventAny.data || {}; + if (!props) continue; + const part = (props as { part?: unknown }).part || ({} as any); + if (!part) continue; + + const partAny = part as { type?: string; text?: string; state?: { status?: string; input?: unknown; time?: { start?: number; end?: number }; output?: unknown }; callID?: string; callId?: string; tool?: string; tokens?: { input?: number; output?: number; cache?: { read?: number; write?: number }; total?: number }; cost?: number }; + if (partAny.type === 'text') { + // Streaming text delta + const delta = (props as { delta?: string }).delta || ''; + if (delta) { + answer += delta; + options.onEvent?.({ type: 'text_delta', text: delta }); + } + } else if (partAny.type === 'tool') { + const status = partAny.state?.status || ''; + const callID = partAny.callID || partAny.callId || ''; + const toolName: string = (partAny.tool as string) || 'unknown'; + if (!toolName) continue; + + if (status === 'running' || status === 'pending') { + // Only add if not already tracked + if (!toolCalls.find((t) => t.id === callID)) { + const toolCall: ToolCall = { + id: callID, + name: toolName, + input: (partAny.state?.input || {}) as Record, + timestamp: Date.now(), + }; + toolCalls.push(toolCall); + options.onEvent?.({ type: 'tool_start', tool: toolCall }); + options.onEvent?.({ type: 'status', message: `Tool: ${toolName}` }); + } + } else if (status === 'completed') { + const existing = toolCalls.find((t) => t.id === callID); + if (existing) { + existing.durationMs = partAny.state?.time?.end && partAny.state.time?.start + ? 
(partAny.state.time.end - partAny.state.time.start) * 1000 + : Date.now() - existing.timestamp; + existing.success = true; + existing.result = partAny.state?.output + ? String(partAny.state.output).substring(0, 500) + : undefined; + } else { + // Tool completed without a prior start event (can happen if subscription started late) + toolCalls.push({ + id: callID, + name: toolName, + input: (partAny.state?.input || {}) as Record, + timestamp: Date.now(), + durationMs: partAny.state?.time?.end && partAny.state.time?.start + ? (partAny.state.time.end - partAny.state.time.start) * 1000 + : 0, + success: true, + result: partAny.state?.output + ? String(partAny.state.output).substring(0, 500) + : undefined, + }); + } + options.onEvent?.({ + type: 'tool_end', + toolId: callID, + success: true, + durationMs: toolCalls.find((t) => t.id === callID)?.durationMs || 0, + }); + } else if (status === 'error') { + const existing = toolCalls.find((t) => t.id === callID); + if (existing) { + existing.success = false; + existing.durationMs = Date.now() - existing.timestamp; + } + options.onEvent?.({ + type: 'tool_end', + toolId: callID, + success: false, + durationMs: existing?.durationMs || 0, + }); + } + } else if (partAny.type === 'reasoning') { + const text = (props as { delta?: string }).delta || partAny.text || ''; + if (!text) continue; + if (text) { + options.onEvent?.({ type: 'thinking', text }); + } + } else if (partAny.type === 'step-finish') { + numTurns++; + // Accumulate per-step tokens/cost + const partTyped = partAny as { tokens?: { input?: number; output?: number; cache?: { read?: number; write?: number }; total?: number }; cost?: number }; + if (partTyped.tokens) { + totalTokens.input += partTyped.tokens.input || 0; + totalTokens.output += partTyped.tokens.output || 0; + totalTokens.cacheRead += partTyped.tokens.cache?.read || 0; + totalTokens.cacheWrite += partTyped.tokens.cache?.write || 0; + totalTokens.total += partTyped.tokens.total || 0; + } + if 
(partTyped.cost) { + totalCost += partTyped.cost; + } + } + } else if (eventType === 'message.updated') { + // A full message update — extract final info from here + const eventAny = event as { properties?: unknown; data?: unknown }; + const props = (eventAny.properties || eventAny.data) as { parts?: unknown[] } & Record; + const info = props as { providerID?: string; modelID?: string; tokens?: { input?: number; output?: number; cache?: { read?: number; write?: number }; total?: number }; cost?: number } | undefined; + if (info?.providerID && info?.modelID) { + model = `${info.providerID}/${info.modelID}`; + } + // Use message-level tokens as authoritative total if available + if (info?.tokens?.total) { + totalTokens = { + input: info.tokens.input || totalTokens.input, + output: info.tokens.output || totalTokens.output, + cacheRead: info.tokens.cache?.read || totalTokens.cacheRead, + cacheWrite: info.tokens.cache?.write || totalTokens.cacheWrite, + total: info.tokens.total, + }; + } + if (info?.cost !== undefined) { + totalCost = info.cost; + } + // Extract final answer text from message parts if we haven't captured it via deltas + if (props && (props as { parts?: unknown[] } & Record).parts) { + for (const p of (props as { parts?: unknown[] | null | undefined }).parts ?? 
[]) { + if ((p as { type?: string; text?: string }).type === 'text' && (p as { type?: string; text?: string }).text) { + answer += (p as { type?: string; text?: string }).text; + } + } + } + } else if (eventType === 'session.status') { + const eventAny = event as { properties?: unknown; data?: unknown }; + const props = (eventAny.properties || eventAny.data) as { parts?: unknown[] } & Record; + const status = props as { type?: string; attempt?: number; message?: string } | undefined; + if (status?.type === 'idle') { + // Agent finished processing + options.onEvent?.({ type: 'status', message: 'Session idle — agent finished' }); + break; + } else if (status?.type === 'busy') { + options.onEvent?.({ type: 'status', message: 'Agent working...' }); + } else if (status?.type === 'retry') { + options.onEvent?.({ + type: 'status', + message: `Retrying (attempt ${status.attempt}): ${status.message}`, + }); + } + } else if (eventType === 'session.error') { + const eventAny = event as { properties?: unknown; data?: unknown }; + const props = (eventAny.properties || eventAny.data) as { parts?: unknown[] } & Record; + const errMsg = (props as { error?: { message?: string } | undefined })?.error?.message || JSON.stringify(props) || 'Unknown error'; + options.onEvent?.({ type: 'error', message: errMsg, code: 'SESSION_ERROR' }); + } + } + + // If answer is still empty, fetch the final messages from the session + if (!answer) { + const messagesResult = await client.session.messages({ + path: { id: sessionId }, + }); + if (messagesResult.data) { + const messages = messagesResult.data as { role?: string; parts?: unknown[] }[]; + // Find the last assistant message + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i] as { role?: string; parts?: unknown[] }; + if ((msg as { role?: string }).role === 'assistant' && msg.parts) { + for (const p of msg.parts) { + if ((p as { type?: string; text?: string }).type === 'text' && (p as { type?: string; text?: string 
}).text) { + answer += (p as { type?: string; text?: string }).text; + } + } + break; + } + } + } + } + + const result: AgentResult = { + answer, + success: true, + timedOut: Date.now() > deadline, + durationMs: Date.now() - runStartTime, + tokens: { + inputTokens: totalTokens.input, + outputTokens: totalTokens.output, + cacheReadTokens: totalTokens.cacheRead, + cacheWriteTokens: totalTokens.cacheWrite, + totalTokens: totalTokens.total, + }, + costUsd: totalCost, + numTurns: numTurns || 1, + toolCalls, + toolsUsed: [...new Set(toolCalls.map((t) => t.name))], + model, + raw: { sessionId }, + }; + + options.onEvent?.({ type: 'complete', result }); + return result; + } catch (error) { + const errorMessage = error instanceof Error ? error.message : String(error); + + options.onEvent?.({ type: 'error', message: errorMessage, code: 'ERROR' }); + + const errorResult = emptyAgentResult(errorMessage); + errorResult.durationMs = Date.now() - runStartTime; + errorResult.toolCalls = toolCalls; + errorResult.toolsUsed = [...new Set(toolCalls.map((t) => t.name))]; + errorResult.model = model; + + options.onEvent?.({ type: 'complete', result: errorResult }); + return errorResult; + } finally { + _serverProc?.kill(); + } + } +} + +export function createOpencodeAgent(cliPath?: string): OpencodeAgent { + return new OpencodeAgent(cliPath); +} diff --git a/src/cases/types.ts b/src/cases/types.ts index aaaf1fe..0b7ec4d 100644 --- a/src/cases/types.ts +++ b/src/cases/types.ts @@ -131,48 +131,67 @@ export type EvaluatorType = | 'benchmark' // Run command, extract numeric metric | 'diff' // Compare output to expected | 'llm_judge' // Use LLM to evaluate (subjective criteria) + | 'llm_judge_comparison' // Use LLM to compare two answers | 'agent_behavior'; // Evaluate agent behavior metrics - /** - * Base evaluator configuration + * A rubric criterion */ -export interface EvaluatorBase { - /** Type of evaluator */ - type: EvaluatorType; +export interface RubricCriterion { + /** Weight 
(0-100) */ + weight: number; + + /** Description of the criterion */ + description: string; - /** Human-readable name for this check */ - name?: string; + /** Evaluators for this criterion */ + evaluators: Evaluator[]; - /** Whether this evaluator is optional (won't fail if it errors) */ + /** Whether this criterion is optional */ optional?: boolean; - /** Whether to award partial credit (vs pass/fail) */ + /** Whether partial credit is allowed */ partialCredit?: boolean; - /** Threshold for passing (0.0-1.0, default 1.0) */ + /** Pass threshold (0-1) */ passThreshold?: number; } +/** + * Reference to a rubric (string ID or inline override) + */ +export interface RubricReference { + /** Base rubric ID to extend */ + extends: string; + + /** Criteria to override or add */ + criteria?: Record>; +} + +/** + * Base evaluator interface + */ +export interface EvaluatorBase { + /** Type of evaluator */ + type: EvaluatorType; + + /** Human-readable name */ + name: string; +} + /** * Command evaluator - runs a shell command */ export interface CommandEvaluator extends EvaluatorBase { type: 'command'; - + name: string; /** Command to run */ run: string; - - /** How to parse output (for partial credit) */ - parse?: 'exit_code' | 'json' | 'junit' | 'tap'; - - /** JSONPath expression to extract score (when parse=json) */ - scorePath?: string; - - /** Fail if this pattern is found in output */ - failIfMatch?: string; - - /** Fail if this pattern is NOT found in output */ - failIfNoMatch?: string; + /** Whether this evaluator is optional */ + optional?: boolean; + /** Whether partial credit is allowed */ + partialCredit?: boolean; + /** Pass threshold (0-1) */ + passThreshold?: number; } /** @@ -180,135 +199,80 @@ export interface CommandEvaluator extends EvaluatorBase { */ export interface PatternEvaluator extends EvaluatorBase { type: 'pattern'; - - /** Glob pattern for files to check */ + name: string; + /** Files to search */ files: string; - - /** Fail if this pattern 
matches */ - failIfMatch?: string; - - /** Fail if this pattern does NOT match */ - requireMatch?: string; - - /** Case-insensitive matching */ + /** Regex pattern to match */ + failIfMatch: string; + /** Whether to ignore case */ ignoreCase?: boolean; + /** Whether this evaluator is optional */ + optional?: boolean; + /** Whether partial credit is allowed */ + partialCredit?: boolean; } /** - * Benchmark evaluator - extract numeric metrics + * Benchmark evaluator - runs command and extracts numeric metric */ export interface BenchmarkEvaluator extends EvaluatorBase { type: 'benchmark'; - + name: string; /** Command to run */ run: string; - - /** Name of the metric being measured */ - metric: string; - - /** JSONPath to extract value (if output is JSON) */ - valuePath?: string; - - /** Regex to extract value from output */ - valuePattern?: string; - - /** Minimum acceptable value */ - minValue?: number; - - /** Maximum acceptable value */ - maxValue?: number; - - /** Target value (for partial credit calculation) */ - targetValue?: number; + /** Regex to extract metric */ + extract: string; + /** Whether this evaluator is optional */ + optional?: boolean; + /** Whether partial credit is allowed */ + partialCredit?: boolean; } /** - * Diff evaluator - compare output to expected + * Diff evaluator - compares output to expected */ export interface DiffEvaluator extends EvaluatorBase { type: 'diff'; - - /** Command that produces actual output */ - run: string; - - /** Expected output (inline) */ - expected?: string; - - /** Path to file with expected output */ - expectedFile?: string; - - /** Ignore whitespace differences */ - ignoreWhitespace?: boolean; - - /** Ignore case differences */ - ignoreCase?: boolean; + name: string; + /** Expected output */ + expected: string; + /** Whether this evaluator is optional */ + optional?: boolean; + /** Whether partial credit is allowed */ + partialCredit?: boolean; } /** - * LLM Judge evaluator - use AI to evaluate subjective 
criteria + * LLM judge evaluator - uses LLM to evaluate answers */ export interface LLMJudgeEvaluator extends EvaluatorBase { type: 'llm_judge'; - - /** What to evaluate */ + name: string; + /** Evaluation type */ evaluate: 'code_quality' | 'readability' | 'documentation' | 'custom'; - - /** Custom prompt for evaluation (when evaluate=custom) */ + /** Custom prompt for custom evaluation */ prompt?: string; - - /** Files to include in evaluation context */ - files?: string; - - /** Model to use (default: configured default) */ + /** Model to use for evaluation */ model?: string; } /** - * Agent behavior evaluator - measure how the agent worked + * Agent behavior evaluator - evaluates agent behavior metrics */ export interface AgentBehaviorEvaluator extends EvaluatorBase { type: 'agent_behavior'; - - /** Which metric to evaluate */ - metric: 'time' | 'tokens' | 'iterations' | 'tool_calls' | 'self_corrections'; - - /** Maximum acceptable value */ - maxValue?: number; - - /** Minimum acceptable value */ - minValue?: number; - - /** Target value (for partial credit) */ - targetValue?: number; + name: string; + /** Metrics to evaluate */ + metrics: string[]; } /** - * Union of all evaluator types + * Evaluator interface (union of all evaluator types) */ -export type Evaluator = - | CommandEvaluator - | PatternEvaluator - | BenchmarkEvaluator - | DiffEvaluator - | LLMJudgeEvaluator - | AgentBehaviorEvaluator; - -/** - * A criterion in a rubric (e.g., "correctness", "code_quality") - */ -export interface RubricCriterion { - /** Weight of this criterion (should sum to 100 across all criteria) */ - weight: number; - - /** Human-readable description */ - description?: string; - - /** Evaluators that contribute to this criterion's score */ - evaluators: Evaluator[]; -} +export type Evaluator = CommandEvaluator | PatternEvaluator | BenchmarkEvaluator | DiffEvaluator | LLMJudgeEvaluator | AgentBehaviorEvaluator; /** - * A rubric - defines how to grade an agent's response + * A 
rubric definition */ export interface Rubric { /** Unique identifier */ @@ -317,37 +281,15 @@ export interface Rubric { /** Human-readable name */ name: string; - /** Description of when to use this rubric */ - description?: string; - - /** Another rubric to extend (inherit criteria from) */ - extends?: string; + /** Description */ + description: string; - /** The grading criteria */ + /** Criteria for evaluation */ criteria: Record; - - // Metadata - /** Source file path (added by loader) */ - _sourcePath?: string; } /** - * Reference to a rubric with optional overrides - */ -export interface RubricReference { - /** ID of rubric to use as base */ - extends: string; - - /** Override specific criteria */ - criteria?: Record>; -} - -// ============================================================================= -// Result Types (What We Measured) -// ============================================================================= - -/** - * Result from a single evaluator + * Result of an evaluator run */ export interface EvaluatorResult { /** Name of the evaluator */ @@ -356,186 +298,458 @@ export interface EvaluatorResult { /** Type of evaluator */ type: EvaluatorType; - /** Score from 0.0 to 1.0 */ + /** Score (0-1) */ score: number; - /** Whether this evaluator passed (score >= threshold) */ + /** Whether the evaluator passed */ passed: boolean; - /** Evidence (stdout, stderr, or explanation) */ + /** Evidence/reasoning for the score */ evidence: string; - /** Evaluator-specific details */ + /** Additional details */ details?: Record; - /** Error message if evaluator failed to run */ - error?: string; - /** Duration in milliseconds */ durationMs: number; } /** - * Result for a single criterion + * Result of a criterion evaluation */ export interface CriterionResult { /** Name of the criterion */ name: string; - /** Weight of this criterion */ + /** Weight of the criterion */ weight: number; - /** Weighted score (score * weight / 100) */ - weightedScore: number; - - 
/** Raw score from 0.0 to 1.0 */ + /** Score (0-1) */ score: number; - /** Whether this criterion passed */ + /** Whether the criterion passed */ passed: boolean; - /** Results from individual evaluators */ - evaluatorResults: EvaluatorResult[]; + /** Evidence/reasoning */ + evidence: string; + + /** Duration in milliseconds */ + durationMs: number; } /** - * Agent behavior trace (captured during execution) + * Result of a case run */ -export interface AgentTrace { - /** Total execution time in ms */ - totalTimeMs: number; +export interface CaseResult { + /** Case ID */ + id: string; + + /** Case title */ + title: string; - /** Total tokens used (input + output) */ - totalTokens: number; + /** Overall score (0-1) */ + score: number; - /** Number of turns/iterations */ - iterations: number; + /** Whether the case passed */ + passed: boolean; - /** Tools that were called */ - toolsUsed: string[]; + /** Evidence/reasoning */ + evidence: string; - /** Number of self-corrections detected */ - selfCorrections: number; + /** Individual criterion results */ + criteria: CriterionResult[]; - /** Per-turn details */ - turns?: AgentTurn[]; + /** Individual evaluator results */ + evaluators: EvaluatorResult[]; + + /** Duration in milliseconds */ + durationMs: number; + + /** Error if any */ + error?: string; } /** - * A single turn in the agent's execution + * Result of a run (multiple cases) */ -export interface AgentTurn { - /** When this turn started */ +export interface RunResult { + /** Run ID */ + id: string; + + /** Timestamp */ timestamp: Date; - /** Tokens in (prompt) */ - tokensIn: number; + /** Cases that were run */ + cases: CaseResult[]; - /** Tokens out (response) */ - tokensOut: number; + /** Overall summary */ + summary: RunSummary; - /** Tools called in this turn */ - toolCalls: string[]; + /** Duration in milliseconds */ + durationMs: number; - /** Whether this turn was a self-correction */ - selfCorrection: boolean; + /** Error if any */ + error?: string; } 
/** - * Result from evaluating a single case + * Summary of a run */ +export interface RunSummary { + /** Number of cases run */ + total: number; + + /** Number of cases passed */ + passed: number; + + /** Number of cases failed */ + failed: number; + + /** Average score */ + averageScore: number; + + /** Total duration in milliseconds */ + totalDurationMs: number; +} + +// Fix missing properties in CaseResult export interface CaseResult { - /** Case that was evaluated */ - caseId: string; + /** Case ID */ + id: string; + + /** Case title */ + title: string; - /** Overall score from 0 to 100 */ + /** Overall score (0-1) */ score: number; - /** Whether the case passed (score >= pass threshold) */ + /** Whether the case passed */ passed: boolean; - /** Results for each criterion */ - criteriaResults: CriterionResult[]; + /** Evidence/reasoning */ + evidence: string; + + /** Individual criterion results */ + criteria: CriterionResult[]; + + /** Individual evaluator results */ + evaluators: EvaluatorResult[]; + + /** Duration in milliseconds */ + durationMs: number; - /** Agent behavior trace */ - agentTrace?: AgentTrace; + /** Error if any */ + error?: string; - /** The agent's text response */ + /** Agent response */ agentResponse?: string; - /** Tool calls the agent made */ - agentToolCalls?: { name: string; durationMs?: number; success?: boolean }[]; + /** Agent tool calls */ + agentToolCalls?: Array<{ + name: string; + durationMs: number; + success: boolean; + }>; - /** Model used */ + /** Agent model */ agentModel?: string; - /** Token usage */ - agentTokens?: { input: number; output: number; total: number }; + /** Agent tokens */ + agentTokens?: { + input: number; + output: number; + total: number; + }; + + /** Agent files */ + agentFiles?: Array<{ + path: string; + content: string; + changed: boolean; + }>; + + /** Whether the case timed out */ + timedOut?: boolean; + + /** Timestamp */ + timestamp?: Date; +} + +// Fix missing properties in RunResult +export 
interface RunResult { + /** Run ID */ + id: string; + + /** Timestamp */ + timestamp: Date; + + /** Cases that were run */ + cases: CaseResult[]; + + /** Overall summary */ + summary: RunSummary; + + /** Duration in milliseconds */ + durationMs: number; + + /** Error if any */ + error?: string; + + /** Run ID (alias for id) */ + runId?: string; - /** Files produced by the agent (snapshot of workspace after agent runs) */ - agentFiles?: { path: string; content: string; changed: boolean }[]; + /** Agent name */ + agent?: string; + + /** Rubric ID */ + rubricId?: string; + + /** Case results (alias for cases) */ + caseResults?: CaseResult[]; +} + +// Fix missing properties in RunSummary +export interface RunSummary { + /** Number of cases run */ + total: number; + + /** Number of cases passed */ + passed: number; + + /** Number of cases failed */ + failed: number; + + /** Number of cases skipped */ + skipped?: number; + + /** Number of cases timed out */ + timedOut?: number; + + /** Average score */ + averageScore: number; /** Total duration in milliseconds */ + totalDurationMs: number; +} + +// Fix missing properties in CriterionResult +export interface CriterionResult { + /** Name of the criterion */ + name: string; + + /** Weight of the criterion */ + weight: number; + + /** Score (0-1) */ + score: number; + + /** Whether the criterion passed */ + passed: boolean; + + /** Evidence/reasoning */ + evidence: string; + + /** Weighted score */ + weightedScore?: number; + + /** Duration in milliseconds */ durationMs: number; - /** Whether it timed out */ - timedOut: boolean; + /** Individual evaluator results */ + evaluatorResults?: EvaluatorResult[]; +} + +// Fix missing optional property in Evaluator +export interface EvaluatorBase { + /** Type of evaluator */ + type: EvaluatorType; + + /** Human-readable name */ + name: string; + + /** Whether this evaluator is optional */ + optional?: boolean; +} + +// Fix missing optional property in LLMJudgeEvaluator +export 
interface LLMJudgeEvaluator extends EvaluatorBase { + type: 'llm_judge'; + name: string; + /** Evaluation type */ + evaluate: 'code_quality' | 'readability' | 'documentation' | 'custom'; + /** Custom prompt for custom evaluation */ + prompt?: string; + /** Model to use for evaluation */ + model?: string; +} + +// Fix missing properties in CaseResult for CLI usage +export interface CaseResult { + /** Case ID */ + id: string; + + /** Case title */ + title: string; + + /** Overall score (0-1) */ + score: number; + + /** Whether the case passed */ + passed: boolean; + + /** Evidence/reasoning */ + evidence: string; + + /** Individual criterion results */ + criteria: CriterionResult[]; + + /** Individual evaluator results */ + evaluators: EvaluatorResult[]; + + /** Duration in milliseconds */ + durationMs: number; - /** Error if something went wrong */ + /** Error if any */ error?: string; - /** When this result was produced */ - timestamp: Date; + /** Agent response */ + agentResponse?: string; + + /** Agent tool calls */ + agentToolCalls?: Array<{ + name: string; + durationMs: number; + success: boolean; + }>; + + /** Agent model */ + agentModel?: string; + + /** Agent tokens */ + agentTokens?: { + input: number; + output: number; + total: number; + }; + + /** Agent files */ + agentFiles?: Array<{ + path: string; + content: string; + changed: boolean; + }>; + + /** Whether the case timed out */ + timedOut?: boolean; + + /** Timestamp */ + timestamp?: Date; } -/** - * Result from a full evaluation run - */ +// Fix missing properties in RunResult for CLI usage export interface RunResult { - /** Unique run identifier */ - runId: string; + /** Run ID */ + id: string; - /** When the run started */ - startedAt: Date; + /** Timestamp */ + timestamp: Date; - /** When the run completed */ - completedAt: Date; + /** Cases that were run */ + cases: CaseResult[]; - /** Agent that was evaluated */ - agent: string; + /** Overall summary */ + summary: RunSummary; - /** Rubric used 
*/ - rubricId: string; + /** Duration in milliseconds */ + durationMs: number; - /** Results for each case */ - caseResults: CaseResult[]; + /** Error if any */ + error?: string; - /** Summary statistics */ - summary: RunSummary; + /** Run ID (alias for id) */ + runId?: string; + + /** Agent name */ + agent?: string; + + /** Rubric ID */ + rubricId?: string; + + /** Case results (alias for cases) */ + caseResults?: CaseResult[]; } -/** - * Summary statistics for a run - */ +// Fix missing properties in RunSummary for CLI usage export interface RunSummary { - /** Total cases run */ + /** Number of cases run */ total: number; - /** Cases that passed */ + /** Number of cases passed */ passed: number; - /** Cases that failed */ + /** Number of cases failed */ failed: number; - /** Cases that were skipped */ - skipped: number; + /** Number of cases skipped */ + skipped?: number; - /** Cases that timed out */ - timedOut: number; + /** Number of cases timed out */ + timedOut?: number; - /** Average score across all cases */ + /** Average score */ averageScore: number; /** Total duration in milliseconds */ totalDurationMs: number; } + +// Fix missing properties in CriterionResult for CLI usage +export interface CriterionResult { + /** Name of the criterion */ + name: string; + + /** Weight of the criterion */ + weight: number; + + /** Score (0-1) */ + score: number; + + /** Whether the criterion passed */ + passed: boolean; + + /** Evidence/reasoning */ + evidence: string; + + /** Weighted score */ + weightedScore?: number; + + /** Duration in milliseconds */ + durationMs: number; + + /** Individual evaluator results */ + evaluatorResults?: EvaluatorResult[]; +} + +// Fix missing optional property in Evaluator +export interface EvaluatorBase { + /** Type of evaluator */ + type: EvaluatorType; + + /** Human-readable name */ + name: string; + + /** Whether this evaluator is optional */ + optional?: boolean; +} + +// Fix missing optional property in LLMJudgeEvaluator +export 
interface LLMJudgeEvaluator extends EvaluatorBase { + type: 'llm_judge'; + name: string; + /** Evaluation type */ + evaluate: 'code_quality' | 'readability' | 'documentation' | 'custom'; + /** Custom prompt for custom evaluation */ + prompt?: string; + /** Model to use for evaluation */ + model?: string; +} diff --git a/src/cli/commands/run.ts b/src/cli/commands/run.ts index 7921767..62b3b50 100644 --- a/src/cli/commands/run.ts +++ b/src/cli/commands/run.ts @@ -89,13 +89,13 @@ export async function runCommand(options: RunOptions) { if (currentSpinner) { const scorePercent = Math.round(result.score); if (result.passed) { - currentSpinner.succeed(`${result.caseId}: ${chalk.green('PASSED')} (${scorePercent}%, ${formatDuration(result.durationMs)})`); + currentSpinner.succeed(`${result.id}: ${chalk.green('PASSED')} (${scorePercent}%, ${formatDuration(result.durationMs)})`); } else if (result.timedOut) { - currentSpinner.fail(`${result.caseId}: ${chalk.yellow('TIMEOUT')}`); + currentSpinner.fail(`${result.id}: ${chalk.yellow('TIMEOUT')}`); } else if (result.error) { - currentSpinner.fail(`${result.caseId}: ${chalk.red('ERROR')} - ${result.error}`); + currentSpinner.fail(`${result.id}: ${chalk.red('ERROR')} - ${result.error}`); } else { - currentSpinner.fail(`${result.caseId}: ${chalk.red('FAILED')} (${scorePercent}%)`); + currentSpinner.fail(`${result.id}: ${chalk.red('FAILED')} (${scorePercent}%)`); } currentSpinner = null; } @@ -121,7 +121,7 @@ export async function runCommand(options: RunOptions) { '', `${chalk.green('✓')} Passed: ${result.summary.passed}`, `${chalk.red('✗')} Failed: ${result.summary.failed}`, - result.summary.timedOut > 0 ? `${chalk.yellow('⏱')} Timed out: ${result.summary.timedOut}` : null, + result.summary.timedOut != null ? 
`${chalk.yellow('⏱')} Timed out: ${result.summary.timedOut}` : null, '', chalk.bold(`Average Score: ${averageScorePercent}%`), ].filter(Boolean); @@ -137,7 +137,7 @@ export async function runCommand(options: RunOptions) { console.log(chalk.dim(`Results saved to: ${outputFile}`)); // Exit with appropriate code - if (result.summary.failed > 0 || result.summary.timedOut > 0) { + if (result.summary.failed > 0 || (result.summary.timedOut ?? 0) > 0) { process.exit(1); } } catch (err) { diff --git a/src/evaluation/llm-judge.ts b/src/evaluation/llm-judge.ts new file mode 100644 index 0000000..3dc94f0 --- /dev/null +++ b/src/evaluation/llm-judge.ts @@ -0,0 +1,559 @@ +/** + * LLM Judge Evaluator - Uses Claude API to evaluate answers + * + * Provides structured evaluation of agent answers against baselines + * or quality criteria using LLM-based judgment. + */ + +import { getEnvVar } from '../utils/env'; +import type { LLMJudgeEvaluator, EvaluatorResult } from '../cases/types'; + +// ============================================================================= +// Types +// ============================================================================= + +/** + * Score from LLM evaluation + */ +export interface LLMJudgeScore { + /** Overall score from 0.0 to 1.0 */ + score: number; + + /** Whether the answer passed (score >= threshold) */ + passed: boolean; + + /** Reasoning for the score */ + reasoning: string; + + /** Criticisms or issues found */ + criticisms?: string[]; + + /** Strengths identified */ + strengths?: string[]; +} + +/** + * Comparison result between two answers + */ +export interface ComparisonResult { + /** Which answer is better (if any) */ + winner?: 'answer1' | 'answer2' | 'tie'; + + /** Score for answer 1 */ + score1: LLMJudgeScore; + + /** Score for answer 2 */ + score2: LLMJudgeScore; + + /** Overall comparison reasoning */ + reasoning: string; +} + +/** + * Evaluation options + */ +export interface LLMJudgeOptions { + /** Model to use for evaluation 
(default: claude-3-5-sonnet-20241022) */ + model?: string; + + /** API key (defaults to ANTHROPIC_API_KEY env var) */ + apiKey?: string; + + /** Maximum tokens for response */ + maxTokens?: number; + + /** Temperature for generation (0.0-1.0) */ + temperature?: number; + + /** Enable caching to reduce costs */ + enableCache?: boolean; + + /** Project root for .env file loading */ + projectRoot?: string; + + /** Callback for progress updates */ + onProgress?: (update: string) => void; +} + +/** + * Cost tracking + */ +export interface CostTracker { + /** Total input tokens */ + inputTokens: number; + + /** Total output tokens */ + outputTokens: number; + + /** Total cost in USD */ + costUsd: number; + + /** Number of API calls */ + callCount: number; +} + +// ============================================================================= +// Prompt Templates +// ============================================================================= + +const PROMPTS = { + /** + * Evaluate a single answer on quality criteria + */ + quality: (criteria: string, answer: string, context?: string) => { + const contextSection = context ? '\n\nContext:\n' + context : ''; + return 'You are an expert code reviewer. Evaluate the following answer based on the criteria:\n\n' + criteria + contextSection + '\n\nAnswer to evaluate:\n' + answer + '\n\nProvide your evaluation in the following JSON format:\n{\n "score": 0.0-1.0,\n "reasoning": "Brief explanation of the score",\n "criticisms": ["issue 1", "issue 2"],\n "strengths": ["strength 1", "strength 2"]\n}\n\nThe score should be a number between 0.0 (poor) and 1.0 (excellent).'; + }, + + /** + * Compare two answers + */ + comparison: (criteria: string, answer1: string, answer2: string, context?: string) => { + const contextSection = context ? '\n\nContext:\n' + context : ''; + return 'You are an expert code reviewer. 
Compare the following two answers based on the criteria:\n\n' + criteria + contextSection + '\n\nAnswer 1:\n' + answer1 + '\n\nAnswer 2:\n' + answer2 + '\n\nProvide your comparison in the following JSON format:\n{\n "winner": "answer1" | "answer2" | "tie",\n "score1": { "score": 0.0-1.0, "reasoning": "...", "criticisms": [], "strengths": [] },\n "score2": { "score": 0.0-1.0, "reasoning": "...", "criticisms": [], "strengths": [] },\n "reasoning": "Overall comparison reasoning"\n}'; + }, + + /** + * Evaluate against a baseline + */ + baseline: (criteria: string, answer: string, baseline: string, context?: string) => { + const contextSection = context ? '\n\nContext:\n' + context : ''; + return 'You are an expert code reviewer. Evaluate the following answer against a human-graded baseline.\n\n' + criteria + contextSection + '\n\nBaseline (human-graded):\n' + baseline + '\n\nAnswer to evaluate:\n' + answer + '\n\nProvide your evaluation in the following JSON format:\n{\n "score": 0.0-1.0,\n "reasoning": "How this answer compares to the baseline",\n "criticisms": ["issues compared to baseline"],\n "strengths": ["strengths compared to baseline"]\n}'; + }, +}; + +// ============================================================================= +// LLM Judge Implementation +// ============================================================================= + +/** + * LLM Judge - Evaluates answers using Claude API + */ +export class LLMJudge { + private apiKey: string; + private model: string; + private maxTokens: number; + private temperature: number; + private enableCache: boolean; + private projectRoot: string; + private costTracker: CostTracker; + private cache: Map; + + constructor(options: LLMJudgeOptions = {}) { + const projectRoot = options.projectRoot || process.cwd(); + this.apiKey = options.apiKey || (getEnvVar('ANTHROPIC_API_KEY', projectRoot) || ''); + this.model = options.model || 'claude-3-5-sonnet-20241022'; + this.maxTokens = options.maxTokens || 1024; + 
this.temperature = options.temperature || 0.0; + this.enableCache = options.enableCache ?? true; + this.projectRoot = projectRoot; + this.costTracker = { + inputTokens: 0, + outputTokens: 0, + costUsd: 0, + callCount: 0, + }; + this.cache = new Map(); + } + + /** + * Evaluate a single answer + */ + async evaluate( + criteria: string, + answer: string, + context?: string + ): Promise { + const cacheKey = this.generateCacheKey('quality', criteria, answer, context || ''); + if (this.enableCache && this.cache.has(cacheKey)) { + const cached = this.cache.get(cacheKey); + if (cached && 'score' in cached) { + return cached as LLMJudgeScore; + } + } + + const prompt = PROMPTS.quality(criteria, answer, context); + const result = await this.callClaude(prompt); + + if (this.enableCache && result) { + this.cache.set(cacheKey, result); + } + + if (!result) { + return null; + } + + // Ensure we return LLMJudgeScore, not ComparisonResult + if ('score1' in result) { + throw new Error('Unexpected ComparisonResult returned from evaluate method'); + } + + return result as LLMJudgeScore; + } + + /** + * Compare two answers + */ + async compare( + criteria: string, + answer1: string, + answer2: string, + context?: string + ): Promise { + const cacheKey = this.generateCacheKey('comparison', criteria, answer1, answer2, context || ''); + if (this.enableCache && this.cache.has(cacheKey)) { + const cached = this.cache.get(cacheKey); + if (cached && 'score1' in cached) { + return cached as ComparisonResult; + } + } + + const prompt = PROMPTS.comparison(criteria, answer1, answer2, context); + const result = await this.callClaude(prompt); + + if (this.enableCache && result) { + this.cache.set(cacheKey, result); + } + + if (!result) { + return null; + } + + // Ensure we return ComparisonResult, not LLMJudgeScore + if ('score' in result) { + throw new Error('Unexpected LLMJudgeScore returned from compare method'); + } + + return result as ComparisonResult; + } + + /** + * Evaluate against a 
baseline + */ + async evaluateAgainstBaseline( + criteria: string, + answer: string, + baseline: string, + context?: string + ): Promise { + const cacheKey = this.generateCacheKey('baseline', criteria, answer, baseline, context || ''); + if (this.enableCache && this.cache.has(cacheKey)) { + const cached = this.cache.get(cacheKey); + if (cached && 'score' in cached) { + return cached as LLMJudgeScore; + } + } + + const prompt = PROMPTS.baseline(criteria, answer, baseline, context); + const result = await this.callClaude(prompt); + + if (this.enableCache && result) { + this.cache.set(cacheKey, result); + } + + if (!result) { + return null; + } + + // Ensure we return LLMJudgeScore, not ComparisonResult + if ('score1' in result) { + throw new Error('Unexpected ComparisonResult returned from evaluateAgainstBaseline method'); + } + + return result as LLMJudgeScore; + } + + /** + * Call Claude API + */ + private async callClaude(prompt: string): Promise { + if (!this.apiKey) { + throw new Error('ANTHROPIC_API_KEY not set'); + } + + this.costTracker.callCount++; + + // Dynamic import of SDK + const sdk = await import('@anthropic-ai/claude-agent-sdk'); + + const response = await sdk.query({ + prompt, + options: { + model: this.model, + // Note: system prompt is not supported in this SDK version + settingSources: [], + }, + }); + + let result: LLMJudgeScore | ComparisonResult | null = null; + + for await (const message of response) { + if (message.type === 'result' && message.subtype === 'success' && (message as { result?: string }).result) { + const content = (message as { result?: string }).result as string || ''; + result = this.parseResponse(content); + break; + } + } + + if (!result) { + throw new Error('Failed to parse LLM response'); + } + + return result; + } + + /** + * Parse LLM response into structured score or comparison + */ + private parseResponse(content: string): LLMJudgeScore | ComparisonResult | null { + try { + // Extract JSON from response (handle 
markdown code blocks) + const jsonMatch = content.match(/\{[\s\S]*\}/); + if (!jsonMatch) { + throw new Error('No JSON found in response'); + } + + const data = JSON.parse(jsonMatch[0]); + + // Check if this is a comparison result (has score1 and score2) + if (data.score1 && data.score2) { + return { + winner: data.winner, + score1: { + score: this.normalizeScore(data.score1.score), + passed: this.normalizeScore(data.score1.score) >= 0.7, + reasoning: data.score1.reasoning || '', + criticisms: data.score1.criticisms || [], + strengths: data.score1.strengths || [], + }, + score2: { + score: this.normalizeScore(data.score2.score), + passed: this.normalizeScore(data.score2.score) >= 0.7, + reasoning: data.score2.reasoning || '', + criticisms: data.score2.criticisms || [], + strengths: data.score2.strengths || [], + }, + reasoning: data.reasoning || '', + }; + } + + // Otherwise, this is a single score + return { + score: this.normalizeScore(data.score), + passed: this.normalizeScore(data.score) >= 0.7, + reasoning: data.reasoning || '', + criticisms: data.criticisms || [], + strengths: data.strengths || [], + }; + } catch (err) { + throw new Error('Failed to parse LLM response: ' + (err as Error).message); + } + } + + /** + * Normalize score to 0.0-1.0 range + */ + private normalizeScore(score: unknown): number { + if (typeof score === 'number') { + return Math.max(0, Math.min(1, score)); + } + if (typeof score === 'string') { + const parsed = parseFloat(score); + return isNaN(parsed) ? 
0 : Math.max(0, Math.min(1, parsed)); + } + return 0; + } + + /** + * Generate cache key + */ + private generateCacheKey( + type: string, + ...args: string[] + ): string { + const str = args.filter((arg): arg is string => arg !== undefined).join('|||'); + return type + ':' + this.model + ':' + str.substring(0, 200); + } + + /** + * Get cost tracking + */ + getCostTracker(): CostTracker { + return { ...this.costTracker }; + } + + /** + * Clear cache + */ + clearCache(): void { + this.cache.clear(); + } + + /** + * Get cache size + */ + getCacheSize(): number { + return this.cache.size; + } +} + +// ============================================================================= +// Evaluator Implementation +// ============================================================================= + +/** + * Run LLM judge evaluator + */ +export async function runLLMJudgeEvaluator( + evaluator: LLMJudgeEvaluator, + answer: string, + context?: string +): Promise { + const startTime = Date.now(); + const options: LLMJudgeOptions = { + model: evaluator.model, + projectRoot: process.cwd(), + }; + + const judge = new LLMJudge(options); + + try { + let score: LLMJudgeScore | null = null; + + switch (evaluator.evaluate) { + case 'code_quality': + score = await judge.evaluate( + 'Code quality: Is the code well-structured, readable, and maintainable?', + answer, + context + ); + break; + + case 'readability': + score = await judge.evaluate( + 'Readability: Is the code easy to understand and follow?', + answer, + context + ); + break; + + case 'documentation': + score = await judge.evaluate( + 'Documentation: Is the code well-documented with clear comments and explanations?', + answer, + context + ); + break; + + case 'custom': + if (!evaluator.prompt) { + throw new Error('Custom evaluation requires a prompt'); + } + score = await judge.evaluate(evaluator.prompt, answer, context || undefined); + break; + + default: + throw new Error('Unknown evaluation type: ' + evaluator.evaluate); + } + + 
if (!score) { + throw new Error('LLM judge evaluation failed to produce a score'); + } + + const _durationMs = Date.now() - startTime; + + return { + name: evaluator.name || 'llm_judge', + type: 'llm_judge_comparison', + score: score.score, + passed: score.passed, + evidence: score.reasoning, + details: { + criticisms: score.criticisms, + strengths: score.strengths, + cost: judge.getCostTracker(), + }, + durationMs: Date.now() - startTime, + }; + } catch (err) { + const _durationMs = Date.now() - startTime; + + return { + name: evaluator.name || 'llm_judge', + type: 'llm_judge_comparison', + score: 0, + passed: false, + evidence: (err as Error).message, + details: { + error: (err as Error).message, + }, + durationMs: Date.now() - startTime, + }; + } +} + +// ============================================================================= +// Comparison Evaluator +// ============================================================================= + +/** + * Run LLM judge comparison evaluator + */ +export async function runLLMJudgeComparisonEvaluator( + evaluator: LLMJudgeEvaluator, + answer1: string, + answer2: string, + context?: string +): Promise { + const startTime = Date.now(); + const options: LLMJudgeOptions = { + model: evaluator.model, + projectRoot: process.cwd(), + }; + + const judge = new LLMJudge(options); + + try { + const result = await judge.compare( + 'Compare the quality and correctness of these two answers.', + answer1, + answer2, + context || undefined + ); + + if (!result) { + throw new Error('LLM judge comparison failed to produce a result'); + } + + const _durationMs = Date.now() - startTime; + + return { + name: evaluator.name || 'llm_judge_comparison', + type: 'llm_judge_comparison', + score: result.winner === 'tie' ? 0.5 : result.winner === 'answer1' ? 
1.0 : 0.0, + passed: result.winner !== 'answer2', + evidence: result.reasoning, + details: { + winner: result.winner, + score1: result.score1, + score2: result.score2, + cost: judge.getCostTracker(), + }, + durationMs: Date.now() - startTime, + }; + } catch (err) { + const _durationMs = Date.now() - startTime; + + return { + name: evaluator.name || 'llm_judge_comparison', + type: 'llm_judge_comparison', + score: 0, + passed: false, + evidence: (err as Error).message, + details: { + error: (err as Error).message, + }, + durationMs: Date.now() - startTime, + }; + } +} diff --git a/src/evaluation/llm-judge.ts.bak b/src/evaluation/llm-judge.ts.bak new file mode 100644 index 0000000..d95100b --- /dev/null +++ b/src/evaluation/llm-judge.ts.bak @@ -0,0 +1,559 @@ +/** + * LLM Judge Evaluator - Uses Claude API to evaluate answers + * + * Provides structured evaluation of agent answers against baselines + * or quality criteria using LLM-based judgment. + */ + +import { getEnvVar } from '../utils/env'; +import type { LLMJudgeEvaluator, EvaluatorResult } from '../cases/types'; + +// ============================================================================= +// Types +// ============================================================================= + +/** + * Score from LLM evaluation + */ +export interface LLMJudgeScore { + /** Overall score from 0.0 to 1.0 */ + score: number; + + /** Whether the answer passed (score >= threshold) */ + passed: boolean; + + /** Reasoning for the score */ + reasoning: string; + + /** Criticisms or issues found */ + criticisms?: string[]; + + /** Strengths identified */ + strengths?: string[]; +} + +/** + * Comparison result between two answers + */ +export interface ComparisonResult { + /** Which answer is better (if any) */ + winner?: 'answer1' | 'answer2' | 'tie'; + + /** Score for answer 1 */ + score1: LLMJudgeScore; + + /** Score for answer 2 */ + score2: LLMJudgeScore; + + /** Overall comparison reasoning */ + reasoning: string; +} + +/** 
+ * Evaluation options + */ +export interface LLMJudgeOptions { + /** Model to use for evaluation (default: claude-3-5-sonnet-20241022) */ + model?: string; + + /** API key (defaults to ANTHROPIC_API_KEY env var) */ + apiKey?: string; + + /** Maximum tokens for response */ + maxTokens?: number; + + /** Temperature for generation (0.0-1.0) */ + temperature?: number; + + /** Enable caching to reduce costs */ + enableCache?: boolean; + + /** Project root for .env file loading */ + projectRoot?: string; + + /** Callback for progress updates */ + onProgress?: (update: string) => void; +} + +/** + * Cost tracking + */ +export interface CostTracker { + /** Total input tokens */ + inputTokens: number; + + /** Total output tokens */ + outputTokens: number; + + /** Total cost in USD */ + costUsd: number; + + /** Number of API calls */ + callCount: number; +} + +// ============================================================================= +// Prompt Templates +// ============================================================================= + +const PROMPTS = { + /** + * Evaluate a single answer on quality criteria + */ + quality: (criteria: string, answer: string, context?: string) => { + const contextSection = context ? '\n\nContext:\n' + context : ''; + return 'You are an expert code reviewer. Evaluate the following answer based on the criteria:\n\n' + criteria + contextSection + '\n\nAnswer to evaluate:\n' + answer + '\n\nProvide your evaluation in the following JSON format:\n{\n "score": 0.0-1.0,\n "reasoning": "Brief explanation of the score",\n "criticisms": ["issue 1", "issue 2"],\n "strengths": ["strength 1", "strength 2"]\n}\n\nThe score should be a number between 0.0 (poor) and 1.0 (excellent).'; + }, + + /** + * Compare two answers + */ + comparison: (criteria: string, answer1: string, answer2: string, context?: string) => { + const contextSection = context ? '\n\nContext:\n' + context : ''; + return 'You are an expert code reviewer. 
Compare the following two answers based on the criteria:\n\n' + criteria + contextSection + '\n\nAnswer 1:\n' + answer1 + '\n\nAnswer 2:\n' + answer2 + '\n\nProvide your comparison in the following JSON format:\n{\n "winner": "answer1" | "answer2" | "tie",\n "score1": { "score": 0.0-1.0, "reasoning": "...", "criticisms": [], "strengths": [] },\n "score2": { "score": 0.0-1.0, "reasoning": "...", "criticisms": [], "strengths": [] },\n "reasoning": "Overall comparison reasoning"\n}'; + }, + + /** + * Evaluate against a baseline + */ + baseline: (criteria: string, answer: string, baseline: string, context?: string) => { + const contextSection = context ? '\n\nContext:\n' + context : ''; + return 'You are an expert code reviewer. Evaluate the following answer against a human-graded baseline.\n\n' + criteria + contextSection + '\n\nBaseline (human-graded):\n' + baseline + '\n\nAnswer to evaluate:\n' + answer + '\n\nProvide your evaluation in the following JSON format:\n{\n "score": 0.0-1.0,\n "reasoning": "How this answer compares to the baseline",\n "criticisms": ["issues compared to baseline"],\n "strengths": ["strengths compared to baseline"]\n}'; + }, +}; + +// ============================================================================= +// LLM Judge Implementation +// ============================================================================= + +/** + * LLM Judge - Evaluates answers using Claude API + */ +export class LLMJudge { + private apiKey: string; + private model: string; + private maxTokens: number; + private temperature: number; + private enableCache: boolean; + private projectRoot: string; + private costTracker: CostTracker; + private cache: Map; + + constructor(options: LLMJudgeOptions = {}) { + const projectRoot = options.projectRoot || process.cwd(); + this.apiKey = options.apiKey || (getEnvVar('ANTHROPIC_API_KEY', projectRoot) || ''); + this.model = options.model || 'claude-3-5-sonnet-20241022'; + this.maxTokens = options.maxTokens || 1024; + 
this.temperature = options.temperature || 0.0; + this.enableCache = options.enableCache ?? true; + this.projectRoot = projectRoot; + this.costTracker = { + inputTokens: 0, + outputTokens: 0, + costUsd: 0, + callCount: 0, + }; + this.cache = new Map(); + } + + /** + * Evaluate a single answer + */ + async evaluate( + criteria: string, + answer: string, + context?: string + ): Promise { + const cacheKey = this.generateCacheKey('quality', criteria, answer, context || ''); + if (this.enableCache && this.cache.has(cacheKey)) { + const cached = this.cache.get(cacheKey); + if (cached && 'score' in cached) { + return cached as LLMJudgeScore; + } + } + + const prompt = PROMPTS.quality(criteria, answer, context); + const result = await this.callClaude(prompt); + + if (this.enableCache && result) { + this.cache.set(cacheKey, result); + } + + if (!result) { + return null; + } + + // Ensure we return LLMJudgeScore, not ComparisonResult + if ('score1' in result) { + throw new Error('Unexpected ComparisonResult returned from evaluate method'); + } + + return result as LLMJudgeScore; + } + + /** + * Compare two answers + */ + async compare( + criteria: string, + answer1: string, + answer2: string, + context?: string + ): Promise { + const cacheKey = this.generateCacheKey('comparison', criteria, answer1, answer2, context || ''); + if (this.enableCache && this.cache.has(cacheKey)) { + const cached = this.cache.get(cacheKey); + if (cached && 'score1' in cached) { + return cached as ComparisonResult; + } + } + + const prompt = PROMPTS.comparison(criteria, answer1, answer2, context); + const result = await this.callClaude(prompt); + + if (this.enableCache && result) { + this.cache.set(cacheKey, result); + } + + if (!result) { + return null; + } + + // Ensure we return ComparisonResult, not LLMJudgeScore + if ('score' in result) { + throw new Error('Unexpected LLMJudgeScore returned from compare method'); + } + + return result as ComparisonResult; + } + + /** + * Evaluate against a 
baseline + */ + async evaluateAgainstBaseline( + criteria: string, + answer: string, + baseline: string, + context?: string + ): Promise { + const cacheKey = this.generateCacheKey('baseline', criteria, answer, baseline, context || ''); + if (this.enableCache && this.cache.has(cacheKey)) { + const cached = this.cache.get(cacheKey); + if (cached && 'score' in cached) { + return cached as LLMJudgeScore; + } + } + + const prompt = PROMPTS.baseline(criteria, answer, baseline, context); + const result = await this.callClaude(prompt); + + if (this.enableCache && result) { + this.cache.set(cacheKey, result); + } + + if (!result) { + return null; + } + + // Ensure we return LLMJudgeScore, not ComparisonResult + if ('score1' in result) { + throw new Error('Unexpected ComparisonResult returned from evaluateAgainstBaseline method'); + } + + return result as LLMJudgeScore; + } + + /** + * Call Claude API + */ + private async callClaude(prompt: string): Promise { + if (!this.apiKey) { + throw new Error('ANTHROPIC_API_KEY not set'); + } + + this.costTracker.callCount++; + + // Dynamic import of SDK + const sdk = await import('@anthropic-ai/claude-agent-sdk'); + + const response = await sdk.query({ + prompt, + options: { + model: this.model, + // Note: system prompt is not supported in this SDK version + settingSources: [], + }, + }); + + let result: LLMJudgeScore | null = null; + + for await (const message of response) { + if (message.type === 'result' && message.subtype === 'success' && (message as any).result) { + const content = (message as any).result || ''; + result = this.parseResponse(content); + break; + } + } + + if (!result) { + throw new Error('Failed to parse LLM response'); + } + + return result; + } + + /** + * Parse LLM response into structured score or comparison + */ + private parseResponse(content: string): LLMJudgeScore | ComparisonResult | null { + try { + // Extract JSON from response (handle markdown code blocks) + const jsonMatch = 
content.match(/\{[\s\S]*\}/); + if (!jsonMatch) { + throw new Error('No JSON found in response'); + } + + const data = JSON.parse(jsonMatch[0]); + + // Check if this is a comparison result (has score1 and score2) + if (data.score1 && data.score2) { + return { + winner: data.winner, + score1: { + score: this.normalizeScore(data.score1.score), + passed: this.normalizeScore(data.score1.score) >= 0.7, + reasoning: data.score1.reasoning || '', + criticisms: data.score1.criticisms || [], + strengths: data.score1.strengths || [], + }, + score2: { + score: this.normalizeScore(data.score2.score), + passed: this.normalizeScore(data.score2.score) >= 0.7, + reasoning: data.score2.reasoning || '', + criticisms: data.score2.criticisms || [], + strengths: data.score2.strengths || [], + }, + reasoning: data.reasoning || '', + }; + } + + // Otherwise, this is a single score + return { + score: this.normalizeScore(data.score), + passed: this.normalizeScore(data.score) >= 0.7, + reasoning: data.reasoning || '', + criticisms: data.criticisms || [], + strengths: data.strengths || [], + }; + } catch (err) { + throw new Error('Failed to parse LLM response: ' + (err as Error).message); + } + } + + /** + * Normalize score to 0.0-1.0 range + */ + private normalizeScore(score: unknown): number { + if (typeof score === 'number') { + return Math.max(0, Math.min(1, score)); + } + if (typeof score === 'string') { + const parsed = parseFloat(score); + return isNaN(parsed) ? 
0 : Math.max(0, Math.min(1, parsed)); + } + return 0; + } + + /** + * Generate cache key + */ + private generateCacheKey( + type: string, + ...args: string[] + ): string { + const str = args.filter((arg): arg is string => arg !== undefined).join('|||'); + return type + ':' + this.model + ':' + str.substring(0, 200); + } + + /** + * Get cost tracking + */ + getCostTracker(): CostTracker { + return { ...this.costTracker }; + } + + /** + * Clear cache + */ + clearCache(): void { + this.cache.clear(); + } + + /** + * Get cache size + */ + getCacheSize(): number { + return this.cache.size; + } +} + +// ============================================================================= +// Evaluator Implementation +// ============================================================================= + +/** + * Run LLM judge evaluator + */ +export async function runLLMJudgeEvaluator( + evaluator: LLMJudgeEvaluator, + answer: string, + context?: string +): Promise { + const startTime = Date.now(); + const options: LLMJudgeOptions = { + model: evaluator.model, + projectRoot: process.cwd(), + }; + + const judge = new LLMJudge(options); + + try { + let score: LLMJudgeScore | null = null; + + switch (evaluator.evaluate) { + case 'code_quality': + score = await judge.evaluate( + 'Code quality: Is the code well-structured, readable, and maintainable?', + answer, + context + ); + break; + + case 'readability': + score = await judge.evaluate( + 'Readability: Is the code easy to understand and follow?', + answer, + context + ); + break; + + case 'documentation': + score = await judge.evaluate( + 'Documentation: Is the code well-documented with clear comments and explanations?', + answer, + context + ); + break; + + case 'custom': + if (!evaluator.prompt) { + throw new Error('Custom evaluation requires a prompt'); + } + score = await judge.evaluate(evaluator.prompt, answer, context || undefined); + break; + + default: + throw new Error('Unknown evaluation type: ' + evaluator.evaluate); + } + + 
if (!score) { + throw new Error('LLM judge evaluation failed to produce a score'); + } + + const durationMs = Date.now() - startTime; + + return { + name: evaluator.name || 'llm_judge', + type: 'llm_judge_comparison', + score: score.score, + passed: score.passed, + evidence: score.reasoning, + details: { + criticisms: score.criticisms, + strengths: score.strengths, + cost: judge.getCostTracker(), + }, + durationMs: Date.now() - startTime, + }; + } catch (err) { + const durationMs = Date.now() - startTime; + + return { + name: evaluator.name || 'llm_judge', + type: 'llm_judge_comparison', + score: 0, + passed: false, + evidence: (err as Error).message, + details: { + error: (err as Error).message, + }, + durationMs: Date.now() - startTime, + }; + } +} + +// ============================================================================= +// Comparison Evaluator +// ============================================================================= + +/** + * Run LLM judge comparison evaluator + */ +export async function runLLMJudgeComparisonEvaluator( + evaluator: LLMJudgeEvaluator, + answer1: string, + answer2: string, + context?: string +): Promise { + const startTime = Date.now(); + const options: LLMJudgeOptions = { + model: evaluator.model, + projectRoot: process.cwd(), + }; + + const judge = new LLMJudge(options); + + try { + const result = await judge.compare( + 'Compare the quality and correctness of these two answers.', + answer1, + answer2, + context || undefined + ); + + if (!result) { + throw new Error('LLM judge comparison failed to produce a result'); + } + + const durationMs = Date.now() - startTime; + + return { + name: evaluator.name || 'llm_judge_comparison', + type: 'llm_judge_comparison', + score: result.winner === 'tie' ? 0.5 : result.winner === 'answer1' ? 
1.0 : 0.0, + passed: result.winner !== 'answer2', + evidence: result.reasoning, + details: { + winner: result.winner, + score1: result.score1, + score2: result.score2, + cost: judge.getCostTracker(), + }, + durationMs: Date.now() - startTime, + }; + } catch (err) { + const durationMs = Date.now() - startTime; + + return { + name: evaluator.name || 'llm_judge_comparison', + type: 'llm_judge_comparison', + score: 0, + passed: false, + evidence: (err as Error).message, + details: { + error: (err as Error).message, + }, + durationMs: Date.now() - startTime, + }; + } +} diff --git a/src/evaluation/runner.ts b/src/evaluation/runner.ts index 302c91b..f3bb482 100644 --- a/src/evaluation/runner.ts +++ b/src/evaluation/runner.ts @@ -25,6 +25,7 @@ import { Sandbox, SandboxConfig } from '../sandbox/types'; import { getRubricRegistry } from '../rubrics/loader'; import { getAgent } from '../agents/registry'; import type { AgentResult } from '../agents/types'; +// // import { runLLMJudgeEvaluator } from './llm-judge'; export interface RunnerOptions { /** Agent being evaluated (for logging) */ @@ -95,7 +96,7 @@ export async function runCases(cases: Case[], options: RunnerOptions): Promise ({ + agentResponse: _agentResult.answer, + agentToolCalls: _agentResult.toolCalls.map((t) => ({ name: t.name, - durationMs: t.durationMs, - success: t.success, + durationMs: t.durationMs || 0, + success: t.success || false, })), - agentModel: agentResult.model, - agentTokens: agentResult.tokens + agentModel: _agentResult.model, + agentTokens: _agentResult.tokens ? 
{ - input: agentResult.tokens.inputTokens, - output: agentResult.tokens.outputTokens, - total: agentResult.tokens.totalTokens, + input: _agentResult.tokens.inputTokens, + output: _agentResult.tokens.outputTokens, + total: _agentResult.tokens.totalTokens, } : undefined, - agentFiles, + agentFiles: _agentFiles, durationMs, timestamp: new Date(), }; @@ -302,14 +305,17 @@ async function runSingleCase( async function evaluateWithRubric( caseData: Case, sandbox: Sandbox, - _options: RunnerOptions -): Promise> { + _options: RunnerOptions, + _agentResult: AgentResult, + _agentFiles: { path: string; content: string; changed: boolean }[] +): Promise { const registry = getRubricRegistry(); const rubric = registry.resolve(caseData.rubric); const criteriaResults: CriterionResult[] = []; let totalWeightedScore = 0; let _totalWeight = 0; + const evalStartTime = Date.now(); // Evaluate each criterion in the rubric for (const [criterionKey, criterion] of Object.entries(rubric.criteria)) { @@ -318,7 +324,6 @@ async function evaluateWithRubric( let evaluatorCount = 0; for (const evaluator of criterion.evaluators) { - const evalStartTime = Date.now(); let evalResult: Omit; if (evaluator.type === 'command') { @@ -362,6 +367,15 @@ async function evaluateWithRubric( score: 0.0, evidence: 'Pattern check not yet implemented', }; + } else if ((evaluator.type as EvaluatorType) === 'llm_judge' || (evaluator.type as EvaluatorType) === 'llm_judge_comparison') { + // Run LLM judge evaluator + // TODO: Implement baseline answer storage and comparison + // For now, use a placeholder evaluator + evalResult = { + passed: false, + score: 0.0, + evidence: 'LLM judge comparison not yet fully implemented', + }; } else { // Other evaluator types (llm_judge, benchmark, etc.) 
- not implemented evalResult = { @@ -370,13 +384,10 @@ async function evaluateWithRubric( evidence: `Evaluator type '${evaluator.type}' not yet implemented`, }; } - - const evalDurationMs = Date.now() - evalStartTime; - evaluatorResults.push({ name: evaluator.name || evaluator.type, type: evaluator.type as EvaluatorType, - durationMs: evalDurationMs, + durationMs: Date.now() - evalStartTime, ...evalResult, }); @@ -397,9 +408,10 @@ async function evaluateWithRubric( name: criterionKey, weight: criterion.weight, score: rawScore, - weightedScore, passed: allPassed, + evidence: `Criterion: ${criterionKey}`, evaluatorResults, + durationMs: Date.now() - evalStartTime, }); totalWeightedScore += weightedScore; @@ -420,13 +432,18 @@ async function evaluateWithRubric( const passThreshold = 70; const passed = overallScore >= passThreshold; - return { - caseId: caseData.id, + const result: CaseResult = { + id: caseData.id, + title: caseData.title, score: overallScore, passed, - criteriaResults, - timedOut: false, + evidence: `Overall score: ${overallScore.toFixed(2)}%`, + criteria: criteriaResults, + evaluators: [], + durationMs: Date.now() - evalStartTime, + timestamp: new Date(), }; + return result; } /** diff --git a/src/evaluation/runner.ts.bak b/src/evaluation/runner.ts.bak new file mode 100644 index 0000000..dd12e57 --- /dev/null +++ b/src/evaluation/runner.ts.bak @@ -0,0 +1,555 @@ +/** + * Evaluation runner - executes cases in sandboxes and evaluates results + * + * This is the core evaluation engine that: + * 1. Sets up the sandbox environment + * 2. Runs the case (agent attempts to solve the problem) + * 3. 
Applies the rubric to evaluate the result + */ + +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { + Case, + CaseFile, + CaseResult, + CriterionResult, + EvaluatorResult, + RunResult, + RunSummary, + EvaluatorType, +} from '../cases/types'; +import { createSandboxManager, checkDocker, RECOMMENDED_IMAGES } from '../sandbox'; +import { Sandbox, SandboxConfig } from '../sandbox/types'; +import { getRubricRegistry } from '../rubrics/loader'; +import { getAgent } from '../agents/registry'; +import type { AgentResult } from '../agents/types'; +// // import { runLLMJudgeEvaluator } from './llm-judge'; + +export interface RunnerOptions { + /** Agent being evaluated (for logging) */ + agent: string; + + /** Model to use (passed to agent) */ + model?: string; + + /** Timeout per case in seconds */ + timeoutSeconds?: number; + + /** Enable network in sandbox */ + networkEnabled?: boolean; + + /** Callback for progress updates */ + onProgress?: (update: ProgressUpdate) => void; + + /** Callback when a case completes */ + onCaseComplete?: (result: CaseResult) => void; +} + +export interface ProgressUpdate { + type: 'starting' | 'running' | 'validating' | 'complete' | 'error'; + caseId: string; + caseIndex: number; + totalCases: number; + message?: string; +} + +/** + * Get the appropriate Docker image for a language + */ +function getImageForLanguage(language: string): string { + const langLower = language.toLowerCase(); + + if (langLower === 'javascript' || langLower === 'typescript' || langLower === 'node') { + return RECOMMENDED_IMAGES.node.latest; + } + if (langLower === 'python') { + return RECOMMENDED_IMAGES.python.latest; + } + if (langLower === 'go' || langLower === 'golang') { + return RECOMMENDED_IMAGES.go.latest; + } + if (langLower === 'rust') { + return RECOMMENDED_IMAGES.rust.latest; + } + if (langLower === 'java') { + return RECOMMENDED_IMAGES.java.latest; + } + + // Default to Node.js for unknown languages + return 
RECOMMENDED_IMAGES.node.latest; +} + +/** + * Run a set of cases and return results + */ +export async function runCases(cases: Case[], options: RunnerOptions): Promise { + const runId = `run-${Date.now()}-${Math.random().toString(36).substring(2, 8)}`; + const startedAt = new Date(); + const results: CaseResult[] = []; + + // Check Docker availability first + const dockerStatus = await checkDocker(); + if (!dockerStatus.available) { + throw new Error(`Docker is not available: ${dockerStatus.error}\n${dockerStatus.suggestion}`); + } + + const manager = createSandboxManager(); + let rubricId = 'default'; + + try { + for (let i = 0; i < cases.length; i++) { + const caseData = cases[i]; + + options.onProgress?.({ + type: 'starting', + caseId: caseData.id, + caseIndex: i, + totalCases: cases.length, + message: `Starting ${caseData.title}`, + }); + + try { + const result = await runSingleCase(caseData, manager, options, i, cases.length); + results.push(result); + options.onCaseComplete?.(result); + // Track the rubric ID from the first case + if (i === 0) { + const registry = getRubricRegistry(); + const rubric = registry.resolve(caseData.rubric); + rubricId = rubric.id; + } + } catch (err) { + const errorResult: CaseResult = { + id: caseData.id, + title: caseData.title, + score: 0, + passed: false, + evidence: (err as Error).message, + criteria: [], + evaluators: [], + durationMs: 0, + error: (err as Error).message, + timestamp: new Date(), + }; + results.push(errorResult); + options.onCaseComplete?.(errorResult); + } + } + } finally { + // Clean up all sandboxes + await manager.destroyAll(); + } + + const completedAt = new Date(); + const totalDurationMs = completedAt.getTime() - startedAt.getTime(); + + // Calculate summary + const scores = results.map((r) => r.score); + const averageScore = scores.length > 0 ? 
scores.reduce((a, b) => a + b, 0) / scores.length : 0; + + const summary: RunSummary = { + total: results.length, + passed: results.filter((r) => r.passed).length, + failed: results.filter((r) => !r.passed && !r.error).length, + skipped: 0, + timedOut: results.filter((r) => r.timedOut).length, + averageScore, + totalDurationMs, + }; + + return { + id: runId, + timestamp: startedAt, + cases: results, + summary, + durationMs: totalDurationMs, + agent: options.agent, + rubricId, + }; +} + +/** + * Run a single case in a sandbox + */ +async function runSingleCase( + caseData: Case, + manager: ReturnType, + options: RunnerOptions, + caseIndex: number, + totalCases: number +): Promise { + const startTime = Date.now(); + + // Create a temporary directory for this case + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), `sniff-${caseData.id}-`)); + + try { + // Write case files to temp directory (if any) + if (caseData.files) { + for (const file of caseData.files) { + const filePath = path.join(tempDir, file.path); + const fileDir = path.dirname(filePath); + + // Create directories if needed + fs.mkdirSync(fileDir, { recursive: true }); + if (file.content !== undefined) { + fs.writeFileSync(filePath, file.content); + } + } + } + + // Create sandbox + const sandboxConfig: SandboxConfig = { + workdir: tempDir, + image: getImageForLanguage(caseData.language), + timeoutSeconds: options.timeoutSeconds || 300, + networkEnabled: options.networkEnabled || false, + }; + + options.onProgress?.({ + type: 'running', + caseId: caseData.id, + caseIndex, + totalCases, + message: 'Creating sandbox...', + }); + + const sandbox = await manager.create(sandboxConfig); + + try { + // Install dependencies if needed + await installDependencies(sandbox, caseData.language, options, caseIndex, totalCases, caseData.id); + + // Run the agent to attempt to solve the case + options.onProgress?.({ + type: 'running', + caseId: caseData.id, + caseIndex, + totalCases, + message: 'Running agent...', + 
}); + + const agent = getAgent(options.agent); + const _agentResult: AgentResult = await agent.run(caseData.prompt, { + cwd: tempDir, + model: options.model, + timeoutMs: (options.timeoutSeconds || 300) * 1000, + permissionMode: 'acceptEdits', + }); + + if (!_agentResult.success) { + throw new Error(`Agent execution failed: ${_agentResult.error}`); + } + + // Snapshot files the agent produced (before rubric evaluation) + const _agentFiles = snapshotFiles(tempDir, caseData.files); + + // Evaluate using the rubric + options.onProgress?.({ + type: 'validating', + caseId: caseData.id, + caseIndex, + totalCases, + message: 'Evaluating with rubric...', + }); + + const result = await evaluateWithRubric(caseData, sandbox, options, _agentResult, _agentFiles); + const durationMs = Date.now() - startTime; + + options.onProgress?.({ + type: 'complete', + caseId: caseData.id, + caseIndex, + totalCases, + message: result.passed ? `Passed (${Math.round(result.score)}%)` : `Failed (${Math.round(result.score)}%)`, + }); + + return { + ...result, + agentResponse: _agentResult.answer, + agentToolCalls: _agentResult.toolCalls.map((t) => ({ + name: t.name, + durationMs: t.durationMs || 0, + success: t.success || false, + })), + agentModel: _agentResult.model, + agentTokens: _agentResult.tokens + ? 
{ + input: _agentResult.tokens.inputTokens, + output: _agentResult.tokens.outputTokens, + total: _agentResult.tokens.totalTokens, + } + : undefined, + agentFiles: _agentFiles, + durationMs, + timestamp: new Date(), + }; + } finally { + await sandbox.destroy(); + } + } finally { + // Clean up temp directory + try { + fs.rmSync(tempDir, { recursive: true, force: true }); + } catch { + // Ignore cleanup errors + } + } +} + +/** + * Evaluate a case using its rubric + */ +async function evaluateWithRubric( + caseData: Case, + sandbox: Sandbox, + _options: RunnerOptions, + _agentResult: AgentResult, + _agentFiles: { path: string; content: string; changed: boolean }[] +): Promise { + const registry = getRubricRegistry(); + const rubric = registry.resolve(caseData.rubric); + + const criteriaResults: CriterionResult[] = []; + let totalWeightedScore = 0; + let _totalWeight = 0; + const evalStartTime = Date.now(); + + // Evaluate each criterion in the rubric + for (const [criterionKey, criterion] of Object.entries(rubric.criteria)) { + const evaluatorResults: EvaluatorResult[] = []; + let criterionScore = 0; + let evaluatorCount = 0; + + for (const evaluator of criterion.evaluators) { + let evalResult: Omit; + + if (evaluator.type === 'command') { + // Run command evaluator + const result = await sandbox.exec(evaluator.run, { + timeoutSeconds: 60, + }); + + const passed = result.exitCode === 0; + let score = passed ? 
1.0 : 0.0; + + // Handle partial credit + if (evaluator.partialCredit && !passed) { + // For test runners, try to parse pass/fail ratio + const testMatch = result.stdout.match(/(\d+) passed/); + const failMatch = result.stdout.match(/(\d+) failed/); + if (testMatch && failMatch) { + const passedTests = parseInt(testMatch[1], 10); + const failedTests = parseInt(failMatch[1], 10); + const total = passedTests + failedTests; + if (total > 0) { + score = passedTests / total; + } + } + } + + evalResult = { + passed, + score, + evidence: (result.stdout + '\n' + result.stderr).trim(), + details: { + exitCode: result.exitCode, + timedOut: result.timedOut, + }, + }; + } else if (evaluator.type === 'pattern') { + // Run pattern evaluator (check for matches in files) + // Default to fail until fully implemented + evalResult = { + passed: false, + score: 0.0, + evidence: 'Pattern check not yet implemented', + }; + } else if ((evaluator.type as any) === 'llm_judge' || (evaluator.type as any) === 'llm_judge_comparison') { + // Run LLM judge evaluator + // TODO: Implement baseline answer storage and comparison + // For now, use a placeholder evaluator + evalResult = { + passed: false, + score: 0.0, + evidence: 'LLM judge comparison not yet fully implemented', + }; + } else { + // Other evaluator types (llm_judge, benchmark, etc.) - not implemented + evalResult = { + passed: false, + score: 0.0, + evidence: `Evaluator type '${evaluator.type}' not yet implemented`, + }; + } + evaluatorResults.push({ + name: evaluator.name || evaluator.type, + type: evaluator.type as EvaluatorType, + durationMs: Date.now() - evalStartTime, + ...evalResult, + }); + + if (!evaluator.optional) { + criterionScore += evalResult.score; + evaluatorCount++; + } + } + + // Average score for this criterion + // If no non-optional evaluators ran, this criterion doesn't participate in scoring + const hasRequiredEvaluators = evaluatorCount > 0; + const rawScore = hasRequiredEvaluators ? 
criterionScore / evaluatorCount : 0.0; + const weightedScore = hasRequiredEvaluators ? (rawScore * criterion.weight) / 100 : 0; + const allPassed = evaluatorResults.filter((e) => !e.passed).length === 0; + + criteriaResults.push({ + name: criterionKey, + weight: criterion.weight, + score: rawScore, + passed: allPassed, + evidence: `Criterion: ${criterionKey}`, + evaluatorResults, + durationMs: Date.now() - evalStartTime, + }); + + totalWeightedScore += weightedScore; + // Only count weight for criteria that had non-optional evaluators + if (hasRequiredEvaluators) { + _totalWeight += criterion.weight; + } + } + + // Normalize score by participating weight (criteria with only optional evaluators are excluded) + // Each criterion's weightedScore = rawScore * weight / 100, so totalWeightedScore + // is a fraction of 1.0 when all weights sum to 100. When some criteria are excluded, + // rescale so the participating criteria fill the full 0-100% range. + const participatingFraction = _totalWeight / 100; + const overallScore = participatingFraction > 0 ? 
(totalWeightedScore / participatingFraction) * 100 : 0; + + // Determine pass/fail (default threshold: 70%) + const passThreshold = 70; + const passed = overallScore >= passThreshold; + + const result: CaseResult = { + id: caseData.id, + title: caseData.title, + score: overallScore, + passed, + evidence: `Overall score: ${overallScore.toFixed(2)}%`, + criteria: criteriaResults, + evaluators: [], + durationMs: Date.now() - evalStartTime, + timestamp: new Date(), + }; + return result; +} + +/** + * Install dependencies based on language + */ +async function installDependencies( + sandbox: Sandbox, + language: string, + options: RunnerOptions, + caseIndex: number, + totalCases: number, + caseId: string +): Promise { + const langLower = language.toLowerCase(); + + options.onProgress?.({ + type: 'running', + caseId, + caseIndex, + totalCases, + message: 'Installing dependencies...', + }); + + if (langLower === 'python') { + // Check for requirements.txt + const result = await sandbox.exec('test -f requirements.txt && pip install -r requirements.txt || true'); + if (result.exitCode !== 0 && result.stderr) { + console.warn('Warning: pip install failed:', result.stderr); + } + // Also install pytest if running tests + await sandbox.exec('pip install pytest --quiet 2>/dev/null || true'); + } else if (langLower === 'javascript' || langLower === 'typescript' || langLower === 'node') { + // Check for package.json + const result = await sandbox.exec('test -f package.json && npm install --silent || true'); + if (result.exitCode !== 0 && result.stderr) { + console.warn('Warning: npm install failed:', result.stderr); + } + } else if (langLower === 'go' || langLower === 'golang') { + // Check for go.mod + await sandbox.exec('test -f go.mod && go mod download || true'); + } +} + +/** + * Snapshot all files in the workspace after the agent runs. + * Compares against the original case files to flag which ones changed. 
+ * Reads directly from the host tempDir (bind-mounted into the sandbox). + */ +function snapshotFiles( + tempDir: string, + originalFiles?: CaseFile[] +): { path: string; content: string; changed: boolean }[] { + const results: { path: string; content: string; changed: boolean }[] = []; + const origMap = new Map(); + + // Build map of original file contents for comparison + if (originalFiles) { + for (const f of originalFiles) { + if (f.content !== undefined) { + origMap.set(f.path, f.content); + } + } + } + + // Walk the temp directory and collect all files + function walk(dir: string, prefix: string) { + let entries: fs.Dirent[]; + try { + entries = fs.readdirSync(dir, { withFileTypes: true }); + } catch { + return; + } + for (const entry of entries) { + const relPath = prefix ? `${prefix}/${entry.name}` : entry.name; + const fullPath = path.join(dir, entry.name); + + // Skip common non-essential directories + if (entry.isDirectory()) { + if (['node_modules', '.git', '__pycache__', '.pytest_cache', 'venv', '.venv'].includes(entry.name)) { + continue; + } + walk(fullPath, relPath); + continue; + } + + if (!entry.isFile()) continue; + + // Skip binary and large files + try { + const stat = fs.statSync(fullPath); + if (stat.size > 100_000) continue; // Skip files over 100KB + } catch { + continue; + } + + try { + const content = fs.readFileSync(fullPath, 'utf-8'); + const original = origMap.get(relPath); + const changed = original === undefined || original !== content; + results.push({ path: relPath, content, changed }); + } catch { + // Skip files that can't be read as UTF-8 + } + } + } + + walk(tempDir, ''); + return results; +} diff --git a/src/evaluation/runner.ts.orig b/src/evaluation/runner.ts.orig new file mode 100644 index 0000000..b4c67c6 --- /dev/null +++ b/src/evaluation/runner.ts.orig @@ -0,0 +1,566 @@ +/** + * Evaluation runner - executes cases in sandboxes and evaluates results + * + * This is the core evaluation engine that: + * 1. 
Sets up the sandbox environment + * 2. Runs the case (agent attempts to solve the problem) + * 3. Applies the rubric to evaluate the result + */ + +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { + Case, + CaseFile, + CaseResult, + CriterionResult, + EvaluatorResult, + RunResult, + RunSummary, + EvaluatorType, +} from '../cases/types'; +import { createSandboxManager, checkDocker, RECOMMENDED_IMAGES } from '../sandbox'; +import { Sandbox, SandboxConfig } from '../sandbox/types'; +import { getRubricRegistry } from '../rubrics/loader'; +import { getAgent } from '../agents/registry'; +import { runLLMJudgeEvaluator } from './llm-judge'; +import type { AgentResult } from '../agents/types'; + +export interface RunnerOptions { + /** Agent being evaluated (for logging) */ + agent: string; + + /** Model to use (passed to agent) */ + model?: string; + + /** Timeout per case in seconds */ + timeoutSeconds?: number; + + /** Enable network in sandbox */ + networkEnabled?: boolean; + + /** Callback for progress updates */ + onProgress?: (update: ProgressUpdate) => void; + + /** Callback when a case completes */ + onCaseComplete?: (result: CaseResult) => void; +} + +export interface ProgressUpdate { + type: 'starting' | 'running' | 'validating' | 'complete' | 'error'; + caseId: string; + caseIndex: number; + totalCases: number; + message?: string; +} + +/** + * Get the appropriate Docker image for a language + */ +function getImageForLanguage(language: string): string { + const langLower = language.toLowerCase(); + + if (langLower === 'javascript' || langLower === 'typescript' || langLower === 'node') { + return RECOMMENDED_IMAGES.node.latest; + } + if (langLower === 'python') { + return RECOMMENDED_IMAGES.python.latest; + } + if (langLower === 'go' || langLower === 'golang') { + return RECOMMENDED_IMAGES.go.latest; + } + if (langLower === 'rust') { + return RECOMMENDED_IMAGES.rust.latest; + } + if (langLower === 'java') { + return 
RECOMMENDED_IMAGES.java.latest; + } + + // Default to Node.js for unknown languages + return RECOMMENDED_IMAGES.node.latest; +} + +/** + * Run a set of cases and return results + */ +export async function runCases(cases: Case[], options: RunnerOptions): Promise { + const runId = `run-${Date.now()}-${Math.random().toString(36).substring(2, 8)}`; + const startedAt = new Date(); + const results: CaseResult[] = []; + + // Check Docker availability first + const dockerStatus = await checkDocker(); + if (!dockerStatus.available) { + throw new Error(`Docker is not available: ${dockerStatus.error}\n${dockerStatus.suggestion}`); + } + + const manager = createSandboxManager(); + let rubricId = 'default'; + + try { + for (let i = 0; i < cases.length; i++) { + const caseData = cases[i]; + + options.onProgress?.({ + type: 'starting', + caseId: caseData.id, + caseIndex: i, + totalCases: cases.length, + message: `Starting ${caseData.title}`, + }); + + try { + const result = await runSingleCase(caseData, manager, options, i, cases.length); + results.push(result); + options.onCaseComplete?.(result); + // Track the rubric ID from the first case + if (i === 0) { + const registry = getRubricRegistry(); + const rubric = registry.resolve(caseData.rubric); + rubricId = rubric.id; + } + } catch (err) { + const errorResult: CaseResult = { + id: caseData.id, + title: caseData.title, + score: 0, + passed: false, + evidence: (err as Error).message, + criteria: [], + evaluators: [], + durationMs: 0, + error: (err as Error).message, + timestamp: new Date(), + }; + results.push(errorResult); + options.onCaseComplete?.(errorResult); + } + } + } finally { + // Clean up all sandboxes + await manager.destroyAll(); + } + + const completedAt = new Date(); + const totalDurationMs = completedAt.getTime() - startedAt.getTime(); + + // Calculate summary + const scores = results.map((r) => r.score); + const averageScore = scores.length > 0 ? 
scores.reduce((a, b) => a + b, 0) / scores.length : 0; + + const summary: RunSummary = { + total: results.length, + passed: results.filter((r) => r.passed).length, + failed: results.filter((r) => !r.passed && !r.error).length, + skipped: 0, + timedOut: results.filter((r) => r.timedOut).length, + averageScore, + totalDurationMs, + }; + + return { + id: runId, + timestamp: startedAt, + cases: results, + summary, + durationMs: totalDurationMs, + agent: options.agent, + rubricId, + }; +} + +/** + * Run a single case in a sandbox + */ +async function runSingleCase( + caseData: Case, + manager: ReturnType, + options: RunnerOptions, + caseIndex: number, + totalCases: number +): Promise { + const startTime = Date.now(); + + // Create a temporary directory for this case + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), `sniff-${caseData.id}-`)); + + try { + // Write case files to temp directory (if any) + if (caseData.files) { + for (const file of caseData.files) { + const filePath = path.join(tempDir, file.path); + const fileDir = path.dirname(filePath); + + // Create directories if needed + fs.mkdirSync(fileDir, { recursive: true }); + if (file.content !== undefined) { + fs.writeFileSync(filePath, file.content); + } + } + } + + // Create sandbox + const sandboxConfig: SandboxConfig = { + workdir: tempDir, + image: getImageForLanguage(caseData.language), + timeoutSeconds: options.timeoutSeconds || 300, + networkEnabled: options.networkEnabled || false, + }; + + options.onProgress?.({ + type: 'running', + caseId: caseData.id, + caseIndex, + totalCases, + message: 'Creating sandbox...', + }); + + const sandbox = await manager.create(sandboxConfig); + + try { + // Install dependencies if needed + await installDependencies(sandbox, caseData.language, options, caseIndex, totalCases, caseData.id); + + // Run the agent to attempt to solve the case + options.onProgress?.({ + type: 'running', + caseId: caseData.id, + caseIndex, + totalCases, + message: 'Running agent...', + 
}); + + const agent = getAgent(options.agent); + const agentResult: AgentResult = await agent.run(caseData.prompt, { + cwd: tempDir, + model: options.model, + timeoutMs: (options.timeoutSeconds || 300) * 1000, + permissionMode: 'acceptEdits', + }); + + if (!agentResult.success) { + throw new Error(`Agent execution failed: ${agentResult.error}`); + } + + // Snapshot files the agent produced (before rubric evaluation) + const agentFiles = snapshotFiles(tempDir, caseData.files); + + // Evaluate using the rubric + options.onProgress?.({ + type: 'validating', + caseId: caseData.id, + caseIndex, + totalCases, + message: 'Evaluating with rubric...', + }); + + const result = await evaluateWithRubric(caseData, sandbox, options, agentResult, agentFiles); + const durationMs = Date.now() - startTime; + + options.onProgress?.({ + type: 'complete', + caseId: caseData.id, + caseIndex, + totalCases, + message: result.passed ? `Passed (${Math.round(result.score)}%)` : `Failed (${Math.round(result.score)}%)`, + }); + + return { + ...result, + agentResponse: agentResult.answer, + agentToolCalls: agentResult.toolCalls.map((t) => ({ + name: t.name, + durationMs: t.durationMs || 0, + success: t.success || false, + })), + agentModel: agentResult.model, + agentTokens: agentResult.tokens + ? 
{ + input: agentResult.tokens.inputTokens, + output: agentResult.tokens.outputTokens, + total: agentResult.tokens.totalTokens, + } + : undefined, + agentFiles, + durationMs, + timestamp: new Date(), + }; + } finally { + await sandbox.destroy(); + } + } finally { + // Clean up temp directory + try { + fs.rmSync(tempDir, { recursive: true, force: true }); + } catch { + // Ignore cleanup errors + } + } +} + +/** + * Evaluate a case using its rubric + */ +async function evaluateWithRubric( + caseData: Case, + sandbox: Sandbox, + _options: RunnerOptions, + agentResult: AgentResult, + agentFiles: { path: string; content: string; changed: boolean }[] +): Promise { + const registry = getRubricRegistry(); + const rubric = registry.resolve(caseData.rubric); + + const criteriaResults: CriterionResult[] = []; + let totalWeightedScore = 0; + let _totalWeight = 0; + + // Evaluate each criterion in the rubric + for (const [criterionKey, criterion] of Object.entries(rubric.criteria)) { + const evaluatorResults: EvaluatorResult[] = []; + let criterionScore = 0; + let evaluatorCount = 0; + + for (const evaluator of criterion.evaluators) { + const evalStartTime = Date.now(); + let evalResult: Omit; + + if (evaluator.type === 'command') { + // Run command evaluator + const result = await sandbox.exec(evaluator.run, { + timeoutSeconds: 60, + }); + + const passed = result.exitCode === 0; + let score = passed ? 
1.0 : 0.0; + + // Handle partial credit + if (evaluator.partialCredit && !passed) { + // For test runners, try to parse pass/fail ratio + const testMatch = result.stdout.match(/(\d+) passed/); + const failMatch = result.stdout.match(/(\d+) failed/); + if (testMatch && failMatch) { + const passedTests = parseInt(testMatch[1], 10); + const failedTests = parseInt(failMatch[1], 10); + const total = passedTests + failedTests; + if (total > 0) { + score = passedTests / total; + } + } + } + + evalResult = { + passed, + score, + evidence: (result.stdout + '\n' + result.stderr).trim(), + details: { + exitCode: result.exitCode, + timedOut: result.timedOut, + }, + }; + } else if (evaluator.type === 'pattern') { + // Run pattern evaluator (check for matches in files) + // Default to fail until fully implemented + evalResult = { + passed: false, + score: 0.0, + evidence: 'Pattern check not yet implemented', + }; + } else if (evaluator.type === 'llm_judge') { + // Run LLM judge evaluator + const result = await runLLMJudgeEvaluator(evaluator, agentResult.answer, JSON.stringify(agentFiles)); + evalResult = { + passed: result.passed, + score: result.score, + evidence: result.evidence, + details: result.details, + }; + } else if ((evaluator.type as any) === 'llm_judge_comparison') { + // Run LLM judge comparison evaluator + // TODO: Implement baseline answer storage and comparison + // For now, use a placeholder evaluator + evalResult = { + passed: false, + score: 0.0, + evidence: 'LLM judge comparison not yet fully implemented', + }; + } else { + // Other evaluator types (llm_judge, benchmark, etc.) 
- not implemented + evalResult = { + passed: false, + score: 0.0, + evidence: `Evaluator type '${evaluator.type}' not yet implemented`, + }; + } + + const evalDurationMs = Date.now() - evalStartTime; + + evaluatorResults.push({ + name: evaluator.name || evaluator.type, + type: evaluator.type as EvaluatorType, + durationMs: evalDurationMs, + ...evalResult, + }); + + if (!evaluator.optional) { + criterionScore += evalResult.score; + evaluatorCount++; + } + } + + // Average score for this criterion + // If no non-optional evaluators ran, this criterion doesn't participate in scoring + const hasRequiredEvaluators = evaluatorCount > 0; + const rawScore = hasRequiredEvaluators ? criterionScore / evaluatorCount : 0.0; + const weightedScore = hasRequiredEvaluators ? (rawScore * criterion.weight) / 100 : 0; + const allPassed = evaluatorResults.filter((e) => !e.passed).length === 0; + + criteriaResults.push({ + name: criterionKey, + weight: criterion.weight, + score: rawScore, + passed: allPassed, + evidence: `Criterion: ${criterionKey}`, + evaluatorResults, + durationMs: evalDurationMs, + }); + + totalWeightedScore += weightedScore; + // Only count weight for criteria that had non-optional evaluators + if (hasRequiredEvaluators) { + _totalWeight += criterion.weight; + } + } + + // Normalize score by participating weight (criteria with only optional evaluators are excluded) + // Each criterion's weightedScore = rawScore * weight / 100, so totalWeightedScore + // is a fraction of 1.0 when all weights sum to 100. When some criteria are excluded, + // rescale so the participating criteria fill the full 0-100% range. + const participatingFraction = _totalWeight / 100; + const overallScore = participatingFraction > 0 ? 
(totalWeightedScore / participatingFraction) * 100 : 0; + + // Determine pass/fail (default threshold: 70%) + const passThreshold = 70; + const passed = overallScore >= passThreshold; + + return { + id: caseData.id, + title: caseData.title, + score: overallScore, + passed, + evidence: `Overall score: ${overallScore.toFixed(2)}%`, + criteria: criteriaResults, + evaluators: [], + durationMs: Date.now() - evalStartTime, + timestamp: new Date(), + }; +} + +/** + * Install dependencies based on language + */ +async function installDependencies( + sandbox: Sandbox, + language: string, + options: RunnerOptions, + caseIndex: number, + totalCases: number, + caseId: string +): Promise { + const langLower = language.toLowerCase(); + + options.onProgress?.({ + type: 'running', + caseId, + caseIndex, + totalCases, + message: 'Installing dependencies...', + }); + + if (langLower === 'python') { + // Check for requirements.txt + const result = await sandbox.exec('test -f requirements.txt && pip install -r requirements.txt || true'); + if (result.exitCode !== 0 && result.stderr) { + console.warn('Warning: pip install failed:', result.stderr); + } + // Also install pytest if running tests + await sandbox.exec('pip install pytest --quiet 2>/dev/null || true'); + } else if (langLower === 'javascript' || langLower === 'typescript' || langLower === 'node') { + // Check for package.json + const result = await sandbox.exec('test -f package.json && npm install --silent || true'); + if (result.exitCode !== 0 && result.stderr) { + console.warn('Warning: npm install failed:', result.stderr); + } + } else if (langLower === 'go' || langLower === 'golang') { + // Check for go.mod + await sandbox.exec('test -f go.mod && go mod download || true'); + } +} + +/** + * Snapshot all files in the workspace after the agent runs. + * Compares against the original case files to flag which ones changed. + * Reads directly from the host tempDir (bind-mounted into the sandbox). 
/**
 * Snapshot the text files found under `tempDir` after the agent has run.
 *
 * Walks the directory tree on the host and returns one entry per regular file, holding
 * its path relative to `tempDir` (joined with `/`) and its UTF-8 content. An entry is
 * flagged `changed` when no original file with the same path and a defined `content`
 * exists, or when that original content differs from what is on disk now.
 *
 * Skipped on purpose:
 *  - dependency / cache directories (`node_modules`, `.git`, `__pycache__`,
 *    `.pytest_cache`, `venv`, `.venv`);
 *  - anything that is not a regular file or directory (e.g. symlinks);
 *  - files larger than 100 000 bytes;
 *  - binary files, detected by a NUL byte or by bytes that are not valid UTF-8.
 *
 * Directories or files that cannot be read are skipped silently: the snapshot is
 * best effort and never throws for I/O errors inside the tree.
 *
 * @param tempDir - Root directory to snapshot.
 * @param originalFiles - Original case files used as the baseline for `changed`;
 *   entries without `content` are ignored.
 * @returns Snapshot entries in a deterministic order (each directory's entries are
 *   visited sorted by name).
 */
function snapshotFiles(
  tempDir: string,
  originalFiles?: CaseFile[]
): { path: string; content: string; changed: boolean }[] {
  const SKIPPED_DIRS = new Set(['node_modules', '.git', '__pycache__', '.pytest_cache', 'venv', '.venv']);
  const MAX_FILE_BYTES = 100_000;
  // `fatal` makes decode() throw on invalid UTF-8 instead of inserting U+FFFD;
  // `ignoreBOM` keeps a leading BOM in the text, matching readFileSync(path, 'utf-8').
  const strictUtf8 = new TextDecoder('utf-8', { fatal: true, ignoreBOM: true });

  const results: { path: string; content: string; changed: boolean }[] = [];

  // Baseline contents keyed by relative path, used to compute `changed`.
  const origMap = new Map<string, string>();
  for (const f of originalFiles ?? []) {
    if (f.content !== undefined) {
      origMap.set(f.path, f.content);
    }
  }

  // Depth-first walk; `prefix` is the path of `dir` relative to `tempDir` ('' at the root).
  function walk(dir: string, prefix: string): void {
    let entries: fs.Dirent[];
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch {
      return; // Unreadable or missing directory: nothing to snapshot here.
    }

    // readdir order depends on the filesystem; sort so snapshots are reproducible.
    entries.sort((a, b) => (a.name < b.name ? -1 : a.name > b.name ? 1 : 0));

    for (const entry of entries) {
      const relPath = prefix ? `${prefix}/${entry.name}` : entry.name;
      const fullPath = path.join(dir, entry.name);

      if (entry.isDirectory()) {
        if (!SKIPPED_DIRS.has(entry.name)) {
          walk(fullPath, relPath);
        }
        continue;
      }

      if (!entry.isFile()) continue;

      let content: string;
      try {
        // Check the size first so oversized files are never loaded into memory.
        if (fs.statSync(fullPath).size > MAX_FILE_BYTES) continue;

        const raw = fs.readFileSync(fullPath);
        // NUL bytes are valid UTF-8 but practically never occur in text files.
        if (raw.includes(0)) continue;
        content = strictUtf8.decode(raw);
      } catch {
        // Unreadable file, or bytes that are not valid UTF-8: skip it.
        continue;
      }

      // A missing baseline (undefined) never equals a string, so new files count as changed.
      results.push({ path: relPath, content, changed: origMap.get(relPath) !== content });
    }
  }

  walk(tempDir, '');
  return results;
}