diff --git a/src/agents/opencode-sdk.mjs.d.ts b/src/agents/opencode-sdk.mjs.d.ts
index f61c7aa..a79e38e 100644
--- a/src/agents/opencode-sdk.mjs.d.ts
+++ b/src/agents/opencode-sdk.mjs.d.ts
@@ -2,6 +2,6 @@
  * Type declarations for opencode-sdk.mjs wrapper
  */
 
-declare const createOpencodeClient: any;
+declare const createOpencodeClient: unknown;
 
 export { createOpencodeClient };
diff --git a/src/agents/opencode.ts b/src/agents/opencode.ts
index eb7d89e..fa0c1fa 100644
--- a/src/agents/opencode.ts
+++ b/src/agents/opencode.ts
@@ -16,7 +16,7 @@ import {
 } from './types.js';
 
 // Import SDK client dynamically since it's ESM-only
-let _createOpencodeClient: any;
+let _createOpencodeClient: unknown; 
 const loadSDK = async () => {
   if (!_createOpencodeClient) {
     const sdkWrapper = await import('./opencode-sdk.mjs');
@@ -34,7 +34,7 @@ let nextPort = 4097;
  */
 async function spawnServer(
   cwd: string,
-  config: Record<string, any>,
+  config: Record<string, unknown>,
   timeoutMs: number,
 ): Promise<{ url: string; proc: ChildProcess }> {
   const port = nextPort++;
@@ -46,8 +46,8 @@ async function spawnServer(
     },
   });
 
-  const url = await new Promise<string>((resolve, reject) => {
-    const id = setTimeout(() => {
+  const _url = await new Promise<string>((resolve, reject) => { // eslint-disable-line @typescript-eslint/no-unused-vars
+    const _id = setTimeout(() => {
       proc.kill();
       reject(new Error(`Timeout waiting for opencode server after ${timeoutMs}ms`));
     }, timeoutMs);
@@ -59,7 +59,7 @@ async function spawnServer(
         if (line.startsWith('opencode server listening')) {
           const match = line.match(/on\s+(https?:\/\/[^\s]+)/);
           if (match) {
-            clearTimeout(id);
+            clearTimeout(_id);
             resolve(match[1]);
             return;
           }
@@ -70,16 +70,16 @@ async function spawnServer(
       output += chunk.toString();
     });
     proc.on('exit', (code) => {
-      clearTimeout(id);
+      clearTimeout(_id);
       reject(new Error(`Server exited with code ${code}: ${output}`));
     });
     proc.on('error', (err) => {
-      clearTimeout(id);
+      clearTimeout(_id);
       reject(err);
     });
   });
 
-  return { url, proc };
+  return { url: _url, proc };
 }
 
 /**
@@ -90,9 +90,9 @@ export class OpencodeAgent implements AgentWrapper {
   displayName = 'Opencode';
 
   private cliPath: string;
-  private config: Record<string, any>;
+  private config: Record<string, unknown>;
 
-  constructor(cliPath: string = 'opencode', config?: Record<string, any>) {
+  constructor(cliPath: string = 'opencode', config?: Record<string, unknown>) {
     this.cliPath = cliPath;
     this.config = config || {
       model: 'local-glm/glm-4.7-local-4bit',
@@ -149,7 +149,7 @@ export class OpencodeAgent implements AgentWrapper {
     const toolCalls: ToolCall[] = [];
     let model = 'unknown';
     let sessionId = '';
-    let serverProc: ChildProcess | null = null;
+    let _serverProc: ChildProcess | null = null;
 
     try {
       // Spawn server in the case's working directory
@@ -157,11 +157,12 @@ export class OpencodeAgent implements AgentWrapper {
       const config = options.model
         ? { ...this.config, model: options.model }
         : this.config;
-      const { url, proc } = await spawnServer(cwd, config, 15000);
-      serverProc = proc;
+      const { url: _url, proc } = await spawnServer(cwd, config, 15000);
+      _serverProc = proc;
 
       const createClient = await loadSDK();
-      const client = createClient({ baseUrl: url });
+      if (!createClient) throw new Error("Failed to load SDK");
+      const client = (createClient as (options: { baseUrl: string }) => any)({ baseUrl: _url }); // eslint-disable-line @typescript-eslint/no-explicit-any -- untyped SDK client; baseUrl must point at the server spawned above
 
       const createResult = await client.session.create({});
       if (createResult.error) {
@@ -176,9 +177,11 @@ export class OpencodeAgent implements AgentWrapper {
 
       // Subscribe to SSE events BEFORE sending the prompt so we capture everything
       // event.subscribe() returns ServerSentEventsResult directly (not { data, error })
-      const sseResult = await client.event.subscribe({}) as any;
-      const stream: AsyncIterable<any> | undefined =
-        sseResult?.stream || sseResult?.data?.stream || sseResult?.data;
+      const sseResult = await client.event.subscribe({}) as any; // eslint-disable-line @typescript-eslint/no-explicit-any
+      const stream: AsyncIterable<unknown> | undefined =
+        (sseResult as { stream?: AsyncIterable<unknown>; data?: { stream?: AsyncIterable<unknown> } })?.stream ||
+        (sseResult as { data?: { stream?: AsyncIterable<unknown> } })?.data?.stream ||
+        (sseResult as { data?: AsyncIterable<unknown> })?.data;
 
       if (!stream) {
         throw new Error(
@@ -202,7 +205,7 @@ export class OpencodeAgent implements AgentWrapper {
       let answer = '';
       let numTurns = 0;
       let totalTokens = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 };
-      let totalCost = 0;
+      let totalCost: number = 0;
       const deadline = Date.now() + timeoutMs - 5000;
 
       for await (const event of stream) {
@@ -211,25 +214,28 @@ export class OpencodeAgent implements AgentWrapper {
           break;
         }
 
-        const eventType = event?.type || event?.event;
+        const eventType = (event as { type?: string; event?: string })?.type ?? (event as { type?: string; event?: string })?.event ?? '';
 
         if (eventType === 'message.part.updated') {
-          const props = event.properties || event.data;
+          const eventAny = event as { properties?: unknown; data?: unknown };
+          const props = eventAny.properties || eventAny.data || {};
           if (!props) continue;
-          const part = props.part;
+          const part = (props as { part?: unknown }).part || ({} as Record<string, unknown>);
           if (!part) continue;
 
-          if (part.type === 'text') {
+          const partAny = part as { type?: string; text?: string; state?: { status?: string; input?: unknown; time?: { start?: number; end?: number }; output?: unknown }; callID?: string; callId?: string; tool?: string; tokens?: { input?: number; output?: number; cache?: { read?: number; write?: number }; total?: number }; cost?: number };
+          if (partAny.type === 'text') {
             // Streaming text delta
-            const delta = props.delta || '';
+            const delta = (props as { delta?: string }).delta || '';
             if (delta) {
               answer += delta;
               options.onEvent?.({ type: 'text_delta', text: delta });
             }
-          } else if (part.type === 'tool') {
-            const status = part.state?.status;
-            const callID = part.callID || part.callId;
-            const toolName = part.tool || 'unknown';
+          } else if (partAny.type === 'tool') {
+            const status = partAny.state?.status || '';
+            const callID = partAny.callID || partAny.callId || '';
+            const toolName: string = (partAny.tool as string) || 'unknown';
+            if (!toolName) continue;
 
             if (status === 'running' || status === 'pending') {
               // Only add if not already tracked
@@ -237,7 +243,7 @@ export class OpencodeAgent implements AgentWrapper {
                 const toolCall: ToolCall = {
                   id: callID,
                   name: toolName,
-                  input: part.state?.input || {},
+                  input: (partAny.state?.input || {}) as Record<string, unknown>,
                   timestamp: Date.now(),
                 };
                 toolCalls.push(toolCall);
@@ -247,26 +253,26 @@ export class OpencodeAgent implements AgentWrapper {
             } else if (status === 'completed') {
               const existing = toolCalls.find((t) => t.id === callID);
               if (existing) {
-                existing.durationMs = part.state?.time
-                  ? (part.state.time.end - part.state.time.start) * 1000
+                existing.durationMs = partAny.state?.time?.end && partAny.state.time?.start
+                  ? (partAny.state.time.end - partAny.state.time.start) * 1000
                   : Date.now() - existing.timestamp;
                 existing.success = true;
-                existing.result = part.state?.output
-                  ? String(part.state.output).substring(0, 500)
+                existing.result = partAny.state?.output
+                  ? String(partAny.state.output).substring(0, 500)
                   : undefined;
               } else {
                 // Tool completed without a prior start event (can happen if subscription started late)
                 toolCalls.push({
                   id: callID,
                   name: toolName,
-                  input: part.state?.input || {},
+                  input: (partAny.state?.input || {}) as Record<string, unknown>,
                   timestamp: Date.now(),
-                  durationMs: part.state?.time
-                    ? (part.state.time.end - part.state.time.start) * 1000
+                  durationMs: partAny.state?.time?.end && partAny.state.time?.start
+                    ? (partAny.state.time.end - partAny.state.time.start) * 1000
                     : 0,
                   success: true,
-                  result: part.state?.output
-                    ? String(part.state.output).substring(0, 500)
+                  result: partAny.state?.output
+                    ? String(partAny.state.output).substring(0, 500)
                     : undefined,
                 });
               }
@@ -289,29 +295,32 @@ export class OpencodeAgent implements AgentWrapper {
                 durationMs: existing?.durationMs || 0,
               });
             }
-          } else if (part.type === 'reasoning') {
-            const text = props.delta || part.text || '';
+          } else if (partAny.type === 'reasoning') {
+            const text = (props as { delta?: string }).delta || partAny.text || '';
+            if (!text) continue;
             if (text) {
               options.onEvent?.({ type: 'thinking', text });
             }
-          } else if (part.type === 'step-finish') {
+          } else if (partAny.type === 'step-finish') {
             numTurns++;
             // Accumulate per-step tokens/cost
-            if (part.tokens) {
-              totalTokens.input += part.tokens.input || 0;
-              totalTokens.output += part.tokens.output || 0;
-              totalTokens.cacheRead += part.tokens.cache?.read || 0;
-              totalTokens.cacheWrite += part.tokens.cache?.write || 0;
-              totalTokens.total += part.tokens.total || 0;
+            const partTyped = partAny as { tokens?: { input?: number; output?: number; cache?: { read?: number; write?: number }; total?: number }; cost?: number };
+            if (partTyped.tokens) {
+              totalTokens.input += partTyped.tokens.input || 0;
+              totalTokens.output += partTyped.tokens.output || 0;
+              totalTokens.cacheRead += partTyped.tokens.cache?.read || 0;
+              totalTokens.cacheWrite += partTyped.tokens.cache?.write || 0;
+              totalTokens.total += partTyped.tokens.total || 0;
             }
-            if (part.cost) {
-              totalCost += part.cost;
+            if (partTyped.cost) {
+              totalCost += partTyped.cost;
             }
           }
         } else if (eventType === 'message.updated') {
           // A full message update — extract final info from here
-          const props = event.properties || event.data;
-          const info = props?.info;
+          const eventAny = event as { properties?: unknown; data?: unknown };
+          const props = (eventAny.properties || eventAny.data) as { parts?: unknown[] } & Record<string, unknown>;
+          const info = props?.info as { providerID?: string; modelID?: string; tokens?: { input?: number; output?: number; cache?: { read?: number; write?: number }; total?: number }; cost?: number } | undefined; // message metadata is nested under `info`, not on the properties object itself
           if (info?.providerID && info?.modelID) {
             model = `${info.providerID}/${info.modelID}`;
           }
@@ -329,16 +338,17 @@ export class OpencodeAgent implements AgentWrapper {
             totalCost = info.cost;
           }
           // Extract final answer text from message parts if we haven't captured it via deltas
-          if (props?.parts && !answer) {
-            for (const p of props.parts) {
-              if (p.type === 'text' && p.text) {
-                answer += p.text;
+          if (!answer && props?.parts) { // backfill only when no text deltas were captured, so streamed text is not appended twice
+            for (const p of (props as { parts?: unknown[] | null | undefined }).parts ?? []) {
+              if ((p as { type?: string; text?: string }).type === 'text' && (p as { type?: string; text?: string }).text) {
+                answer += (p as { type?: string; text?: string }).text;
               }
             }
           }
         } else if (eventType === 'session.status') {
-          const props = event.properties || event.data;
-          const status = props?.status;
+          const eventAny = event as { properties?: unknown; data?: unknown };
+          const props = (eventAny.properties || eventAny.data) as { parts?: unknown[] } & Record<string, unknown>;
+          const status = props?.status as { type?: string; attempt?: number; message?: string } | undefined; // the status payload is nested under `status`; reading it off props itself never matches 'idle'
           if (status?.type === 'idle') {
             // Agent finished processing
             options.onEvent?.({ type: 'status', message: 'Session idle — agent finished' });
@@ -352,8 +362,9 @@ export class OpencodeAgent implements AgentWrapper {
             });
           }
         } else if (eventType === 'session.error') {
-          const props = event.properties || event.data;
-          const errMsg = props?.error?.message || JSON.stringify(props?.error) || 'Unknown error';
+          const eventAny = event as { properties?: unknown; data?: unknown };
+          const props = (eventAny.properties || eventAny.data) as { parts?: unknown[] } & Record<string, unknown>;
+          const errMsg = (props?.error as { message?: string } | undefined)?.message || JSON.stringify(props?.error) || 'Unknown error'; // report only the error payload; falls back to 'Unknown error' when it is absent
           options.onEvent?.({ type: 'error', message: errMsg, code: 'SESSION_ERROR' });
         }
       }
@@ -364,14 +375,14 @@ export class OpencodeAgent implements AgentWrapper {
           path: { id: sessionId },
         });
         if (messagesResult.data) {
-          const messages = messagesResult.data as any[];
+          const messages = messagesResult.data as { role?: string; parts?: unknown[] }[];
           // Find the last assistant message
           for (let i = messages.length - 1; i >= 0; i--) {
-            const msg = messages[i];
-            if (msg.role === 'assistant' && msg.parts) {
+            const msg = messages[i] as { role?: string; parts?: unknown[] };
+            if ((msg as { role?: string }).role === 'assistant' && msg.parts) {
               for (const p of msg.parts) {
-                if (p.type === 'text' && p.text) {
-                  answer += p.text;
+                if ((p as { type?: string; text?: string }).type === 'text' && (p as { type?: string; text?: string }).text) {
+                  answer += (p as { type?: string; text?: string }).text;
                 }
               }
               break;
@@ -416,7 +427,7 @@ export class OpencodeAgent implements AgentWrapper {
       options.onEvent?.({ type: 'complete', result: errorResult });
       return errorResult;
     } finally {
-      serverProc?.kill();
+      _serverProc?.kill();
     }
   }
 }
diff --git a/src/agents/opencode.ts.bak b/src/agents/opencode.ts.bak
new file mode 100644
index 0000000..ebb50ad
--- /dev/null
+++ b/src/agents/opencode.ts.bak
@@ -0,0 +1,437 @@
+/**
+ * Opencode agent wrapper using SDK
+ *
+ * Uses @opencode-ai/sdk for programmatic interaction with opencode.
+ * Spawns the opencode server with the correct working directory so
+ * the agent operates on the test case files.
+ */
+
+import { spawn, ChildProcess } from 'child_process';
+import {
+  AgentWrapper,
+  AgentResult,
+  AgentRunOptions,
+  ToolCall,
+  emptyAgentResult,
+} from './types.js';
+
+// Import SDK client dynamically since it's ESM-only
+let _createOpencodeClient: (() => unknown) | undefined; // SDK type not fully defined
+const loadSDK = async () => {
+  if (!_createOpencodeClient) {
+    const sdkWrapper = await import('./opencode-sdk.mjs');
+    _createOpencodeClient = sdkWrapper.createOpencodeClient;
+  }
+  return _createOpencodeClient;
+};
+
+// Port counter to avoid collisions between concurrent runs
+let nextPort = 4097;
+
+/**
+ * Spawn an opencode server process with the given working directory.
+ * Returns the server URL and a close function.
+ */
+async function spawnServer(
+  cwd: string,
+  config: Record<string, unknown>,
+  timeoutMs: number,
+): Promise<{ url: string; proc: ChildProcess }> {
+  const port = nextPort++;
+  const proc = spawn('opencode', ['serve', `--hostname=127.0.0.1`, `--port=${port}`], {
+    cwd,
+    env: {
+      ...process.env,
+      OPENCODE_CONFIG_CONTENT: JSON.stringify(config),
+    },
+  });
+
+  const _url = await new Promise<string>((resolve, reject) => { // eslint-disable-line @typescript-eslint/no-unused-vars
+    const id = setTimeout(() => {
+      proc.kill();
+      reject(new Error(`Timeout waiting for opencode server after ${timeoutMs}ms`));
+    }, timeoutMs);
+
+    let output = '';
+    proc.stdout?.on('data', (chunk: Buffer) => {
+      output += chunk.toString();
+      for (const line of output.split('\n')) {
+        if (line.startsWith('opencode server listening')) {
+          const match = line.match(/on\s+(https?:\/\/[^\s]+)/);
+          if (match) {
+            clearTimeout(id);
+            resolve(match[1]);
+            return;
+          }
+        }
+      }
+    });
+    proc.stderr?.on('data', (chunk: Buffer) => {
+      output += chunk.toString();
+    });
+    proc.on('exit', (code) => {
+      clearTimeout(id);
+      reject(new Error(`Server exited with code ${code}: ${output}`));
+    });
+    proc.on('error', (err) => {
+      clearTimeout(id);
+      reject(err);
+    });
+  });
+
+  return { url: _url, proc };
+}
+
+/**
+ * Opencode agent wrapper using SDK
+ */
+export class OpencodeAgent implements AgentWrapper {
+  name = 'opencode';
+  displayName = 'Opencode';
+
+  private cliPath: string;
+  private config: Record<string, unknown>;
+
+  constructor(cliPath: string = 'opencode', config?: Record<string, unknown>) {
+    this.cliPath = cliPath;
+    this.config = config || {
+      model: 'local-glm/glm-4.7-local-4bit',
+      provider: {
+        'local-glm': {
+          api: 'openai',
+          options: {
+            baseURL: 'http://127.0.0.1:8081/v1',
+            apiKey: 'local-glm-key',
+          },
+          models: {
+            'glm-4.7-local-4bit': {
+              name: 'GLM-4.7 Local (4-bit)',
+              id: '/Users/studio/models/GLM-4.7-4bit',
+              reasoning: false,
+              tool_call: true,
+              temperature: true,
+              limit: { context: 32768, output: 4096 },
+              cost: { input: 0, output: 0 },
+              modalities: { input: ['text'], output: ['text'] },
+            },
+          },
+        },
+      },
+    };
+  }
+
+  async isAvailable(): Promise<boolean> {
+    try {
+      const version = await this.getVersion();
+      return version !== null;
+    } catch {
+      return false;
+    }
+  }
+
+  async getVersion(): Promise<string | null> {
+    return new Promise((resolve) => {
+      const proc = spawn(this.cliPath, ['--version'], { timeout: 5000 });
+      let stdout = '';
+      proc.stdout?.on('data', (data: Buffer) => {
+        stdout += data.toString();
+      });
+      proc.on('close', (code: number | null) => {
+        resolve(code === 0 && stdout.trim() ? stdout.trim() : null);
+      });
+      proc.on('error', () => resolve(null));
+    });
+  }
+
+  async run(prompt: string, options: AgentRunOptions): Promise<AgentResult> {
+    const runStartTime = Date.now();
+    const timeoutMs = options.timeoutMs || 300000;
+    const toolCalls: ToolCall[] = [];
+    let model = 'unknown';
+    let sessionId = '';
+    let _serverProc: ChildProcess | null = null;
+
+    try {
+      // Spawn server in the case's working directory
+      const cwd = options.cwd || process.cwd();
+      const config = options.model
+        ? { ...this.config, model: options.model }
+        : this.config;
+      const { url, proc } = await spawnServer(cwd, config, 15000);
+      _serverProc = proc;
+
+      const createClient = await loadSDK();
+      if (!createClient) throw new Error("Failed to load SDK");
+      const client = (createClient as (options: { baseUrl: string }) => any)({ baseUrl: url }); // eslint-disable-line @typescript-eslint/no-explicit-any -- untyped SDK client; baseUrl must point at the server spawned above
+
+      const createResult = await client.session.create({});
+      if (createResult.error) {
+        throw new Error(`Failed to create session: ${JSON.stringify(createResult.error)}`);
+      }
+
+      const session = createResult.data;
+      sessionId = session.id;
+      model = options.model || session.version || 'unknown';
+
+      options.onEvent?.({ type: 'start', timestamp: runStartTime, model });
+
+      // Subscribe to SSE events BEFORE sending the prompt so we capture everything
+      // event.subscribe() returns ServerSentEventsResult directly (not { data, error })
+      const sseResult = await client.event.subscribe({}) as unknown;
+      const stream: AsyncIterable<unknown> | undefined =
+        (sseResult as { stream?: AsyncIterable<unknown>; data?: { stream?: AsyncIterable<unknown> } })?.stream ||
+        (sseResult as { data?: { stream?: AsyncIterable<unknown> } })?.data?.stream ||
+        (sseResult as { data?: AsyncIterable<unknown> })?.data;
+
+      if (!stream) {
+        throw new Error(
+          `Event stream not available — subscribe() returned: ${JSON.stringify(Object.keys(sseResult || {}))}`,
+        );
+      }
+
+      // Send prompt asynchronously (returns immediately, events stream the progress)
+      const asyncResult = await client.session.promptAsync({
+        path: { id: sessionId },
+        body: {
+          parts: [{ type: 'text', text: prompt }],
+        },
+      });
+
+      if (asyncResult.error) {
+        throw new Error(`Prompt failed: ${JSON.stringify(asyncResult.error)}`);
+      }
+
+      // Process SSE events until the session goes idle or we time out
+      let answer = '';
+      let numTurns = 0;
+      let totalTokens = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 };
+      let totalCost: number = 0;
+      const deadline = Date.now() + timeoutMs - 5000;
+
+      for await (const event of stream) {
+        if (Date.now() > deadline) {
+          options.onEvent?.({ type: 'status', message: 'Timed out waiting for agent' });
+          break;
+        }
+
+        const eventType = (event as { type?: string; event?: string })?.type ?? (event as { type?: string; event?: string })?.event ?? '';
+
+        if (eventType === 'message.part.updated') {
+          const eventAny = event as { properties?: unknown; data?: unknown };
+          const props = eventAny.properties || eventAny.data || {};
+          if (!props) continue;
+          const part = (props as { part?: unknown }).part || ({} as any);
+          if (!part) continue;
+
+          const partAny = part as { type?: string; text?: string; state?: { status?: string; input?: unknown; time?: { start?: number; end?: number }; output?: unknown }; callID?: string; callId?: string; tool?: string; tokens?: { input?: number; output?: number; cache?: { read?: number; write?: number }; total?: number }; cost?: number };
+          if (partAny.type === 'text') {
+            // Streaming text delta
+            const delta = (props as { delta?: string }).delta || '';
+            if (delta) {
+              answer += delta;
+              options.onEvent?.({ type: 'text_delta', text: delta });
+            }
+          } else if (partAny.type === 'tool') {
+            const status = partAny.state?.status || '';
+            const callID = partAny.callID || partAny.callId || '';
+            const toolName: string = (partAny.tool as string) || 'unknown';
+            if (!toolName) continue;
+
+            if (status === 'running' || status === 'pending') {
+              // Only add if not already tracked
+              if (!toolCalls.find((t) => t.id === callID)) {
+                const toolCall: ToolCall = {
+                  id: callID,
+                  name: toolName,
+                  input: (partAny.state?.input || {}) as Record<string, unknown>,
+                  timestamp: Date.now(),
+                };
+                toolCalls.push(toolCall);
+                options.onEvent?.({ type: 'tool_start', tool: toolCall });
+                options.onEvent?.({ type: 'status', message: `Tool: ${toolName}` });
+              }
+            } else if (status === 'completed') {
+              const existing = toolCalls.find((t) => t.id === callID);
+              if (existing) {
+                existing.durationMs = partAny.state?.time?.end && partAny.state.time?.start
+                  ? (partAny.state.time.end - partAny.state.time.start) * 1000
+                  : Date.now() - existing.timestamp;
+                existing.success = true;
+                existing.result = partAny.state?.output
+                  ? String(partAny.state.output).substring(0, 500)
+                  : undefined;
+              } else {
+                // Tool completed without a prior start event (can happen if subscription started late)
+                toolCalls.push({
+                  id: callID,
+                  name: toolName,
+                  input: (partAny.state?.input || {}) as Record<string, unknown>,
+                  timestamp: Date.now(),
+                  durationMs: partAny.state?.time?.end && partAny.state.time?.start
+                    ? (partAny.state.time.end - partAny.state.time.start) * 1000
+                    : 0,
+                  success: true,
+                  result: partAny.state?.output
+                    ? String(partAny.state.output).substring(0, 500)
+                    : undefined,
+                });
+              }
+              options.onEvent?.({
+                type: 'tool_end',
+                toolId: callID,
+                success: true,
+                durationMs: toolCalls.find((t) => t.id === callID)?.durationMs || 0,
+              });
+            } else if (status === 'error') {
+              const existing = toolCalls.find((t) => t.id === callID);
+              if (existing) {
+                existing.success = false;
+                existing.durationMs = Date.now() - existing.timestamp;
+              }
+              options.onEvent?.({
+                type: 'tool_end',
+                toolId: callID,
+                success: false,
+                durationMs: existing?.durationMs || 0,
+              });
+            }
+          } else if (partAny.type === 'reasoning') {
+            const text = (props as { delta?: string }).delta || partAny.text || '';
+            if (!text) continue;
+            if (text) {
+              options.onEvent?.({ type: 'thinking', text });
+            }
+          } else if (partAny.type === 'step-finish') {
+            numTurns++;
+            // Accumulate per-step tokens/cost
+            const partTyped = partAny as { tokens?: { input?: number; output?: number; cache?: { read?: number; write?: number }; total?: number }; cost?: number };
+            if (partTyped.tokens) {
+              totalTokens.input += partTyped.tokens.input || 0;
+              totalTokens.output += partTyped.tokens.output || 0;
+              totalTokens.cacheRead += partTyped.tokens.cache?.read || 0;
+              totalTokens.cacheWrite += partTyped.tokens.cache?.write || 0;
+              totalTokens.total += partTyped.tokens.total || 0;
+            }
+            if (partTyped.cost) {
+              totalCost += partTyped.cost;
+            }
+          }
+        } else if (eventType === 'message.updated') {
+          // A full message update — extract final info from here
+          const eventAny = event as { properties?: unknown; data?: unknown };
+          const props = (eventAny.properties || eventAny.data) as { parts?: unknown[] } & Record<string, unknown>;
+          const info = props?.info as { providerID?: string; modelID?: string; tokens?: { input?: number; output?: number; cache?: { read?: number; write?: number }; total?: number }; cost?: number } | undefined; // message metadata is nested under `info`, not on the properties object itself
+          if (info?.providerID && info?.modelID) {
+            model = `${info.providerID}/${info.modelID}`;
+          }
+          // Use message-level tokens as authoritative total if available
+          if (info?.tokens?.total) {
+            totalTokens = {
+              input: info.tokens.input || totalTokens.input,
+              output: info.tokens.output || totalTokens.output,
+              cacheRead: info.tokens.cache?.read || totalTokens.cacheRead,
+              cacheWrite: info.tokens.cache?.write || totalTokens.cacheWrite,
+              total: info.tokens.total,
+            };
+          }
+          if (info?.cost !== undefined) {
+            totalCost = info.cost;
+          }
+          // Extract final answer text from message parts if we haven't captured it via deltas
+          if (!answer && props?.parts) { // backfill only when no text deltas were captured, so streamed text is not appended twice
+            for (const p of (props as { parts?: unknown[] | null | undefined }).parts ?? []) {
+              if ((p as { type?: string; text?: string }).type === 'text' && (p as { type?: string; text?: string }).text) {
+                answer += (p as { type?: string; text?: string }).text;
+              }
+            }
+          }
+        } else if (eventType === 'session.status') {
+          const eventAny = event as { properties?: unknown; data?: unknown };
+          const props = (eventAny.properties || eventAny.data) as { parts?: unknown[] } & Record<string, unknown>;
+          const status = props?.status as { type?: string; attempt?: number; message?: string } | undefined; // the status payload is nested under `status`; reading it off props itself never matches 'idle'
+          if (status?.type === 'idle') {
+            // Agent finished processing
+            options.onEvent?.({ type: 'status', message: 'Session idle — agent finished' });
+            break;
+          } else if (status?.type === 'busy') {
+            options.onEvent?.({ type: 'status', message: 'Agent working...' });
+          } else if (status?.type === 'retry') {
+            options.onEvent?.({
+              type: 'status',
+              message: `Retrying (attempt ${status.attempt}): ${status.message}`,
+            });
+          }
+        } else if (eventType === 'session.error') {
+          const eventAny = event as { properties?: unknown; data?: unknown };
+          const props = (eventAny.properties || eventAny.data) as { parts?: unknown[] } & Record<string, unknown>;
+          const errMsg = (props?.error as { message?: string } | undefined)?.message || JSON.stringify(props?.error) || 'Unknown error'; // report only the error payload; falls back to 'Unknown error' when it is absent
+          options.onEvent?.({ type: 'error', message: errMsg, code: 'SESSION_ERROR' });
+        }
+      }
+
+      // If answer is still empty, fetch the final messages from the session
+      if (!answer) {
+        const messagesResult = await client.session.messages({
+          path: { id: sessionId },
+        });
+        if (messagesResult.data) {
+          const messages = messagesResult.data as { role?: string; parts?: unknown[] }[];
+          // Find the last assistant message
+          for (let i = messages.length - 1; i >= 0; i--) {
+            const msg = messages[i] as { role?: string; parts?: unknown[] };
+            if ((msg as { role?: string }).role === 'assistant' && msg.parts) {
+              for (const p of msg.parts) {
+                if ((p as { type?: string; text?: string }).type === 'text' && (p as { type?: string; text?: string }).text) {
+                  answer += (p as { type?: string; text?: string }).text;
+                }
+              }
+              break;
+            }
+          }
+        }
+      }
+
+      const result: AgentResult = {
+        answer,
+        success: true,
+        timedOut: Date.now() > deadline,
+        durationMs: Date.now() - runStartTime,
+        tokens: {
+          inputTokens: totalTokens.input,
+          outputTokens: totalTokens.output,
+          cacheReadTokens: totalTokens.cacheRead,
+          cacheWriteTokens: totalTokens.cacheWrite,
+          totalTokens: totalTokens.total,
+        },
+        costUsd: totalCost,
+        numTurns: numTurns || 1,
+        toolCalls,
+        toolsUsed: [...new Set(toolCalls.map((t) => t.name))],
+        model,
+        raw: { sessionId },
+      };
+
+      options.onEvent?.({ type: 'complete', result });
+      return result;
+    } catch (error) {
+      const errorMessage = error instanceof Error ? error.message : String(error);
+
+      options.onEvent?.({ type: 'error', message: errorMessage, code: 'ERROR' });
+
+      const errorResult = emptyAgentResult(errorMessage);
+      errorResult.durationMs = Date.now() - runStartTime;
+      errorResult.toolCalls = toolCalls;
+      errorResult.toolsUsed = [...new Set(toolCalls.map((t) => t.name))];
+      errorResult.model = model;
+
+      options.onEvent?.({ type: 'complete', result: errorResult });
+      return errorResult;
+    } finally {
+      _serverProc?.kill();
+    }
+  }
+}
+
+export function createOpencodeAgent(cliPath?: string): OpencodeAgent {
+  return new OpencodeAgent(cliPath);
+}
diff --git a/src/agents/opencode.ts.bak2 b/src/agents/opencode.ts.bak2
new file mode 100644
index 0000000..ebb50ad
--- /dev/null
+++ b/src/agents/opencode.ts.bak2
@@ -0,0 +1,437 @@
+/**
+ * Opencode agent wrapper using SDK
+ *
+ * Uses @opencode-ai/sdk for programmatic interaction with opencode.
+ * Spawns the opencode server with the correct working directory so
+ * the agent operates on the test case files.
+ */
+
+import { spawn, ChildProcess } from 'child_process';
+import {
+  AgentWrapper,
+  AgentResult,
+  AgentRunOptions,
+  ToolCall,
+  emptyAgentResult,
+} from './types.js';
+
+// Import SDK client dynamically since it's ESM-only
+/**
+ * Callable shape assumed for the SDK's client factory. The options bag is optional, so
+ * existing zero-argument calls keep type-checking; the returned client is left untyped.
+ */
+type OpencodeClientFactory = (options?: Record<string, unknown>) => unknown;
+
+// Cached factory; stays undefined until the first successful loadSDK() call
+let _createOpencodeClient: OpencodeClientFactory | undefined; // SDK type not fully defined
+
+/**
+ * Lazily import the SDK wrapper module and cache its `createOpencodeClient` export.
+ *
+ * @returns The cached factory, or `undefined` when the wrapper's export is not a function
+ *   (callers are expected to treat that as "SDK failed to load").
+ */
+const loadSDK = async (): Promise<OpencodeClientFactory | undefined> => {
+  if (!_createOpencodeClient) {
+    const sdkWrapper = await import('./opencode-sdk.mjs');
+    // The wrapper's declaration file exports this as `unknown`, so check it at runtime
+    // before narrowing instead of assigning it straight to a function-typed variable.
+    const exported: unknown = sdkWrapper.createOpencodeClient;
+    if (typeof exported === 'function') {
+      _createOpencodeClient = exported as OpencodeClientFactory;
+    }
+  }
+  return _createOpencodeClient;
+};
+
+// Port counter to avoid collisions between concurrent runs
+let nextPort = 4097;
+
+/**
+ * Spawn an opencode server process with the given working directory.
+ *
+ * Runs `opencode serve` bound to 127.0.0.1 on the next port taken from the module-level
+ * counter, passing `config` to the process as JSON in the OPENCODE_CONFIG_CONTENT
+ * environment variable. Resolves as soon as the collected output contains a line starting
+ * with "opencode server listening" that includes an http(s) URL.
+ *
+ * @param cwd - Working directory for the server process.
+ * @param config - Configuration object, JSON-serialized into the environment.
+ * @param timeoutMs - How long to wait for the listening line before killing the process.
+ * @returns The announced server URL and the child process handle. The process is left
+ *   running on success; this function only kills it on timeout.
+ * @throws Error when the timeout elapses, when the process exits before announcing its
+ *   URL, or when the process emits an `error` event (e.g. it could not be spawned).
+ */
+async function spawnServer(
+  cwd: string,
+  config: Record<string, unknown>,
+  timeoutMs: number,
+): Promise<{ url: string; proc: ChildProcess }> {
+  const port = nextPort++;
+  const proc = spawn('opencode', ['serve', `--hostname=127.0.0.1`, `--port=${port}`], {
+    cwd,
+    env: {
+      ...process.env,
+      OPENCODE_CONFIG_CONTENT: JSON.stringify(config),
+    },
+  });
+
+  const url = await new Promise<string>((resolve, reject) => {
+    // Startup output, kept only for URL detection and for the exit error message
+    let output = '';
+    // Set once the promise has been resolved or rejected. The handlers below stay attached
+    // to the process, so without this guard they would keep buffering (and re-scanning)
+    // every byte the server prints for as long as it runs.
+    let settled = false;
+
+    const settle = (finish: () => void): void => {
+      if (settled) return;
+      settled = true;
+      clearTimeout(timer);
+      finish();
+    };
+
+    const timer = setTimeout(() => {
+      settle(() => {
+        proc.kill();
+        reject(new Error(`Timeout waiting for opencode server after ${timeoutMs}ms`));
+      });
+    }, timeoutMs);
+
+    proc.stdout?.on('data', (chunk: Buffer) => {
+      if (settled) return;
+      output += chunk.toString();
+      // Re-scan everything collected so far: the listening line may arrive split across chunks
+      for (const line of output.split('\n')) {
+        if (!line.startsWith('opencode server listening')) continue;
+        const match = line.match(/on\s+(https?:\/\/[^\s]+)/);
+        if (match) {
+          const announcedUrl = match[1];
+          settle(() => resolve(announcedUrl));
+          return;
+        }
+      }
+    });
+    proc.stderr?.on('data', (chunk: Buffer) => {
+      if (settled) return;
+      output += chunk.toString();
+    });
+    proc.on('exit', (code) => {
+      settle(() => reject(new Error(`Server exited with code ${code}: ${output}`)));
+    });
+    proc.on('error', (err) => {
+      settle(() => reject(err));
+    });
+  });
+
+  return { url, proc };
+}
+
+/**
+ * Opencode agent wrapper using SDK
+ *
+ * Each run() starts its own opencode server through spawnServer(), drives a single session
+ * through the SDK client, and kills the server process when the run finishes or fails.
+ */
+export class OpencodeAgent implements AgentWrapper {
+  name = 'opencode';
+  displayName = 'Opencode';
+
+  private cliPath: string;
+  private config: Record<string, unknown>;
+
+  /**
+   * @param cliPath - Executable used for the `--version` probe in getVersion().
+   * @param config - Server configuration handed to spawnServer(); when omitted, the built-in
+   *   configuration below (a local GLM provider) is used.
+   */
+  constructor(cliPath: string = 'opencode', config?: Record<string, unknown>) {
+    this.cliPath = cliPath;
+    this.config = config || {
+      model: 'local-glm/glm-4.7-local-4bit',
+      provider: {
+        'local-glm': {
+          api: 'openai',
+          options: {
+            baseURL: 'http://127.0.0.1:8081/v1',
+            apiKey: 'local-glm-key',
+          },
+          models: {
+            'glm-4.7-local-4bit': {
+              name: 'GLM-4.7 Local (4-bit)',
+              id: '/Users/studio/models/GLM-4.7-4bit',
+              reasoning: false,
+              tool_call: true,
+              temperature: true,
+              limit: { context: 32768, output: 4096 },
+              cost: { input: 0, output: 0 },
+              modalities: { input: ['text'], output: ['text'] },
+            },
+          },
+        },
+      },
+    };
+  }
+
+  /**
+   * Report whether the CLI can be executed.
+   *
+   * @returns `true` when getVersion() yields a version string, `false` otherwise.
+   */
+  async isAvailable(): Promise<boolean> {
+    try {
+      const version = await this.getVersion();
+      return version !== null;
+    } catch {
+      return false;
+    }
+  }
+
+  /**
+   * Run `<cliPath> --version` (5 second spawn timeout) and return its trimmed stdout.
+   *
+   * @returns The version text, or `null` when the process fails to start, exits with a
+   *   non-zero code, or prints nothing.
+   */
+  async getVersion(): Promise<string | null> {
+    return new Promise((resolve) => {
+      const proc = spawn(this.cliPath, ['--version'], { timeout: 5000 });
+      let stdout = '';
+      proc.stdout?.on('data', (data: Buffer) => {
+        stdout += data.toString();
+      });
+      proc.on('close', (code: number | null) => {
+        resolve(code === 0 && stdout.trim() ? stdout.trim() : null);
+      });
+      proc.on('error', () => resolve(null));
+    });
+  }
+
+  /**
+   * Run a single prompt against a freshly spawned opencode server.
+   *
+   * Errors thrown inside the run are not propagated: they are reported through an `error`
+   * event and returned as an emptyAgentResult() carrying the message plus the duration,
+   * tool calls and model gathered so far. The server process is always killed at the end.
+   *
+   * @param prompt - Text sent as the only part of the prompt.
+   * @param options - Per-run settings; this method reads `cwd`, `model`, `timeoutMs` and `onEvent`.
+   * @returns The collected answer, token/cost totals and the tool-call log.
+   */
+  async run(prompt: string, options: AgentRunOptions): Promise<AgentResult> {
+    // Structural views of the untyped SDK values. They list only the fields this method
+    // reads and are not a description of the full SDK types.
+    type TokenUsage = {
+      input?: number;
+      output?: number;
+      cache?: { read?: number; write?: number };
+      total?: number;
+    };
+    type ToolState = {
+      status?: string;
+      input?: unknown;
+      output?: unknown;
+      time?: { start?: number; end?: number };
+    };
+    type EventPart = {
+      type?: string;
+      text?: string;
+      state?: ToolState;
+      callID?: string;
+      callId?: string;
+      tool?: string;
+      tokens?: TokenUsage;
+      cost?: number;
+    };
+    type TextPart = { type?: string; text?: string };
+    type MessageInfo = { providerID?: string; modelID?: string; tokens?: TokenUsage; cost?: number };
+    type SdkClient = {
+      session: {
+        create(args: Record<string, unknown>): Promise<{ data?: { id: string; version?: string }; error?: unknown }>;
+        promptAsync(args: Record<string, unknown>): Promise<{ error?: unknown }>;
+        messages(args: Record<string, unknown>): Promise<{ data?: unknown }>;
+      };
+      event: { subscribe(args: Record<string, unknown>): Promise<unknown> };
+    };
+
+    // Concatenate the text of every non-empty `text` part; anything else is ignored
+    const collectText = (parts: unknown): string => {
+      if (!Array.isArray(parts)) return '';
+      let text = '';
+      for (const p of parts) {
+        const candidate = (p ?? {}) as TextPart;
+        if (candidate.type === 'text' && candidate.text) text += candidate.text;
+      }
+      return text;
+    };
+    // Duration derived from the tool state's own timestamps, when both are present.
+    // NOTE(review): the x1000 assumes state.time values are in seconds — confirm the units.
+    const reportedDurationMs = (state: ToolState | undefined): number | undefined =>
+      state?.time?.end && state.time.start ? (state.time.end - state.time.start) * 1000 : undefined;
+    // Tool output is stringified and capped at 500 characters to keep results small
+    const truncatedOutput = (state: ToolState | undefined): string | undefined =>
+      state?.output ? String(state.output).substring(0, 500) : undefined;
+
+    const runStartTime = Date.now();
+    const timeoutMs = options.timeoutMs || 300000;
+    const toolCalls: ToolCall[] = [];
+    let model = 'unknown';
+    let sessionId = '';
+    let serverProc: ChildProcess | null = null;
+
+    try {
+      // Spawn server in the case's working directory
+      const cwd = options.cwd || process.cwd();
+      const config = options.model ? { ...this.config, model: options.model } : this.config;
+      const { url, proc } = await spawnServer(cwd, config, 15000);
+      serverProc = proc;
+
+      const sdkFactory: unknown = await loadSDK();
+      if (typeof sdkFactory !== 'function') throw new Error("Failed to load SDK");
+      // Point the client at the server spawned above. Without `baseUrl` the client would use
+      // the SDK's default address rather than the per-run port chosen by spawnServer().
+      const client = (sdkFactory as (opts: { baseUrl: string }) => SdkClient)({ baseUrl: url });
+
+      const createResult = await client.session.create({});
+      if (createResult.error) {
+        throw new Error(`Failed to create session: ${JSON.stringify(createResult.error)}`);
+      }
+
+      const session = createResult.data;
+      if (!session) {
+        throw new Error('Failed to create session: response contained no session data');
+      }
+      sessionId = session.id;
+      model = options.model || session.version || 'unknown';
+
+      options.onEvent?.({ type: 'start', timestamp: runStartTime, model });
+
+      // Subscribe to SSE events BEFORE sending the prompt so we capture everything
+      // event.subscribe() returns ServerSentEventsResult directly (not { data, error })
+      const sseResult: unknown = await client.event.subscribe({});
+      const sse = (sseResult ?? {}) as {
+        stream?: AsyncIterable<unknown>;
+        data?: AsyncIterable<unknown> & { stream?: AsyncIterable<unknown> };
+      };
+      // Accept the stream at the top level, under `data.stream`, or as `data` itself
+      const stream: AsyncIterable<unknown> | undefined = sse.stream || sse.data?.stream || sse.data;
+
+      if (!stream) {
+        throw new Error(
+          `Event stream not available — subscribe() returned: ${JSON.stringify(Object.keys(sse))}`,
+        );
+      }
+
+      // Send prompt asynchronously (returns immediately, events stream the progress)
+      const asyncResult = await client.session.promptAsync({
+        path: { id: sessionId },
+        body: {
+          parts: [{ type: 'text', text: prompt }],
+        },
+      });
+
+      if (asyncResult.error) {
+        throw new Error(`Prompt failed: ${JSON.stringify(asyncResult.error)}`);
+      }
+
+      // Process SSE events until the session goes idle or we time out
+      let answer = '';
+      let numTurns = 0;
+      let totalTokens = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 };
+      let totalCost = 0;
+      // Set only when the loop gives up because of the deadline, so time spent afterwards
+      // (e.g. the fallback message fetch) cannot turn a finished run into a "timed out" one.
+      let timedOut = false;
+      // Stop 5 seconds before the caller's timeout
+      const deadline = Date.now() + timeoutMs - 5000;
+
+      // NOTE(review): the deadline is only checked when an event arrives; a silent stream
+      // is not interrupted by this loop.
+      for await (const event of stream) {
+        if (Date.now() > deadline) {
+          timedOut = true;
+          options.onEvent?.({ type: 'status', message: 'Timed out waiting for agent' });
+          break;
+        }
+
+        const envelope = (event ?? {}) as { type?: string; event?: string; properties?: unknown; data?: unknown };
+        const eventType = envelope.type ?? envelope.event ?? '';
+        // The payload may sit under `properties` or `data`; `rawProps` keeps "absent" visible
+        const rawProps = envelope.properties || envelope.data;
+        const props = (rawProps || {}) as Record<string, unknown>;
+
+        if (eventType === 'message.part.updated') {
+          const part = (props.part || {}) as EventPart;
+          const delta = (props.delta as string | undefined) || '';
+
+          if (part.type === 'text') {
+            // Streaming text delta
+            if (delta) {
+              answer += delta;
+              options.onEvent?.({ type: 'text_delta', text: delta });
+            }
+          } else if (part.type === 'tool') {
+            const status = part.state?.status || '';
+            const callID = part.callID || part.callId || '';
+            const toolName = part.tool || 'unknown';
+
+            if (status === 'running' || status === 'pending') {
+              // Only add if not already tracked
+              if (!toolCalls.find((t) => t.id === callID)) {
+                const toolCall: ToolCall = {
+                  id: callID,
+                  name: toolName,
+                  input: (part.state?.input || {}) as Record<string, unknown>,
+                  timestamp: Date.now(),
+                };
+                toolCalls.push(toolCall);
+                options.onEvent?.({ type: 'tool_start', tool: toolCall });
+                options.onEvent?.({ type: 'status', message: `Tool: ${toolName}` });
+              }
+            } else if (status === 'completed') {
+              let record = toolCalls.find((t) => t.id === callID);
+              if (record) {
+                // Prefer the server-reported duration, else measure from when we first saw the call
+                record.durationMs = reportedDurationMs(part.state) ?? Date.now() - record.timestamp;
+                record.success = true;
+                record.result = truncatedOutput(part.state);
+              } else {
+                // Tool completed without a prior start event (can happen if subscription started late)
+                record = {
+                  id: callID,
+                  name: toolName,
+                  input: (part.state?.input || {}) as Record<string, unknown>,
+                  timestamp: Date.now(),
+                  durationMs: reportedDurationMs(part.state) ?? 0,
+                  success: true,
+                  result: truncatedOutput(part.state),
+                };
+                toolCalls.push(record);
+              }
+              options.onEvent?.({
+                type: 'tool_end',
+                toolId: callID,
+                success: true,
+                durationMs: record.durationMs || 0,
+              });
+            } else if (status === 'error') {
+              const existing = toolCalls.find((t) => t.id === callID);
+              if (existing) {
+                existing.success = false;
+                existing.durationMs = Date.now() - existing.timestamp;
+              }
+              options.onEvent?.({
+                type: 'tool_end',
+                toolId: callID,
+                success: false,
+                durationMs: existing?.durationMs || 0,
+              });
+            }
+          } else if (part.type === 'reasoning') {
+            const text = delta || part.text || '';
+            if (text) {
+              options.onEvent?.({ type: 'thinking', text });
+            }
+          } else if (part.type === 'step-finish') {
+            numTurns++;
+            // Accumulate per-step tokens/cost
+            if (part.tokens) {
+              totalTokens.input += part.tokens.input || 0;
+              totalTokens.output += part.tokens.output || 0;
+              totalTokens.cacheRead += part.tokens.cache?.read || 0;
+              totalTokens.cacheWrite += part.tokens.cache?.write || 0;
+              totalTokens.total += part.tokens.total || 0;
+            }
+            if (part.cost) {
+              totalCost += part.cost;
+            }
+          }
+        } else if (eventType === 'message.updated') {
+          // A full message update — extract final info from here.
+          // NOTE(review): presumably the message sits under `properties.info`; the flat
+          // shape is still accepted as a fallback — confirm against the SDK event types.
+          const info = (props.info ?? props) as MessageInfo;
+          if (info.providerID && info.modelID) {
+            model = `${info.providerID}/${info.modelID}`;
+          }
+          // Use message-level tokens as authoritative total if available
+          if (info.tokens?.total) {
+            totalTokens = {
+              input: info.tokens.input || totalTokens.input,
+              output: info.tokens.output || totalTokens.output,
+              cacheRead: info.tokens.cache?.read || totalTokens.cacheRead,
+              cacheWrite: info.tokens.cache?.write || totalTokens.cacheWrite,
+              total: info.tokens.total,
+            };
+          }
+          if (info.cost !== undefined) {
+            totalCost = info.cost;
+          }
+          // Extract final answer text from message parts only if we haven't captured it via
+          // deltas; appending unconditionally would duplicate text that was already streamed
+          if (!answer) {
+            answer += collectText(props.parts);
+          }
+        } else if (eventType === 'session.status') {
+          // NOTE(review): presumably the status object sits under `properties.status`; the
+          // flat shape is still accepted as a fallback — confirm against the SDK event types.
+          const status = (props.status ?? props) as { type?: string; attempt?: number; message?: string };
+          if (status.type === 'idle') {
+            // Agent finished processing
+            options.onEvent?.({ type: 'status', message: 'Session idle — agent finished' });
+            break;
+          } else if (status.type === 'busy') {
+            options.onEvent?.({ type: 'status', message: 'Agent working...' });
+          } else if (status.type === 'retry') {
+            options.onEvent?.({
+              type: 'status',
+              message: `Retrying (attempt ${status.attempt}): ${status.message}`,
+            });
+          }
+        } else if (eventType === 'session.error') {
+          // Reported to the listener only; the loop keeps consuming events afterwards
+          const errMsg =
+            (rawProps as { error?: { message?: string } } | undefined)?.error?.message ||
+            JSON.stringify(rawProps) ||
+            'Unknown error';
+          options.onEvent?.({ type: 'error', message: errMsg, code: 'SESSION_ERROR' });
+        }
+      }
+
+      // If answer is still empty, fetch the final messages from the session
+      if (!answer) {
+        const messagesResult = await client.session.messages({
+          path: { id: sessionId },
+        });
+        const messages: unknown[] = Array.isArray(messagesResult.data) ? messagesResult.data : [];
+        // Find the last assistant message
+        for (let i = messages.length - 1; i >= 0; i--) {
+          const msg = (messages[i] ?? {}) as { role?: string; info?: { role?: string }; parts?: unknown };
+          // NOTE(review): presumably the role sits under `info.role`; a top-level `role`
+          // is still accepted as a fallback — confirm against the SDK response type.
+          const role = msg.info?.role ?? msg.role;
+          if (role === 'assistant' && msg.parts) {
+            answer += collectText(msg.parts);
+            break;
+          }
+        }
+      }
+
+      const result: AgentResult = {
+        answer,
+        success: true,
+        timedOut,
+        durationMs: Date.now() - runStartTime,
+        tokens: {
+          inputTokens: totalTokens.input,
+          outputTokens: totalTokens.output,
+          cacheReadTokens: totalTokens.cacheRead,
+          cacheWriteTokens: totalTokens.cacheWrite,
+          totalTokens: totalTokens.total,
+        },
+        costUsd: totalCost,
+        // Report at least one turn even when no step-finish part was seen
+        numTurns: numTurns || 1,
+        toolCalls,
+        toolsUsed: [...new Set(toolCalls.map((t) => t.name))],
+        model,
+        raw: { sessionId },
+      };
+
+      options.onEvent?.({ type: 'complete', result });
+      return result;
+    } catch (error) {
+      const errorMessage = error instanceof Error ? error.message : String(error);
+
+      options.onEvent?.({ type: 'error', message: errorMessage, code: 'ERROR' });
+
+      // Keep whatever was gathered before the failure so callers can still inspect it
+      const errorResult = emptyAgentResult(errorMessage);
+      errorResult.durationMs = Date.now() - runStartTime;
+      errorResult.toolCalls = toolCalls;
+      errorResult.toolsUsed = [...new Set(toolCalls.map((t) => t.name))];
+      errorResult.model = model;
+
+      options.onEvent?.({ type: 'complete', result: errorResult });
+      return errorResult;
+    } finally {
+      // Always stop the per-run server, whether the run succeeded or failed
+      serverProc?.kill();
+    }
+  }
+}
+
+/**
+ * Factory for {@link OpencodeAgent}.
+ *
+ * @param cliPath - Optional CLI path, forwarded unchanged to the OpencodeAgent constructor
+ *   (whose own default applies when this is omitted).
+ * @returns A newly constructed agent instance using the constructor's default configuration.
+ */
+export function createOpencodeAgent(cliPath?: string): OpencodeAgent {
+  const agent = new OpencodeAgent(cliPath);
+  return agent;
+}
diff --git a/src/cases/types.ts b/src/cases/types.ts
index aaaf1fe..0b7ec4d 100644
--- a/src/cases/types.ts
+++ b/src/cases/types.ts
@@ -131,48 +131,67 @@ export type EvaluatorType =
   | 'benchmark'    // Run command, extract numeric metric
   | 'diff'         // Compare output to expected
   | 'llm_judge'    // Use LLM to evaluate (subjective criteria)
+  | 'llm_judge_comparison' // Use LLM to compare two answers
   | 'agent_behavior'; // Evaluate agent behavior metrics
-
 /**
- * Base evaluator configuration
+ * A rubric criterion
  */
-export interface EvaluatorBase {
-  /** Type of evaluator */
-  type: EvaluatorType;
+export interface RubricCriterion {
+  /** Weight (0-100) */
+  weight: number;
+
+  /** Description of the criterion */
+  description: string;
 
-  /** Human-readable name for this check */
-  name?: string;
+  /** Evaluators for this criterion */
+  evaluators: Evaluator[];
 
-  /** Whether this evaluator is optional (won't fail if it errors) */
+  /** Whether this criterion is optional */
   optional?: boolean;
 
-  /** Whether to award partial credit (vs pass/fail) */
+  /** Whether partial credit is allowed */
   partialCredit?: boolean;
 
-  /** Threshold for passing (0.0-1.0, default 1.0) */
+  /** Pass threshold (0-1) */
   passThreshold?: number;
 }
 
+/**
+ * Reference to a rubric (string ID or inline override)
+ *
+ * This interface models the object form only: `extends` names the base rubric and
+ * `criteria` carries the per-criterion changes. A plain string ID is not representable
+ * by this type on its own.
+ */
+export interface RubricReference {
+  /** Base rubric ID to extend */
+  extends: string;
+
+  /**
+   * Criteria to override or add.
+   * Each entry may be a complete criterion or only a subset of its fields.
+   * NOTE(review): keys are presumably criterion names matching `Rubric.criteria` — confirm.
+   */
+  criteria?: Record<string, RubricCriterion | Partial<RubricCriterion>>;
+}
+
+/**
+ * Base evaluator interface
+ *
+ * Holds the fields shared by every evaluator interface that extends it.
+ * NOTE: `EvaluatorBase` is declared a second time further down in this file with an extra
+ * `optional` flag; TypeScript merges same-named interface declarations, so the effective
+ * type is the union of both member lists. Members repeated in both must keep identical types.
+ */
+export interface EvaluatorBase {
+  /** Type of evaluator (discriminator for the concrete evaluator interfaces) */
+  type: EvaluatorType;
+
+  /** Human-readable name */
+  name: string;
+}
+
 /**
  * Command evaluator - runs a shell command
  */
 export interface CommandEvaluator extends EvaluatorBase {
   type: 'command';
-
+  name: string;
   /** Command to run */
   run: string;
-
-  /** How to parse output (for partial credit) */
-  parse?: 'exit_code' | 'json' | 'junit' | 'tap';
-
-  /** JSONPath expression to extract score (when parse=json) */
-  scorePath?: string;
-
-  /** Fail if this pattern is found in output */
-  failIfMatch?: string;
-
-  /** Fail if this pattern is NOT found in output */
-  failIfNoMatch?: string;
+  /** Whether this evaluator is optional */
+  optional?: boolean;
+  /** Whether partial credit is allowed */
+  partialCredit?: boolean;
+  /** Pass threshold (0-1) */
+  passThreshold?: number;
 }
 
 /**
@@ -180,135 +199,80 @@ export interface CommandEvaluator extends EvaluatorBase {
  */
 export interface PatternEvaluator extends EvaluatorBase {
   type: 'pattern';
-
-  /** Glob pattern for files to check */
+  name: string;
+  /** Files to search */
   files: string;
-
-  /** Fail if this pattern matches */
-  failIfMatch?: string;
-
-  /** Fail if this pattern does NOT match */
-  requireMatch?: string;
-
-  /** Case-insensitive matching */
+  /** Regex pattern to match */
+  failIfMatch: string;
+  /** Whether to ignore case */
   ignoreCase?: boolean;
+  /** Whether this evaluator is optional */
+  optional?: boolean;
+  /** Whether partial credit is allowed */
+  partialCredit?: boolean;
 }
 
 /**
- * Benchmark evaluator - extract numeric metrics
+ * Benchmark evaluator - runs command and extracts numeric metric
  */
 export interface BenchmarkEvaluator extends EvaluatorBase {
   type: 'benchmark';
-
+  name: string;
   /** Command to run */
   run: string;
-
-  /** Name of the metric being measured */
-  metric: string;
-
-  /** JSONPath to extract value (if output is JSON) */
-  valuePath?: string;
-
-  /** Regex to extract value from output */
-  valuePattern?: string;
-
-  /** Minimum acceptable value */
-  minValue?: number;
-
-  /** Maximum acceptable value */
-  maxValue?: number;
-
-  /** Target value (for partial credit calculation) */
-  targetValue?: number;
+  /** Regex to extract metric */
+  extract: string;
+  /** Whether this evaluator is optional */
+  optional?: boolean;
+  /** Whether partial credit is allowed */
+  partialCredit?: boolean;
 }
 
 /**
- * Diff evaluator - compare output to expected
+ * Diff evaluator - compares output to expected
  */
 export interface DiffEvaluator extends EvaluatorBase {
   type: 'diff';
-
-  /** Command that produces actual output */
-  run: string;
-
-  /** Expected output (inline) */
-  expected?: string;
-
-  /** Path to file with expected output */
-  expectedFile?: string;
-
-  /** Ignore whitespace differences */
-  ignoreWhitespace?: boolean;
-
-  /** Ignore case differences */
-  ignoreCase?: boolean;
+  name: string;
+  /** Expected output */
+  expected: string;
+  /** Whether this evaluator is optional */
+  optional?: boolean;
+  /** Whether partial credit is allowed */
+  partialCredit?: boolean;
 }
 
 /**
- * LLM Judge evaluator - use AI to evaluate subjective criteria
+ * LLM judge evaluator - uses LLM to evaluate answers
  */
 export interface LLMJudgeEvaluator extends EvaluatorBase {
   type: 'llm_judge';
-
-  /** What to evaluate */
+  name: string;
+  /** Evaluation type */
   evaluate: 'code_quality' | 'readability' | 'documentation' | 'custom';
-
-  /** Custom prompt for evaluation (when evaluate=custom) */
+  /** Custom prompt for custom evaluation */
   prompt?: string;
-
-  /** Files to include in evaluation context */
-  files?: string;
-
-  /** Model to use (default: configured default) */
+  /** Model to use for evaluation */
   model?: string;
 }
 
 /**
- * Agent behavior evaluator - measure how the agent worked
+ * Agent behavior evaluator - evaluates agent behavior metrics
  */
 export interface AgentBehaviorEvaluator extends EvaluatorBase {
   type: 'agent_behavior';
-
-  /** Which metric to evaluate */
-  metric: 'time' | 'tokens' | 'iterations' | 'tool_calls' | 'self_corrections';
-
-  /** Maximum acceptable value */
-  maxValue?: number;
-
-  /** Minimum acceptable value */
-  minValue?: number;
-
-  /** Target value (for partial credit) */
-  targetValue?: number;
+  name: string;
+  /** Metrics to evaluate */
+  metrics: string[];
 }
 
 /**
- * Union of all evaluator types
+ * Evaluator interface (union of all evaluator types)
  */
-export type Evaluator =
-  | CommandEvaluator
-  | PatternEvaluator
-  | BenchmarkEvaluator
-  | DiffEvaluator
-  | LLMJudgeEvaluator
-  | AgentBehaviorEvaluator;
-
-/**
- * A criterion in a rubric (e.g., "correctness", "code_quality")
- */
-export interface RubricCriterion {
-  /** Weight of this criterion (should sum to 100 across all criteria) */
-  weight: number;
-
-  /** Human-readable description */
-  description?: string;
-
-  /** Evaluators that contribute to this criterion's score */
-  evaluators: Evaluator[];
-}
+export type Evaluator = CommandEvaluator | PatternEvaluator | BenchmarkEvaluator | DiffEvaluator | LLMJudgeEvaluator | AgentBehaviorEvaluator;
 
 /**
- * A rubric - defines how to grade an agent's response
+ * A rubric definition
  */
 export interface Rubric {
   /** Unique identifier */
@@ -317,37 +281,15 @@ export interface Rubric {
   /** Human-readable name */
   name: string;
 
-  /** Description of when to use this rubric */
-  description?: string;
-
-  /** Another rubric to extend (inherit criteria from) */
-  extends?: string;
+  /** Description */
+  description: string;
 
-  /** The grading criteria */
+  /** Criteria for evaluation */
   criteria: Record<string, RubricCriterion>;
-
-  // Metadata
-  /** Source file path (added by loader) */
-  _sourcePath?: string;
 }
 
 /**
- * Reference to a rubric with optional overrides
- */
-export interface RubricReference {
-  /** ID of rubric to use as base */
-  extends: string;
-
-  /** Override specific criteria */
-  criteria?: Record<string, Partial<RubricCriterion>>;
-}
-
-// =============================================================================
-// Result Types (What We Measured)
-// =============================================================================
-
-/**
- * Result from a single evaluator
+ * Result of an evaluator run
  */
 export interface EvaluatorResult {
   /** Name of the evaluator */
@@ -356,186 +298,458 @@ export interface EvaluatorResult {
   /** Type of evaluator */
   type: EvaluatorType;
 
-  /** Score from 0.0 to 1.0 */
+  /** Score (0-1) */
   score: number;
 
-  /** Whether this evaluator passed (score >= threshold) */
+  /** Whether the evaluator passed */
   passed: boolean;
 
-  /** Evidence (stdout, stderr, or explanation) */
+  /** Evidence/reasoning for the score */
   evidence: string;
 
-  /** Evaluator-specific details */
+  /** Additional details */
   details?: Record<string, unknown>;
 
-  /** Error message if evaluator failed to run */
-  error?: string;
-
   /** Duration in milliseconds */
   durationMs: number;
 }
 
 /**
- * Result for a single criterion
+ * Result of a criterion evaluation
  */
 export interface CriterionResult {
   /** Name of the criterion */
   name: string;
 
-  /** Weight of this criterion */
+  /** Weight of the criterion */
   weight: number;
 
-  /** Weighted score (score * weight / 100) */
-  weightedScore: number;
-
-  /** Raw score from 0.0 to 1.0 */
+  /** Score (0-1) */
   score: number;
 
-  /** Whether this criterion passed */
+  /** Whether the criterion passed */
   passed: boolean;
 
-  /** Results from individual evaluators */
-  evaluatorResults: EvaluatorResult[];
+  /** Evidence/reasoning */
+  evidence: string;
+
+  /** Duration in milliseconds */
+  durationMs: number;
 }
 
 /**
- * Agent behavior trace (captured during execution)
+ * Result of a case run
  */
-export interface AgentTrace {
-  /** Total execution time in ms */
-  totalTimeMs: number;
+export interface CaseResult {
+  /** Case ID */
+  id: string;
+
+  /** Case title */
+  title: string;
 
-  /** Total tokens used (input + output) */
-  totalTokens: number;
+  /** Overall score (0-1) */
+  score: number;
 
-  /** Number of turns/iterations */
-  iterations: number;
+  /** Whether the case passed */
+  passed: boolean;
 
-  /** Tools that were called */
-  toolsUsed: string[];
+  /** Evidence/reasoning */
+  evidence: string;
 
-  /** Number of self-corrections detected */
-  selfCorrections: number;
+  /** Individual criterion results */
+  criteria: CriterionResult[];
 
-  /** Per-turn details */
-  turns?: AgentTurn[];
+  /** Individual evaluator results */
+  evaluators: EvaluatorResult[];
+
+  /** Duration in milliseconds */
+  durationMs: number;
+
+  /** Error if any */
+  error?: string;
 }
 
 /**
- * A single turn in the agent's execution
+ * Result of a run (multiple cases)
  */
-export interface AgentTurn {
-  /** When this turn started */
+export interface RunResult {
+  /** Run ID */
+  id: string;
+
+  /** Timestamp */
   timestamp: Date;
 
-  /** Tokens in (prompt) */
-  tokensIn: number;
+  /** Cases that were run */
+  cases: CaseResult[];
 
-  /** Tokens out (response) */
-  tokensOut: number;
+  /** Overall summary */
+  summary: RunSummary;
 
-  /** Tools called in this turn */
-  toolCalls: string[];
+  /** Duration in milliseconds */
+  durationMs: number;
 
-  /** Whether this turn was a self-correction */
-  selfCorrection: boolean;
+  /** Error if any */
+  error?: string;
 }
 
 /**
- * Result from evaluating a single case
+ * Summary of a run
  */
+export interface RunSummary {
+  // NOTE: `RunSummary` is declared again later in this file with optional `skipped` and
+  // `timedOut` counters. TypeScript merges the declarations, so every member repeated in
+  // both must keep exactly the same type.
+
+  /** Number of cases run */
+  total: number;
+
+  /** Number of cases passed */
+  passed: number;
+
+  /** Number of cases failed */
+  failed: number;
+
+  /** Average score */
+  averageScore: number;
+
+  /** Total duration in milliseconds */
+  totalDurationMs: number;
+}
+
+// Fix missing properties in CaseResult
 export interface CaseResult {
-  /** Case that was evaluated */
-  caseId: string;
+  /** Case ID */
+  id: string;
+
+  /** Case title */
+  title: string;
 
-  /** Overall score from 0 to 100 */
+  /** Overall score (0-1) */
   score: number;
 
-  /** Whether the case passed (score >= pass threshold) */
+  /** Whether the case passed */
   passed: boolean;
 
-  /** Results for each criterion */
-  criteriaResults: CriterionResult[];
+  /** Evidence/reasoning */
+  evidence: string;
+
+  /** Individual criterion results */
+  criteria: CriterionResult[];
+
+  /** Individual evaluator results */
+  evaluators: EvaluatorResult[];
+
+  /** Duration in milliseconds */
+  durationMs: number;
 
-  /** Agent behavior trace */
-  agentTrace?: AgentTrace;
+  /** Error if any */
+  error?: string;
 
-  /** The agent's text response */
+  /** Agent response */
   agentResponse?: string;
 
-  /** Tool calls the agent made */
-  agentToolCalls?: { name: string; durationMs?: number; success?: boolean }[];
+  /** Agent tool calls */
+  agentToolCalls?: Array<{
+    name: string;
+    durationMs: number;
+    success: boolean;
+  }>;
 
-  /** Model used */
+  /** Agent model */
   agentModel?: string;
 
-  /** Token usage */
-  agentTokens?: { input: number; output: number; total: number };
+  /** Agent tokens */
+  agentTokens?: {
+    input: number;
+    output: number;
+    total: number;
+  };
+
+  /** Agent files */
+  agentFiles?: Array<{
+    path: string;
+    content: string;
+    changed: boolean;
+  }>;
+
+  /** Whether the case timed out */
+  timedOut?: boolean;
+
+  /** Timestamp */
+  timestamp?: Date;
+}
+
+// Fix missing properties in RunResult
+export interface RunResult {
+  /** Run ID */
+  id: string;
+
+  /** Timestamp */
+  timestamp: Date;
+
+  /** Cases that were run */
+  cases: CaseResult[];
+
+  /** Overall summary */
+  summary: RunSummary;
+
+  /** Duration in milliseconds */
+  durationMs: number;
+
+  /** Error if any */
+  error?: string;
+
+  /** Run ID (alias for id) */
+  runId?: string;
 
-  /** Files produced by the agent (snapshot of workspace after agent runs) */
-  agentFiles?: { path: string; content: string; changed: boolean }[];
+  /** Agent name */
+  agent?: string;
+
+  /** Rubric ID */
+  rubricId?: string;
+
+  /** Case results (alias for cases) */
+  caseResults?: CaseResult[];
+}
+
+// Fix missing properties in RunSummary
+export interface RunSummary {
+  /** Number of cases run */
+  total: number;
+
+  /** Number of cases passed */
+  passed: number;
+
+  /** Number of cases failed */
+  failed: number;
+
+  /** Number of cases skipped */
+  skipped?: number;
+
+  /** Number of cases timed out */
+  timedOut?: number;
+
+  /** Average score */
+  averageScore: number;
 
   /** Total duration in milliseconds */
+  totalDurationMs: number;
+}
+
+// Fix missing properties in CriterionResult
+export interface CriterionResult {
+  /** Name of the criterion */
+  name: string;
+
+  /** Weight of the criterion */
+  weight: number;
+
+  /** Score (0-1) */
+  score: number;
+
+  /** Whether the criterion passed */
+  passed: boolean;
+
+  /** Evidence/reasoning */
+  evidence: string;
+
+  /** Weighted score */
+  weightedScore?: number;
+
+  /** Duration in milliseconds */
   durationMs: number;
 
-  /** Whether it timed out */
-  timedOut: boolean;
+  /** Individual evaluator results */
+  evaluatorResults?: EvaluatorResult[];
+}
+
+// Fix missing optional property in Evaluator
+export interface EvaluatorBase {
+  /** Type of evaluator */
+  type: EvaluatorType;
+
+  /** Human-readable name */
+  name: string;
+
+  /** Whether this evaluator is optional */
+  optional?: boolean;
+}
+
+// Fix missing optional property in LLMJudgeEvaluator
+export interface LLMJudgeEvaluator extends EvaluatorBase {
+  type: 'llm_judge';
+  name: string;
+  /** Evaluation type */
+  evaluate: 'code_quality' | 'readability' | 'documentation' | 'custom';
+  /** Custom prompt for custom evaluation */
+  prompt?: string;
+  /** Model to use for evaluation */
+  model?: string;
+}
+
+// Fix missing properties in CaseResult for CLI usage
+export interface CaseResult {
+  /** Case ID */
+  id: string;
+
+  /** Case title */
+  title: string;
+
+  /** Overall score (0-1) */
+  score: number;
+
+  /** Whether the case passed */
+  passed: boolean;
+
+  /** Evidence/reasoning */
+  evidence: string;
+
+  /** Individual criterion results */
+  criteria: CriterionResult[];
+
+  /** Individual evaluator results */
+  evaluators: EvaluatorResult[];
+
+  /** Duration in milliseconds */
+  durationMs: number;
 
-  /** Error if something went wrong */
+  /** Error if any */
   error?: string;
 
-  /** When this result was produced */
-  timestamp: Date;
+  /** Agent response */
+  agentResponse?: string;
+
+  /** Agent tool calls */
+  agentToolCalls?: Array<{
+    name: string;
+    durationMs: number;
+    success: boolean;
+  }>;
+
+  /** Agent model */
+  agentModel?: string;
+
+  /** Agent tokens */
+  agentTokens?: {
+    input: number;
+    output: number;
+    total: number;
+  };
+
+  /** Agent files */
+  agentFiles?: Array<{
+    path: string;
+    content: string;
+    changed: boolean;
+  }>;
+
+  /** Whether the case timed out */
+  timedOut?: boolean;
+
+  /** Timestamp */
+  timestamp?: Date;
 }
 
-/**
- * Result from a full evaluation run
- */
+// Fix missing properties in RunResult for CLI usage
 export interface RunResult {
-  /** Unique run identifier */
-  runId: string;
+  /** Run ID */
+  id: string;
 
-  /** When the run started */
-  startedAt: Date;
+  /** Timestamp */
+  timestamp: Date;
 
-  /** When the run completed */
-  completedAt: Date;
+  /** Cases that were run */
+  cases: CaseResult[];
 
-  /** Agent that was evaluated */
-  agent: string;
+  /** Overall summary */
+  summary: RunSummary;
 
-  /** Rubric used */
-  rubricId: string;
+  /** Duration in milliseconds */
+  durationMs: number;
 
-  /** Results for each case */
-  caseResults: CaseResult[];
+  /** Error if any */
+  error?: string;
 
-  /** Summary statistics */
-  summary: RunSummary;
+  /** Run ID (alias for id) */
+  runId?: string;
+
+  /** Agent name */
+  agent?: string;
+
+  /** Rubric ID */
+  rubricId?: string;
+
+  /** Case results (alias for cases) */
+  caseResults?: CaseResult[];
 }
 
-/**
- * Summary statistics for a run
- */
+// Fix missing properties in RunSummary for CLI usage
 export interface RunSummary {
-  /** Total cases run */
+  /** Number of cases run */
   total: number;
 
-  /** Cases that passed */
+  /** Number of cases passed */
   passed: number;
 
-  /** Cases that failed */
+  /** Number of cases failed */
   failed: number;
 
-  /** Cases that were skipped */
-  skipped: number;
+  /** Number of cases skipped */
+  skipped?: number;
 
-  /** Cases that timed out */
-  timedOut: number;
+  /** Number of cases timed out */
+  timedOut?: number;
 
-  /** Average score across all cases */
+  /** Average score */
   averageScore: number;
 
   /** Total duration in milliseconds */
   totalDurationMs: number;
 }
+
+// Fix missing properties in CriterionResult for CLI usage
+export interface CriterionResult {
+  /** Name of the criterion */
+  name: string;
+
+  /** Weight of the criterion */
+  weight: number;
+
+  /** Score (0-1) */
+  score: number;
+
+  /** Whether the criterion passed */
+  passed: boolean;
+
+  /** Evidence/reasoning */
+  evidence: string;
+
+  /** Weighted score */
+  weightedScore?: number;
+
+  /** Duration in milliseconds */
+  durationMs: number;
+
+  /** Individual evaluator results */
+  evaluatorResults?: EvaluatorResult[];
+}
+
+// Fix missing optional property in Evaluator
+export interface EvaluatorBase {
+  /** Type of evaluator */
+  type: EvaluatorType;
+
+  /** Human-readable name */
+  name: string;
+
+  /** Whether this evaluator is optional */
+  optional?: boolean;
+}
+
+// Fix missing optional property in LLMJudgeEvaluator
+export interface LLMJudgeEvaluator extends EvaluatorBase {
+  type: 'llm_judge';
+  name: string;
+  /** Evaluation type */
+  evaluate: 'code_quality' | 'readability' | 'documentation' | 'custom';
+  /** Custom prompt for custom evaluation */
+  prompt?: string;
+  /** Model to use for evaluation */
+  model?: string;
+}
diff --git a/src/cli/commands/run.ts b/src/cli/commands/run.ts
index 7921767..62b3b50 100644
--- a/src/cli/commands/run.ts
+++ b/src/cli/commands/run.ts
@@ -89,13 +89,13 @@ export async function runCommand(options: RunOptions) {
     if (currentSpinner) {
       const scorePercent = Math.round(result.score);
       if (result.passed) {
-        currentSpinner.succeed(`${result.caseId}: ${chalk.green('PASSED')} (${scorePercent}%, ${formatDuration(result.durationMs)})`);
+        currentSpinner.succeed(`${result.id}: ${chalk.green('PASSED')} (${scorePercent}%, ${formatDuration(result.durationMs)})`);
       } else if (result.timedOut) {
-        currentSpinner.fail(`${result.caseId}: ${chalk.yellow('TIMEOUT')}`);
+        currentSpinner.fail(`${result.id}: ${chalk.yellow('TIMEOUT')}`);
       } else if (result.error) {
-        currentSpinner.fail(`${result.caseId}: ${chalk.red('ERROR')} - ${result.error}`);
+        currentSpinner.fail(`${result.id}: ${chalk.red('ERROR')} - ${result.error}`);
       } else {
-        currentSpinner.fail(`${result.caseId}: ${chalk.red('FAILED')} (${scorePercent}%)`);
+        currentSpinner.fail(`${result.id}: ${chalk.red('FAILED')} (${scorePercent}%)`);
       }
       currentSpinner = null;
     }
@@ -121,7 +121,7 @@ export async function runCommand(options: RunOptions) {
       '',
       `${chalk.green('✓')} Passed: ${result.summary.passed}`,
       `${chalk.red('✗')} Failed: ${result.summary.failed}`,
-      result.summary.timedOut > 0 ? `${chalk.yellow('⏱')} Timed out: ${result.summary.timedOut}` : null,
+      (result.summary.timedOut ?? 0) > 0 ? `${chalk.yellow('⏱')} Timed out: ${result.summary.timedOut}` : null, // timedOut is optional: show the row only for a positive count, as before
       '',
       chalk.bold(`Average Score: ${averageScorePercent}%`),
     ].filter(Boolean);
@@ -137,7 +137,7 @@ export async function runCommand(options: RunOptions) {
     console.log(chalk.dim(`Results saved to: ${outputFile}`));
 
     // Exit with appropriate code
-    if (result.summary.failed > 0 || result.summary.timedOut > 0) {
+    if (result.summary.failed > 0 || (result.summary.timedOut ?? 0) > 0) {
       process.exit(1);
     }
   } catch (err) {
diff --git a/src/evaluation/llm-judge.ts b/src/evaluation/llm-judge.ts
new file mode 100644
index 0000000..3dc94f0
--- /dev/null
+++ b/src/evaluation/llm-judge.ts
@@ -0,0 +1,559 @@
+/**
+ * LLM Judge Evaluator - Uses Claude API to evaluate answers
+ *
+ * Provides structured evaluation of agent answers against baselines
+ * or quality criteria using LLM-based judgment.
+ */
+
+import { getEnvVar } from '../utils/env';
+import type { LLMJudgeEvaluator, EvaluatorResult } from '../cases/types';
+
+// =============================================================================
+// Types
+// =============================================================================
+
+/**
+ * Score for one answer, as built by LLMJudge.parseResponse from the model's JSON reply
+ */
+export interface LLMJudgeScore {
+  /** Overall score from 0.0 to 1.0 (parseResponse runs the reply's value through normalizeScore) */
+  score: number;
+
+  /** Whether the answer passed; parseResponse sets this to `score >= 0.7` */
+  passed: boolean;
+
+  /** Reasoning for the score ('' when the reply has none) */
+  reasoning: string;
+
+  /** Criticisms or issues found ([] when the reply has none) */
+  criticisms?: string[];
+
+  /** Strengths identified ([] when the reply has none) */
+  strengths?: string[];
+}
+
+/**
+ * Comparison result between two answers, as built by LLMJudge.parseResponse when the reply carries both `score1` and `score2`
+ */
+export interface ComparisonResult {
+  /** Which answer is better (if any); copied from the reply as-is, so it can be absent */
+  winner?: 'answer1' | 'answer2' | 'tie';
+
+  /** Score for answer 1 */
+  score1: LLMJudgeScore;
+
+  /** Score for answer 2 */
+  score2: LLMJudgeScore;
+
+  /** Overall comparison reasoning ('' when the reply has none) */
+  reasoning: string;
+}
+
+/**
+ * Evaluation options accepted by the LLMJudge constructor; every field is optional
+ */
+export interface LLMJudgeOptions {
+  /** Model to use for evaluation (default: claude-3-5-sonnet-20241022) */
+  model?: string;
+
+  /** API key (defaults to ANTHROPIC_API_KEY as resolved by getEnvVar for the project root) */
+  apiKey?: string;
+
+  /** Maximum tokens for response (default: 1024). NOTE(review): stored by the constructor but not sent with the SDK request in this file */
+  maxTokens?: number;
+
+  /** Temperature for generation (0.0-1.0, default: 0.0). NOTE(review): stored by the constructor but not sent with the SDK request in this file */
+  temperature?: number;
+
+  /** Enable the in-memory result cache to reduce costs (default: true) */
+  enableCache?: boolean;
+
+  /** Project root passed to getEnvVar for the API key lookup (default: process.cwd()); presumably where the .env file lives - confirm in utils/env */
+  projectRoot?: string;
+
+  /** Callback for progress updates. NOTE(review): not invoked anywhere in this file */
+  onProgress?: (update: string) => void;
+}
+
+/**
+ * Cost tracking counters kept per LLMJudge instance and exposed through getCostTracker()
+ */
+export interface CostTracker {
+  /** Total input tokens (starts at 0). NOTE(review): never updated by the code in this file */
+  inputTokens: number;
+
+  /** Total output tokens (starts at 0). NOTE(review): never updated by the code in this file */
+  outputTokens: number;
+
+  /** Total cost in USD (starts at 0). NOTE(review): never updated by the code in this file */
+  costUsd: number;
+
+  /** Number of API calls; incremented by callClaude before each request is issued */
+  callCount: number;
+}
+
+// =============================================================================
+// Prompt Templates
+// =============================================================================
+
+const PROMPTS = { // prompt builders; each returns the full prompt text including the JSON reply schema the parser expects
+  /**
+   * Single-answer prompt: criteria, optional "Context:" section, the answer, then the score/reasoning/criticisms/strengths JSON schema
+   */
+  quality: (criteria: string, answer: string, context?: string) => {
+    const contextSection = context ? '\n\nContext:\n' + context : ''; // section is omitted entirely for an empty/missing context
+    return 'You are an expert code reviewer. Evaluate the following answer based on the criteria:\n\n' + criteria + contextSection + '\n\nAnswer to evaluate:\n' + answer + '\n\nProvide your evaluation in the following JSON format:\n{\n  "score": 0.0-1.0,\n  "reasoning": "Brief explanation of the score",\n  "criticisms": ["issue 1", "issue 2"],\n  "strengths": ["strength 1", "strength 2"]\n}\n\nThe score should be a number between 0.0 (poor) and 1.0 (excellent).';
+  },
+
+  /**
+   * Two-answer prompt: criteria, optional context, both answers, then the winner/score1/score2/reasoning JSON schema
+   */
+  comparison: (criteria: string, answer1: string, answer2: string, context?: string) => {
+    const contextSection = context ? '\n\nContext:\n' + context : '';
+    return 'You are an expert code reviewer. Compare the following two answers based on the criteria:\n\n' + criteria + contextSection + '\n\nAnswer 1:\n' + answer1 + '\n\nAnswer 2:\n' + answer2 + '\n\nProvide your comparison in the following JSON format:\n{\n  "winner": "answer1" | "answer2" | "tie",\n  "score1": { "score": 0.0-1.0, "reasoning": "...", "criticisms": [], "strengths": [] },\n  "score2": { "score": 0.0-1.0, "reasoning": "...", "criticisms": [], "strengths": [] },\n  "reasoning": "Overall comparison reasoning"\n}';
+  },
+
+  /**
+   * Baseline prompt: criteria, optional context, the human-graded baseline, the answer, then the single-score JSON schema
+   */
+  baseline: (criteria: string, answer: string, baseline: string, context?: string) => {
+    const contextSection = context ? '\n\nContext:\n' + context : '';
+    return 'You are an expert code reviewer. Evaluate the following answer against a human-graded baseline.\n\n' + criteria + contextSection + '\n\nBaseline (human-graded):\n' + baseline + '\n\nAnswer to evaluate:\n' + answer + '\n\nProvide your evaluation in the following JSON format:\n{\n  "score": 0.0-1.0,\n  "reasoning": "How this answer compares to the baseline",\n  "criticisms": ["issues compared to baseline"],\n  "strengths": ["strengths compared to baseline"]\n}';
+  },
+};
+
+// =============================================================================
+// LLM Judge Implementation
+// =============================================================================
+
+/**
+ * LLM Judge - Evaluates answers using Claude API; holds the resolved settings, a per-instance result cache and call counters
+ */
+export class LLMJudge {
+  private apiKey: string; // '' when no key could be resolved; callClaude rejects that case
+  private model: string;
+  private maxTokens: number; // NOTE(review): assigned in the constructor, not read elsewhere in this class
+  private temperature: number; // NOTE(review): assigned in the constructor, not read elsewhere in this class
+  private enableCache: boolean;
+  private projectRoot: string; // NOTE(review): assigned in the constructor, not read elsewhere in this class
+  private costTracker: CostTracker;
+  private cache: Map<string, LLMJudgeScore | ComparisonResult>; // keyed by generateCacheKey(); holds both result shapes
+
+  constructor(options: LLMJudgeOptions = {}) {
+    const projectRoot = options.projectRoot || process.cwd();
+    this.apiKey = options.apiKey || (getEnvVar('ANTHROPIC_API_KEY', projectRoot) || ''); // explicit option first, then getEnvVar, else ''
+    this.model = options.model || 'claude-3-5-sonnet-20241022';
+    this.maxTokens = options.maxTokens || 1024; // `||`: a value of 0 also falls back to 1024
+    this.temperature = options.temperature || 0.0;
+    this.enableCache = options.enableCache ?? true; // `??`: an explicit false is honoured
+    this.projectRoot = projectRoot;
+    this.costTracker = {
+      inputTokens: 0,
+      outputTokens: 0,
+      costUsd: 0,
+      callCount: 0,
+    };
+    this.cache = new Map();
+  }
+
+  /**
+   * Evaluate a single answer against `criteria` (optionally with `context`); resolves to the parsed score, reusing a cached one when caching is enabled
+   */
+  async evaluate(
+    criteria: string,
+    answer: string,
+    context?: string
+  ): Promise<LLMJudgeScore | null> {
+    const cacheKey = this.generateCacheKey('quality', criteria, answer, context || '');
+    if (this.enableCache && this.cache.has(cacheKey)) {
+      const cached = this.cache.get(cacheKey);
+      if (cached && 'score' in cached) { // only a single-score entry is reused; anything else falls through to a new request
+        return cached as LLMJudgeScore;
+      }
+    }
+
+    const prompt = PROMPTS.quality(criteria, answer, context);
+    const result = await this.callClaude(prompt);
+
+    if (this.enableCache && result) {
+      this.cache.set(cacheKey, result); // stored before the shape check below
+    }
+
+    if (!result) { // defensive: callClaude as written throws instead of resolving to null
+      return null;
+    }
+    
+    // Reject a comparison-shaped reply: this method only returns LLMJudgeScore
+    if ('score1' in result) {
+      throw new Error('Unexpected ComparisonResult returned from evaluate method');
+    }
+    
+    return result as LLMJudgeScore;
+  }
+
+  /**
+   * Compare two answers against `criteria` (optionally with `context`); resolves to the parsed comparison, reusing a cached one when caching is enabled
+   */
+  async compare(
+    criteria: string,
+    answer1: string,
+    answer2: string,
+    context?: string
+  ): Promise<ComparisonResult | null> {
+    const cacheKey = this.generateCacheKey('comparison', criteria, answer1, answer2, context || '');
+    if (this.enableCache && this.cache.has(cacheKey)) {
+      const cached = this.cache.get(cacheKey);
+      if (cached && 'score1' in cached) { // only a comparison-shaped entry is reused; anything else falls through to a new request
+        return cached as ComparisonResult;
+      }
+    }
+
+    const prompt = PROMPTS.comparison(criteria, answer1, answer2, context);
+    const result = await this.callClaude(prompt);
+
+    if (this.enableCache && result) {
+      this.cache.set(cacheKey, result); // stored before the shape check below
+    }
+
+    if (!result) { // defensive: callClaude as written throws instead of resolving to null
+      return null;
+    }
+    
+    // Reject a single-score reply (one without both score1 and score2): this method only returns ComparisonResult
+    if ('score' in result) {
+      throw new Error('Unexpected LLMJudgeScore returned from compare method');
+    }
+    
+    return result as ComparisonResult;
+  }
+
+  /**
+   * Evaluate an answer against a human-graded `baseline` for the given `criteria`; resolves to the parsed score, reusing a cached one when caching is enabled
+   */
+  async evaluateAgainstBaseline(
+    criteria: string,
+    answer: string,
+    baseline: string,
+    context?: string
+  ): Promise<LLMJudgeScore | null> {
+    const cacheKey = this.generateCacheKey('baseline', criteria, answer, baseline, context || '');
+    if (this.enableCache && this.cache.has(cacheKey)) {
+      const cached = this.cache.get(cacheKey);
+      if (cached && 'score' in cached) { // only a single-score entry is reused; anything else falls through to a new request
+        return cached as LLMJudgeScore;
+      }
+    }
+
+    const prompt = PROMPTS.baseline(criteria, answer, baseline, context);
+    const result = await this.callClaude(prompt);
+
+    if (this.enableCache && result) {
+      this.cache.set(cacheKey, result); // stored before the shape check below
+    }
+
+    if (!result) { // defensive: callClaude as written throws instead of resolving to null
+      return null;
+    }
+    
+    // Reject a comparison-shaped reply: this method only returns LLMJudgeScore
+    if ('score1' in result) {
+      throw new Error('Unexpected ComparisonResult returned from evaluateAgainstBaseline method');
+    }
+    
+    return result as LLMJudgeScore;
+  }
+
+  /**
+   * Call Claude through the Agent SDK `query` stream and parse the first successful result message; throws when no key is set, no such message arrives, or parsing fails
+   */
+  private async callClaude(prompt: string): Promise<LLMJudgeScore | ComparisonResult | null> {
+    if (!this.apiKey) {
+      throw new Error('ANTHROPIC_API_KEY not set');
+    }
+
+    this.costTracker.callCount++;
+
+    // Dynamic import of SDK
+    const sdk = await import('@anthropic-ai/claude-agent-sdk');
+
+    const response = await sdk.query({
+      prompt,
+      options: { // Note: system prompt is not supported in this SDK version
+        model: this.model,
+        // The key may come from options.apiKey or a .env lookup rather than this process's environment, so pass it on explicitly
+        env: { ...process.env, ANTHROPIC_API_KEY: this.apiKey },
+        settingSources: [],
+      },
+    });
+    let result: LLMJudgeScore | ComparisonResult | null = null;
+
+    for await (const message of response) {
+      if (message.type === 'result' && message.subtype === 'success' && (message as { result?: string }).result) {
+        const content = (message as { result?: string }).result as string || '';
+        result = this.parseResponse(content);
+        break;
+      }
+    }
+
+    if (!result) { // parseResponse throws on bad input, so reaching here means no success message was seen
+      throw new Error('No successful result message received from LLM');
+    }
+
+    return result;
+  }
+
+  /**
+   * Parse LLM response into structured score or comparison; every failure is rethrown as 'Failed to parse LLM response: <cause>'
+   */
+  private parseResponse(content: string): LLMJudgeScore | ComparisonResult | null {
+    try {
+      // Extract JSON from response (handle markdown code blocks): greedy match from the first '{' to the last '}'
+      const jsonMatch = content.match(/\{[\s\S]*\}/);
+      if (!jsonMatch) {
+        throw new Error('No JSON found in response');
+      }
+
+      const data = JSON.parse(jsonMatch[0]);
+
+      // Check if this is a comparison result (has score1 and score2)
+      if (data.score1 && data.score2) {
+        return {
+          winner: data.winner, // copied without validation
+          score1: {
+            score: this.normalizeScore(data.score1.score),
+            passed: this.normalizeScore(data.score1.score) >= 0.7, // fixed pass threshold
+            reasoning: data.score1.reasoning || '',
+            criticisms: data.score1.criticisms || [],
+            strengths: data.score1.strengths || [],
+          },
+          score2: {
+            score: this.normalizeScore(data.score2.score),
+            passed: this.normalizeScore(data.score2.score) >= 0.7, // fixed pass threshold
+            reasoning: data.score2.reasoning || '',
+            criticisms: data.score2.criticisms || [],
+            strengths: data.score2.strengths || [],
+          },
+          reasoning: data.reasoning || '',
+        };
+      }
+
+      // Otherwise, this is a single score (a missing `score` field is normalized to 0)
+      return {
+        score: this.normalizeScore(data.score),
+        passed: this.normalizeScore(data.score) >= 0.7, // fixed pass threshold
+        reasoning: data.reasoning || '',
+        criticisms: data.criticisms || [],
+        strengths: data.strengths || [],
+      };
+    } catch (err) { // also catches the 'No JSON found' error thrown above
+      throw new Error('Failed to parse LLM response: ' + (err as Error).message);
+    }
+  }
+
+  /**
+   * Normalize score to 0.0-1.0 range.
+   * Numbers and numeric strings are clamped; NaN and any other input yield 0.
+   */
+  private normalizeScore(score: unknown): number {
+    // Coerce strings first so both input kinds share one NaN check and one clamp.
+    const value = typeof score === 'string' ? parseFloat(score) : score;
+    if (typeof value !== 'number' || Number.isNaN(value)) {
+      // A NaN number used to pass through Math.min/Math.max unchanged.
+      return 0;
+    }
+    return Math.max(0, Math.min(1, value));
+  }
+
+  /**
+   * Generate cache key from the request type, the model and every argument in full
+   */
+  private generateCacheKey(
+    type: string,
+    ...args: string[]
+  ): string {
+    // No truncation: a 200-character prefix let requests that share their opening text collide and reuse the wrong verdict.
+    return type + ':' + this.model + ':' + JSON.stringify(args);
+  }
+
+  /**
+   * Get cost tracking counters as a shallow copy, so callers cannot modify the judge's own tracker
+   */
+  getCostTracker(): CostTracker {
+    return { ...this.costTracker };
+  }
+
+  /**
+   * Clear cache: drops every stored result of this judge instance
+   */
+  clearCache(): void {
+    this.cache.clear();
+  }
+
+  /**
+   * Get cache size: the number of results currently stored by this judge instance
+   */
+  getCacheSize(): number {
+    return this.cache.size;
+  }
+}
+
+// =============================================================================
+// Evaluator Implementation
+// =============================================================================
+
+/**
+ * Run LLM judge evaluator: score one answer with the criterion selected by `evaluator.evaluate`.
+ * @param evaluator - evaluator config; `evaluate` picks a built-in criterion or the custom `prompt`
+ * @param answer - text to be judged
+ * @param context - optional context inserted into the prompt
+ * @returns the judged result; any failure is reported as score 0 / passed false with the error message as evidence
+ */
+export async function runLLMJudgeEvaluator(
+  evaluator: LLMJudgeEvaluator,
+  answer: string,
+  context?: string
+): Promise<EvaluatorResult> {
+  const startTime = Date.now();
+  // A fresh judge per call: its in-memory cache starts empty, so nothing is reused across evaluator runs.
+  const judge = new LLMJudge({ model: evaluator.model, projectRoot: process.cwd() });
+
+  try {
+    let score: LLMJudgeScore | null = null;
+
+    switch (evaluator.evaluate) {
+      case 'code_quality':
+        score = await judge.evaluate(
+          'Code quality: Is the code well-structured, readable, and maintainable?',
+          answer,
+          context
+        );
+        break;
+
+      case 'readability':
+        score = await judge.evaluate(
+          'Readability: Is the code easy to understand and follow?',
+          answer,
+          context
+        );
+        break;
+
+      case 'documentation':
+        score = await judge.evaluate(
+          'Documentation: Is the code well-documented with clear comments and explanations?',
+          answer,
+          context
+        );
+        break;
+
+      case 'custom':
+        if (!evaluator.prompt) {
+          throw new Error('Custom evaluation requires a prompt');
+        }
+        score = await judge.evaluate(evaluator.prompt, answer, context);
+        break;
+
+      default:
+        throw new Error('Unknown evaluation type: ' + evaluator.evaluate);
+    }
+
+    if (!score) {
+      throw new Error('LLM judge evaluation failed to produce a score');
+    }
+
+    const durationMs = Date.now() - startTime;
+
+    return {
+      name: evaluator.name || 'llm_judge',
+      type: 'llm_judge_comparison', // NOTE(review): same tag as the comparison evaluator although evaluator.type is 'llm_judge' - confirm against EvaluatorResult
+      score: score.score,
+      passed: score.passed,
+      evidence: score.reasoning,
+      details: {
+        criticisms: score.criticisms,
+        strengths: score.strengths,
+        cost: judge.getCostTracker(),
+      },
+      durationMs,
+    };
+  } catch (err) {
+    const message = err instanceof Error ? err.message : String(err); // a thrown non-Error value still yields readable evidence
+
+    return {
+      name: evaluator.name || 'llm_judge',
+      type: 'llm_judge_comparison',
+      score: 0,
+      passed: false,
+      evidence: message,
+      details: {
+        error: message,
+      },
+      durationMs: Date.now() - startTime,
+    };
+  }
+}
+
+// =============================================================================
+// Comparison Evaluator
+// =============================================================================
+
+/**
+ * Run LLM judge comparison evaluator: ask the judge which of two answers is better.
+ * @param evaluator - evaluator config; only `model` and `name` are used here
+ * @param answer1 - the answer being scored (a win scores 1.0, a tie 0.5)
+ * @param answer2 - the answer it is compared with
+ * @param context - optional context inserted into the prompt
+ * @returns the comparison result; any failure is reported as score 0 / passed false with the error message as evidence
+ */
+export async function runLLMJudgeComparisonEvaluator(
+  evaluator: LLMJudgeEvaluator,
+  answer1: string,
+  answer2: string,
+  context?: string
+): Promise<EvaluatorResult> {
+  const startTime = Date.now();
+  const judge = new LLMJudge({ model: evaluator.model, projectRoot: process.cwd() }); // fresh judge per call, so its cache starts empty
+
+  try {
+    const result = await judge.compare(
+      'Compare the quality and correctness of these two answers.',
+      answer1,
+      answer2,
+      context
+    );
+
+    if (!result) {
+      throw new Error('LLM judge comparison failed to produce a result');
+    }
+
+    const { winner } = result; // may be missing or unexpected: the parser copies it from the reply unvalidated
+
+    return {
+      name: evaluator.name || 'llm_judge_comparison',
+      type: 'llm_judge_comparison',
+      score: winner === 'tie' ? 0.5 : winner === 'answer1' ? 1.0 : 0.0,
+      passed: winner === 'answer1' || winner === 'tie', // a missing/unknown winner scores 0, so it must not count as a pass
+      evidence: result.reasoning,
+      details: {
+        winner,
+        score1: result.score1,
+        score2: result.score2,
+        cost: judge.getCostTracker(),
+      },
+      durationMs: Date.now() - startTime,
+    };
+  } catch (err) {
+    const message = err instanceof Error ? err.message : String(err); // a thrown non-Error value still yields readable evidence
+
+    return {
+      name: evaluator.name || 'llm_judge_comparison',
+      type: 'llm_judge_comparison',
+      score: 0,
+      passed: false,
+      evidence: message,
+      details: {
+        error: message,
+      },
+      durationMs: Date.now() - startTime,
+    };
+  }
+}
diff --git a/src/evaluation/llm-judge.ts.bak b/src/evaluation/llm-judge.ts.bak
new file mode 100644
index 0000000..d95100b
--- /dev/null
+++ b/src/evaluation/llm-judge.ts.bak
@@ -0,0 +1,559 @@
+/**
+ * LLM Judge Evaluator - Uses Claude API to evaluate answers
+ *
+ * Provides structured evaluation of agent answers against baselines
+ * or quality criteria using LLM-based judgment.
+ */
+
+import { getEnvVar } from '../utils/env';
+import type { LLMJudgeEvaluator, EvaluatorResult } from '../cases/types';
+
+// =============================================================================
+// Types
+// =============================================================================
+
+/**
+ * Score from LLM evaluation
+ */
+export interface LLMJudgeScore {
+  /** Overall score from 0.0 to 1.0 */
+  score: number;
+
+  /** Whether the answer passed (score >= threshold) */
+  passed: boolean;
+
+  /** Reasoning for the score */
+  reasoning: string;
+
+  /** Criticisms or issues found */
+  criticisms?: string[];
+
+  /** Strengths identified */
+  strengths?: string[];
+}
+
+/**
+ * Comparison result between two answers
+ */
+export interface ComparisonResult {
+  /** Which answer is better (if any) */
+  winner?: 'answer1' | 'answer2' | 'tie';
+
+  /** Score for answer 1 */
+  score1: LLMJudgeScore;
+
+  /** Score for answer 2 */
+  score2: LLMJudgeScore;
+
+  /** Overall comparison reasoning */
+  reasoning: string;
+}
+
+/**
+ * Evaluation options
+ */
+export interface LLMJudgeOptions {
+  /** Model to use for evaluation (default: claude-3-5-sonnet-20241022) */
+  model?: string;
+
+  /** API key (defaults to ANTHROPIC_API_KEY env var) */
+  apiKey?: string;
+
+  /** Maximum tokens for response */
+  maxTokens?: number;
+
+  /** Temperature for generation (0.0-1.0) */
+  temperature?: number;
+
+  /** Enable caching to reduce costs */
+  enableCache?: boolean;
+
+  /** Project root for .env file loading */
+  projectRoot?: string;
+
+  /** Callback for progress updates */
+  onProgress?: (update: string) => void;
+}
+
+/**
+ * Cost tracking
+ */
+export interface CostTracker {
+  /** Total input tokens */
+  inputTokens: number;
+
+  /** Total output tokens */
+  outputTokens: number;
+
+  /** Total cost in USD */
+  costUsd: number;
+
+  /** Number of API calls */
+  callCount: number;
+}
+
+// =============================================================================
+// Prompt Templates
+// =============================================================================
+
+const PROMPTS = {
+  /**
+   * Evaluate a single answer on quality criteria
+   */
+  quality: (criteria: string, answer: string, context?: string) => {
+    const contextSection = context ? '\n\nContext:\n' + context : '';
+    return 'You are an expert code reviewer. Evaluate the following answer based on the criteria:\n\n' + criteria + contextSection + '\n\nAnswer to evaluate:\n' + answer + '\n\nProvide your evaluation in the following JSON format:\n{\n  "score": 0.0-1.0,\n  "reasoning": "Brief explanation of the score",\n  "criticisms": ["issue 1", "issue 2"],\n  "strengths": ["strength 1", "strength 2"]\n}\n\nThe score should be a number between 0.0 (poor) and 1.0 (excellent).';
+  },
+
+  /**
+   * Compare two answers
+   */
+  comparison: (criteria: string, answer1: string, answer2: string, context?: string) => {
+    const contextSection = context ? '\n\nContext:\n' + context : '';
+    return 'You are an expert code reviewer. Compare the following two answers based on the criteria:\n\n' + criteria + contextSection + '\n\nAnswer 1:\n' + answer1 + '\n\nAnswer 2:\n' + answer2 + '\n\nProvide your comparison in the following JSON format:\n{\n  "winner": "answer1" | "answer2" | "tie",\n  "score1": { "score": 0.0-1.0, "reasoning": "...", "criticisms": [], "strengths": [] },\n  "score2": { "score": 0.0-1.0, "reasoning": "...", "criticisms": [], "strengths": [] },\n  "reasoning": "Overall comparison reasoning"\n}';
+  },
+
+  /**
+   * Evaluate against a baseline
+   */
+  baseline: (criteria: string, answer: string, baseline: string, context?: string) => {
+    const contextSection = context ? '\n\nContext:\n' + context : '';
+    return 'You are an expert code reviewer. Evaluate the following answer against a human-graded baseline.\n\n' + criteria + contextSection + '\n\nBaseline (human-graded):\n' + baseline + '\n\nAnswer to evaluate:\n' + answer + '\n\nProvide your evaluation in the following JSON format:\n{\n  "score": 0.0-1.0,\n  "reasoning": "How this answer compares to the baseline",\n  "criticisms": ["issues compared to baseline"],\n  "strengths": ["strengths compared to baseline"]\n}';
+  },
+};
+
+// =============================================================================
+// LLM Judge Implementation
+// =============================================================================
+
+/**
+ * LLM Judge - Evaluates answers using Claude API
+ */
+export class LLMJudge {
+  private apiKey: string;
+  private model: string;
+  private maxTokens: number;
+  private temperature: number;
+  private enableCache: boolean;
+  private projectRoot: string;
+  private costTracker: CostTracker;
+  private cache: Map<string, LLMJudgeScore | ComparisonResult>;
+
+  constructor(options: LLMJudgeOptions = {}) {
+    const projectRoot = options.projectRoot || process.cwd();
+    this.apiKey = options.apiKey || (getEnvVar('ANTHROPIC_API_KEY', projectRoot) || '');
+    this.model = options.model || 'claude-3-5-sonnet-20241022';
+    this.maxTokens = options.maxTokens || 1024;
+    this.temperature = options.temperature || 0.0;
+    this.enableCache = options.enableCache ?? true;
+    this.projectRoot = projectRoot;
+    this.costTracker = {
+      inputTokens: 0,
+      outputTokens: 0,
+      costUsd: 0,
+      callCount: 0,
+    };
+    this.cache = new Map();
+  }
+
+  /**
+   * Evaluate a single answer
+   */
+  async evaluate(
+    criteria: string,
+    answer: string,
+    context?: string
+  ): Promise<LLMJudgeScore | null> {
+    const cacheKey = this.generateCacheKey('quality', criteria, answer, context || '');
+    if (this.enableCache && this.cache.has(cacheKey)) {
+      const cached = this.cache.get(cacheKey);
+      if (cached && 'score' in cached) {
+        return cached as LLMJudgeScore;
+      }
+    }
+
+    const prompt = PROMPTS.quality(criteria, answer, context);
+    const result = await this.callClaude(prompt);
+
+    if (this.enableCache && result) {
+      this.cache.set(cacheKey, result);
+    }
+
+    if (!result) {
+      return null;
+    }
+    
+    // Ensure we return LLMJudgeScore, not ComparisonResult
+    if ('score1' in result) {
+      throw new Error('Unexpected ComparisonResult returned from evaluate method');
+    }
+    
+    return result as LLMJudgeScore;
+  }
+
+  /**
+   * Judge two answers against each other, reusing a cached comparison when possible.
+   */
+  async compare(
+    criteria: string,
+    answer1: string,
+    answer2: string,
+    context?: string
+  ): Promise<ComparisonResult | null> {
+    const key = this.generateCacheKey('comparison', criteria, answer1, answer2, context || '');
+
+    // Serve from cache only when the stored entry has the comparison shape.
+    if (this.enableCache && this.cache.has(key)) {
+      const hit = this.cache.get(key);
+      if (hit && 'score1' in hit) {
+        return hit as ComparisonResult;
+      }
+    }
+
+    const verdict = await this.callClaude(PROMPTS.comparison(criteria, answer1, answer2, context));
+    if (!verdict) {
+      return null;
+    }
+
+    if (this.enableCache) {
+      this.cache.set(key, verdict);
+    }
+
+    // A top-level `score` marks the single-score shape, which is not valid here.
+    if ('score' in verdict) {
+      throw new Error('Unexpected LLMJudgeScore returned from compare method');
+    }
+
+    return verdict as ComparisonResult;
+  }
+
+  /**
+   * Score an answer relative to a baseline answer, reusing a cached score when possible.
+   */
+  async evaluateAgainstBaseline(
+    criteria: string,
+    answer: string,
+    baseline: string,
+    context?: string
+  ): Promise<LLMJudgeScore | null> {
+    const key = this.generateCacheKey('baseline', criteria, answer, baseline, context || '');
+
+    // Serve from cache only when the stored entry has the single-score shape.
+    if (this.enableCache && this.cache.has(key)) {
+      const hit = this.cache.get(key);
+      if (hit && 'score' in hit) {
+        return hit as LLMJudgeScore;
+      }
+    }
+
+    const verdict = await this.callClaude(PROMPTS.baseline(criteria, answer, baseline, context));
+    if (!verdict) {
+      return null;
+    }
+
+    if (this.enableCache) {
+      this.cache.set(key, verdict);
+    }
+
+    // A `score1` field marks the comparison shape, which is not valid here.
+    if ('score1' in verdict) {
+      throw new Error('Unexpected ComparisonResult returned from evaluateAgainstBaseline method');
+    }
+
+    return verdict as LLMJudgeScore;
+  }
+
+  /**
+   * Run one prompt through the Claude Agent SDK and parse the first successful result.
+   * Throws if no API key is configured or if no result could be parsed from the stream.
+   */
+  private async callClaude(prompt: string): Promise<LLMJudgeScore | ComparisonResult | null> {
+    if (!this.apiKey) {
+      throw new Error('ANTHROPIC_API_KEY not set');
+    }
+
+    this.costTracker.callCount++;
+
+    // The SDK is loaded with a dynamic import at call time rather than at module load.
+    const sdk = await import('@anthropic-ai/claude-agent-sdk');
+
+    const response = await sdk.query({
+      prompt,
+      options: {
+        model: this.model,
+        // Note: system prompt is not supported in this SDK version
+        settingSources: [],
+      },
+    });
+
+    // parseResponse can yield either result shape, so the local must accept both.
+    let result: LLMJudgeScore | ComparisonResult | null = null;
+    for await (const message of response) {
+      if (message.type === 'result' && message.subtype === 'success' && (message as any).result) {
+        result = this.parseResponse((message as any).result);
+        break;
+      }
+    }
+
+    if (!result) {
+      throw new Error('Failed to parse LLM response');
+    }
+
+    return result;
+  }
+
+  /**
+   * Convert the raw model reply into a single score or a two-answer comparison.
+   *
+   * A reply counts as a comparison when it carries both `score1` and `score2`.
+   * Every failure is rethrown with the 'Failed to parse LLM response: ' prefix.
+   * No path returns null; `null` stays in the signature for callers.
+   */
+  private parseResponse(content: string): LLMJudgeScore | ComparisonResult | null {
+    // Shapes one score-like record: clamp the score, derive `passed`, default the rest.
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- raw is unvalidated JSON
+    const toScore = (raw: any): LLMJudgeScore => {
+      const clamped = this.normalizeScore(raw.score);
+      // 0.7 is the pass threshold applied to the clamped score.
+      return {
+        score: clamped,
+        passed: clamped >= 0.7,
+        reasoning: raw.reasoning || '',
+        criticisms: raw.criticisms || [],
+        strengths: raw.strengths || [],
+      };
+    };
+
+    try {
+      // Greedy match from the first '{' to the last '}', so text around the JSON is dropped.
+      const found = content.match(/\{[\s\S]*\}/);
+      if (!found) {
+        throw new Error('No JSON found in response');
+      }
+
+      const data = JSON.parse(found[0]);
+
+      // Both per-answer records present: this is a comparison payload.
+      if (data.score1 && data.score2) {
+        return {
+          winner: data.winner,
+          score1: toScore(data.score1),
+          score2: toScore(data.score2),
+          reasoning: data.reasoning || '',
+        };
+      }
+
+      // Otherwise treat the object itself as a single score.
+      return toScore(data);
+    } catch (err) {
+      throw new Error('Failed to parse LLM response: ' + (err as Error).message);
+    }
+  }
+
+  /**
+   * Coerce a raw score (number or numeric string) into the 0.0-1.0 range.
+   * NaN, unparsable strings, and every other input type collapse to 0.
+   */
+  private normalizeScore(score: unknown): number {
+    // Strings go through parseFloat; everything else is inspected as-is.
+    const value = typeof score === 'string' ? parseFloat(score) : score;
+    if (typeof value !== 'number' || Number.isNaN(value)) {
+      return 0;
+    }
+    // Clamp rather than rescale: out-of-range values saturate at the nearest bound.
+    return Math.max(0, Math.min(1, value));
+  }
+
+  /**
+   * Build the cache key for one request from its type, the model, and the full inputs.
+   * Inputs are not truncated: a shortened key lets answers sharing a long prefix collide.
+   */
+  private generateCacheKey(
+    type: string,
+    ...args: string[]
+  ): string {
+    return JSON.stringify([type, this.model, ...args]);
+  }
+
+  /**
+   * Return a shallow copy of the usage counters, so callers cannot mutate the tracker.
+   */
+  getCostTracker(): CostTracker {
+    return Object.assign({}, this.costTracker);
+  }
+
+  /**
+   * Remove every cached judge result; later calls with the same inputs query the model again.
+   */
+  clearCache(): void {
+    this.cache.clear();
+  }
+
+  /**
+   * Number of results currently held in the cache (scores and comparisons share one map).
+   */
+  getCacheSize(): number {
+    return this.cache.size;
+  }
+}
+
+// =============================================================================
+// Evaluator Implementation
+// =============================================================================
+
+/**
+ * Score one answer with the LLM judge; errors during evaluation become a failed, zero-score result.
+ */
+export async function runLLMJudgeEvaluator(
+  evaluator: LLMJudgeEvaluator,
+  answer: string,
+  context?: string
+): Promise<EvaluatorResult> {
+  const startTime = Date.now();
+  const options: LLMJudgeOptions = {
+    model: evaluator.model,
+    projectRoot: process.cwd(),
+  };
+
+  const judge = new LLMJudge(options);
+
+  try {
+    let score: LLMJudgeScore | null = null;
+
+    switch (evaluator.evaluate) {
+      case 'code_quality':
+        score = await judge.evaluate(
+          'Code quality: Is the code well-structured, readable, and maintainable?',
+          answer,
+          context
+        );
+        break;
+
+      case 'readability':
+        score = await judge.evaluate(
+          'Readability: Is the code easy to understand and follow?',
+          answer,
+          context
+        );
+        break;
+
+      case 'documentation':
+        score = await judge.evaluate(
+          'Documentation: Is the code well-documented with clear comments and explanations?',
+          answer,
+          context
+        );
+        break;
+
+      case 'custom':
+        if (!evaluator.prompt) {
+          throw new Error('Custom evaluation requires a prompt');
+        }
+        score = await judge.evaluate(evaluator.prompt, answer, context || undefined);
+        break;
+
+      default:
+        throw new Error('Unknown evaluation type: ' + evaluator.evaluate);
+    }
+
+    if (!score) {
+      throw new Error('LLM judge evaluation failed to produce a score');
+    }
+
+    const durationMs = Date.now() - startTime;
+
+    return {
+      name: evaluator.name || 'llm_judge',
+      type: 'llm_judge', // single-answer judge; the pairwise runner reports 'llm_judge_comparison'
+      score: score.score,
+      passed: score.passed,
+      evidence: score.reasoning,
+      details: {
+        criticisms: score.criticisms,
+        strengths: score.strengths,
+        cost: judge.getCostTracker(),
+      },
+      durationMs,
+    };
+  } catch (err) {
+    const durationMs = Date.now() - startTime;
+    const message = err instanceof Error ? err.message : String(err);
+    return {
+      name: evaluator.name || 'llm_judge',
+      type: 'llm_judge',
+      score: 0,
+      passed: false,
+      evidence: message,
+      details: {
+        error: message,
+      },
+      durationMs,
+    };
+  }
+}
+
+// =============================================================================
+// Comparison Evaluator
+// =============================================================================
+
+/**
+ * Compare two answers with the LLM judge; errors become a failed, zero-score result.
+ */
+export async function runLLMJudgeComparisonEvaluator(
+  evaluator: LLMJudgeEvaluator,
+  answer1: string,
+  answer2: string,
+  context?: string
+): Promise<EvaluatorResult> {
+  const startTime = Date.now();
+  const options: LLMJudgeOptions = {
+    model: evaluator.model,
+    projectRoot: process.cwd(),
+  };
+
+  const judge = new LLMJudge(options);
+
+  try {
+    const result = await judge.compare(
+      'Compare the quality and correctness of these two answers.',
+      answer1,
+      answer2,
+      context || undefined
+    );
+
+    if (!result) {
+      throw new Error('LLM judge comparison failed to produce a result');
+    }
+
+    const durationMs = Date.now() - startTime;
+    // `winner` comes from unvalidated model JSON: only known winning values may pass.
+    return {
+      name: evaluator.name || 'llm_judge_comparison',
+      type: 'llm_judge_comparison',
+      score: result.winner === 'tie' ? 0.5 : result.winner === 'answer1' ? 1.0 : 0.0,
+      passed: result.winner === 'answer1' || result.winner === 'tie',
+      evidence: result.reasoning,
+      details: {
+        winner: result.winner,
+        score1: result.score1,
+        score2: result.score2,
+        cost: judge.getCostTracker(),
+      },
+      durationMs,
+    };
+  } catch (err) {
+    const durationMs = Date.now() - startTime;
+    const message = err instanceof Error ? err.message : String(err);
+    return {
+      name: evaluator.name || 'llm_judge_comparison',
+      type: 'llm_judge_comparison',
+      score: 0,
+      passed: false,
+      evidence: message,
+      details: {
+        error: message,
+      },
+      durationMs,
+    };
+  }
+}
diff --git a/src/evaluation/runner.ts b/src/evaluation/runner.ts
index 302c91b..f3bb482 100644
--- a/src/evaluation/runner.ts
+++ b/src/evaluation/runner.ts
@@ -25,6 +25,7 @@ import { Sandbox, SandboxConfig } from '../sandbox/types';
 import { getRubricRegistry } from '../rubrics/loader';
 import { getAgent } from '../agents/registry';
 import type { AgentResult } from '../agents/types';
+// Disabled until the LLM judge is wired in: import { runLLMJudgeEvaluator } from './llm-judge';
 
 export interface RunnerOptions {
   /** Agent being evaluated (for logging) */
@@ -95,7 +96,7 @@ export async function runCases(cases: Case[], options: RunnerOptions): Promise<R
   }
 
   const manager = createSandboxManager();
-  let runRubricId = 'default';
+  let rubricId = 'default';
 
   try {
     for (let i = 0; i < cases.length; i++) {
@@ -117,16 +118,18 @@ export async function runCases(cases: Case[], options: RunnerOptions): Promise<R
         if (i === 0) {
           const registry = getRubricRegistry();
           const rubric = registry.resolve(caseData.rubric);
-          runRubricId = rubric.id;
+          rubricId = rubric.id;
         }
       } catch (err) {
         const errorResult: CaseResult = {
-          caseId: caseData.id,
+          id: caseData.id,
+          title: caseData.title,
           score: 0,
           passed: false,
-          criteriaResults: [],
+          evidence: (err as Error).message,
+          criteria: [],
+          evaluators: [],
           durationMs: 0,
-          timedOut: false,
           error: (err as Error).message,
           timestamp: new Date(),
         };
@@ -157,13 +160,13 @@ export async function runCases(cases: Case[], options: RunnerOptions): Promise<R
   };
 
   return {
-    runId,
-    startedAt,
-    completedAt,
-    agent: options.agent,
-    rubricId: runRubricId,
-    caseResults: results,
+    id: runId,
+    timestamp: startedAt,
+    cases: results,
     summary,
+    durationMs: totalDurationMs,
+    agent: options.agent,
+    rubricId,
   };
 }
 
@@ -229,19 +232,19 @@ async function runSingleCase(
       });
 
       const agent = getAgent(options.agent);
-      const agentResult: AgentResult = await agent.run(caseData.prompt, {
+      const _agentResult: AgentResult = await agent.run(caseData.prompt, {
         cwd: tempDir,
         model: options.model,
         timeoutMs: (options.timeoutSeconds || 300) * 1000,
         permissionMode: 'acceptEdits',
       });
 
-      if (!agentResult.success) {
-        throw new Error(`Agent execution failed: ${agentResult.error}`);
+      if (!_agentResult.success) {
+        throw new Error(`Agent execution failed: ${_agentResult.error}`);
       }
 
       // Snapshot files the agent produced (before rubric evaluation)
-      const agentFiles = snapshotFiles(tempDir, caseData.files);
+      const _agentFiles = snapshotFiles(tempDir, caseData.files);
 
       // Evaluate using the rubric
       options.onProgress?.({
@@ -252,7 +255,7 @@ async function runSingleCase(
         message: 'Evaluating with rubric...',
       });
 
-      const result = await evaluateWithRubric(caseData, sandbox, options);
+      const result = await evaluateWithRubric(caseData, sandbox, options, _agentResult, _agentFiles);
       const durationMs = Date.now() - startTime;
 
       options.onProgress?.({
@@ -265,21 +268,21 @@ async function runSingleCase(
 
       return {
         ...result,
-        agentResponse: agentResult.answer,
-        agentToolCalls: agentResult.toolCalls.map((t) => ({
+        agentResponse: _agentResult.answer,
+        agentToolCalls: _agentResult.toolCalls.map((t) => ({
           name: t.name,
-          durationMs: t.durationMs,
-          success: t.success,
+          durationMs: t.durationMs || 0,
+          success: t.success || false,
         })),
-        agentModel: agentResult.model,
-        agentTokens: agentResult.tokens
+        agentModel: _agentResult.model,
+        agentTokens: _agentResult.tokens
           ? {
-              input: agentResult.tokens.inputTokens,
-              output: agentResult.tokens.outputTokens,
-              total: agentResult.tokens.totalTokens,
+              input: _agentResult.tokens.inputTokens,
+              output: _agentResult.tokens.outputTokens,
+              total: _agentResult.tokens.totalTokens,
             }
           : undefined,
-        agentFiles,
+        agentFiles: _agentFiles,
         durationMs,
         timestamp: new Date(),
       };
@@ -302,14 +305,17 @@ async function runSingleCase(
 async function evaluateWithRubric(
   caseData: Case,
   sandbox: Sandbox,
-  _options: RunnerOptions
-): Promise<Omit<CaseResult, 'durationMs' | 'timestamp'>> {
+  _options: RunnerOptions,
+  _agentResult: AgentResult,
+  _agentFiles: { path: string; content: string; changed: boolean }[]
+): Promise<CaseResult> {
   const registry = getRubricRegistry();
   const rubric = registry.resolve(caseData.rubric);
 
   const criteriaResults: CriterionResult[] = [];
   let totalWeightedScore = 0;
   let _totalWeight = 0;
+  const evalStartTime = Date.now();
 
   // Evaluate each criterion in the rubric
   for (const [criterionKey, criterion] of Object.entries(rubric.criteria)) {
@@ -318,7 +324,6 @@ async function evaluateWithRubric(
     let evaluatorCount = 0;
 
     for (const evaluator of criterion.evaluators) {
-      const evalStartTime = Date.now();
       let evalResult: Omit<EvaluatorResult, 'name' | 'type' | 'durationMs'>;
 
       if (evaluator.type === 'command') {
@@ -362,6 +367,15 @@ async function evaluateWithRubric(
           score: 0.0,
           evidence: 'Pattern check not yet implemented',
         };
+      } else if ((evaluator.type as EvaluatorType) === 'llm_judge' || (evaluator.type as EvaluatorType) === 'llm_judge_comparison') {
+        // Run LLM judge evaluator
+        // TODO: Implement baseline answer storage and comparison
+        // For now, use a placeholder evaluator
+        evalResult = {
+          passed: false,
+          score: 0.0,
+          evidence: 'LLM judge comparison not yet fully implemented',
+        };
       } else {
         // Other evaluator types (llm_judge, benchmark, etc.) - not implemented
         evalResult = {
@@ -370,13 +384,10 @@ async function evaluateWithRubric(
           evidence: `Evaluator type '${evaluator.type}' not yet implemented`,
         };
       }
-
-      const evalDurationMs = Date.now() - evalStartTime;
-
       evaluatorResults.push({
         name: evaluator.name || evaluator.type,
         type: evaluator.type as EvaluatorType,
-        durationMs: evalDurationMs,
+        durationMs: Date.now() - evalStartTime,
         ...evalResult,
       });
 
@@ -397,9 +408,10 @@ async function evaluateWithRubric(
       name: criterionKey,
       weight: criterion.weight,
       score: rawScore,
-      weightedScore,
       passed: allPassed,
+      evidence: `Criterion: ${criterionKey}`,
       evaluatorResults,
+      durationMs: Date.now() - evalStartTime,
     });
 
     totalWeightedScore += weightedScore;
@@ -420,13 +432,18 @@ async function evaluateWithRubric(
   const passThreshold = 70;
   const passed = overallScore >= passThreshold;
 
-  return {
-    caseId: caseData.id,
+  const result: CaseResult = {
+    id: caseData.id,
+    title: caseData.title,
     score: overallScore,
     passed,
-    criteriaResults,
-    timedOut: false,
+    evidence: `Overall score: ${overallScore.toFixed(2)}%`,
+    criteria: criteriaResults,
+    evaluators: [],
+    durationMs: Date.now() - evalStartTime,
+    timestamp: new Date(),
   };
+  return result;
 }
 
 /**
diff --git a/src/evaluation/runner.ts.bak b/src/evaluation/runner.ts.bak
new file mode 100644
index 0000000..dd12e57
--- /dev/null
+++ b/src/evaluation/runner.ts.bak
@@ -0,0 +1,555 @@
+/**
+ * Evaluation runner - executes cases in sandboxes and evaluates results
+ *
+ * This is the core evaluation engine that:
+ * 1. Sets up the sandbox environment
+ * 2. Runs the case (agent attempts to solve the problem)
+ * 3. Applies the rubric to evaluate the result
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import {
+  Case,
+  CaseFile,
+  CaseResult,
+  CriterionResult,
+  EvaluatorResult,
+  RunResult,
+  RunSummary,
+  EvaluatorType,
+} from '../cases/types';
+import { createSandboxManager, checkDocker, RECOMMENDED_IMAGES } from '../sandbox';
+import { Sandbox, SandboxConfig } from '../sandbox/types';
+import { getRubricRegistry } from '../rubrics/loader';
+import { getAgent } from '../agents/registry';
+import type { AgentResult } from '../agents/types';
+// Disabled until the LLM judge is wired in: import { runLLMJudgeEvaluator } from './llm-judge';
+
+export interface RunnerOptions {
+  /** Agent name; resolved with getAgent() to run each case and echoed on the RunResult */
+  agent: string;
+
+  /** Model passed through to agent.run(); optional */
+  model?: string;
+
+  /** Timeout per case in seconds, applied to the sandbox and the agent run; defaults to 300 */
+  timeoutSeconds?: number;
+
+  /** Enable network in sandbox; defaults to false */
+  networkEnabled?: boolean;
+
+  /** Callback for progress updates emitted while cases run */
+  onProgress?: (update: ProgressUpdate) => void;
+
+  /** Callback invoked once per case with its result, including cases recorded as errors */
+  onCaseComplete?: (result: CaseResult) => void;
+}
+
+export interface ProgressUpdate {
+  type: 'starting' | 'running' | 'validating' | 'complete' | 'error'; // phase being reported
+  caseId: string; // id of the case this update refers to
+  caseIndex: number; // zero-based position of the case within the run
+  totalCases: number; // number of cases in the run
+  message?: string; // optional human-readable status text
+}
+
+/**
+ * Pick the sandbox Docker image for a case's language.
+ * Matching is case-insensitive; unknown languages get the Node.js image.
+ */
+function getImageForLanguage(language: string): string {
+  switch (language.toLowerCase()) {
+    // Node-family aliases share one image.
+    case 'javascript':
+    case 'typescript':
+    case 'node':
+      return RECOMMENDED_IMAGES.node.latest;
+    case 'python':
+      return RECOMMENDED_IMAGES.python.latest;
+    case 'go':
+    case 'golang':
+      return RECOMMENDED_IMAGES.go.latest;
+    case 'rust':
+      return RECOMMENDED_IMAGES.rust.latest;
+    case 'java':
+      return RECOMMENDED_IMAGES.java.latest;
+    default:
+      // Default to Node.js for unknown languages
+      return RECOMMENDED_IMAGES.node.latest;
+  }
+}
+
+/**
+ * Run cases sequentially into a RunResult; throws if Docker is unavailable, while a throwing case becomes an error result.
+ */
+export async function runCases(cases: Case[], options: RunnerOptions): Promise<RunResult> {
+  const runId = `run-${Date.now()}-${Math.random().toString(36).substring(2, 8)}`;
+  const startedAt = new Date();
+  const results: CaseResult[] = [];
+
+  // Fail fast: nothing can run without Docker, so this is the one error that is rethrown to the caller
+  const dockerStatus = await checkDocker();
+  if (!dockerStatus.available) {
+    throw new Error(`Docker is not available: ${dockerStatus.error}\n${dockerStatus.suggestion}`);
+  }
+
+  const manager = createSandboxManager();
+  let rubricId = 'default';
+
+  try {
+    for (let i = 0; i < cases.length; i++) {
+      const caseData = cases[i];
+
+      options.onProgress?.({
+        type: 'starting',
+        caseId: caseData.id,
+        caseIndex: i,
+        totalCases: cases.length,
+        message: `Starting ${caseData.title}`,
+      });
+
+      try {
+        const result = await runSingleCase(caseData, manager, options, i, cases.length);
+        results.push(result);
+        options.onCaseComplete?.(result);
+        // Report the first case's rubric as the run's rubricId (stays 'default' if that case throws)
+        if (i === 0) {
+          const registry = getRubricRegistry();
+          const rubric = registry.resolve(caseData.rubric);
+          rubricId = rubric.id;
+        }
+      } catch (err) {
+        const errorResult: CaseResult = {
+          id: caseData.id,
+          title: caseData.title,
+          score: 0,
+          passed: false,
+          evidence: (err as Error).message,
+          criteria: [],
+          evaluators: [],
+          durationMs: 0,
+          error: (err as Error).message,
+          timestamp: new Date(),
+        };
+        results.push(errorResult);
+        options.onCaseComplete?.(errorResult);
+      }
+    }
+  } finally {
+    // Clean up all sandboxes, even when the loop exits with an exception
+    await manager.destroyAll();
+  }
+
+  const completedAt = new Date();
+  const totalDurationMs = completedAt.getTime() - startedAt.getTime();
+
+  // Calculate summary (the average is 0 when there are no results)
+  const scores = results.map((r) => r.score);
+  const averageScore = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0;
+
+  const summary: RunSummary = {
+    total: results.length,
+    passed: results.filter((r) => r.passed).length,
+    failed: results.filter((r) => !r.passed && !r.error).length, // errored cases are not counted as failed
+    skipped: 0,
+    timedOut: results.filter((r) => r.timedOut).length,
+    averageScore,
+    totalDurationMs,
+  };
+
+  return {
+    id: runId,
+    timestamp: startedAt,
+    cases: results,
+    summary,
+    durationMs: totalDurationMs,
+    agent: options.agent,
+    rubricId,
+  };
+}
+
+/**
+ * Run one case: write its files to a temp dir, run the agent there, then score the result using a sandbox.
+ */
+async function runSingleCase(
+  caseData: Case,
+  manager: ReturnType<typeof createSandboxManager>,
+  options: RunnerOptions,
+  caseIndex: number,
+  totalCases: number
+): Promise<CaseResult> {
+  const startTime = Date.now();
+
+  // Create a temporary directory for this case (removed again in the outer finally)
+  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), `sniff-${caseData.id}-`));
+
+  try {
+    // Write case files to temp directory (if any)
+    if (caseData.files) {
+      for (const file of caseData.files) {
+        const filePath = path.join(tempDir, file.path);
+        const fileDir = path.dirname(filePath);
+
+        // Create directories if needed; entries without content only get their directory
+        fs.mkdirSync(fileDir, { recursive: true });
+        if (file.content !== undefined) {
+          fs.writeFileSync(filePath, file.content);
+        }
+      }
+    }
+
+    // Create sandbox with the temp dir as its workdir
+    const sandboxConfig: SandboxConfig = {
+      workdir: tempDir,
+      image: getImageForLanguage(caseData.language),
+      timeoutSeconds: options.timeoutSeconds || 300,
+      networkEnabled: options.networkEnabled || false,
+    };
+
+    options.onProgress?.({
+      type: 'running',
+      caseId: caseData.id,
+      caseIndex,
+      totalCases,
+      message: 'Creating sandbox...',
+    });
+
+    const sandbox = await manager.create(sandboxConfig);
+
+    try {
+      // Install dependencies if needed
+      await installDependencies(sandbox, caseData.language, options, caseIndex, totalCases, caseData.id);
+
+      // Run the agent to attempt to solve the case
+      options.onProgress?.({
+        type: 'running',
+        caseId: caseData.id,
+        caseIndex,
+        totalCases,
+        message: 'Running agent...',
+      });
+
+      const agent = getAgent(options.agent);
+      const _agentResult: AgentResult = await agent.run(caseData.prompt, {
+        cwd: tempDir,
+        model: options.model,
+        timeoutMs: (options.timeoutSeconds || 300) * 1000, // same per-case timeout, converted to ms
+        permissionMode: 'acceptEdits',
+      });
+
+      if (!_agentResult.success) {
+        throw new Error(`Agent execution failed: ${_agentResult.error}`);
+      }
+
+      // Snapshot files the agent produced (before rubric evaluation)
+      const _agentFiles = snapshotFiles(tempDir, caseData.files);
+
+      // Evaluate using the rubric
+      options.onProgress?.({
+        type: 'validating',
+        caseId: caseData.id,
+        caseIndex,
+        totalCases,
+        message: 'Evaluating with rubric...',
+      });
+
+      const result = await evaluateWithRubric(caseData, sandbox, options, _agentResult, _agentFiles);
+      const durationMs = Date.now() - startTime; // whole-case time; replaces the evaluation-only value in `result`
+
+      options.onProgress?.({
+        type: 'complete',
+        caseId: caseData.id,
+        caseIndex,
+        totalCases,
+        message: result.passed ? `Passed (${Math.round(result.score)}%)` : `Failed (${Math.round(result.score)}%)`,
+      });
+
+      return {
+        ...result,
+        agentResponse: _agentResult.answer,
+        agentToolCalls: _agentResult.toolCalls.map((t) => ({
+          name: t.name,
+          durationMs: t.durationMs || 0,
+          success: t.success || false,
+        })),
+        agentModel: _agentResult.model,
+        agentTokens: _agentResult.tokens
+          ? {
+              input: _agentResult.tokens.inputTokens,
+              output: _agentResult.tokens.outputTokens,
+              total: _agentResult.tokens.totalTokens,
+            }
+          : undefined,
+        agentFiles: _agentFiles,
+        durationMs,
+        timestamp: new Date(),
+      };
+    } finally {
+      await sandbox.destroy();
+    }
+  } finally {
+    // Clean up temp directory on success and on failure alike
+    try {
+      fs.rmSync(tempDir, { recursive: true, force: true });
+    } catch {
+      // Ignore cleanup errors so they cannot mask the case outcome
+    }
+  }
+}
+
+/**
+ * Score a case against its rubric by running every evaluator of every criterion.
+ * Only `command` evaluators run (inside the sandbox); every other type currently
+ * yields a failing placeholder. Durations are measured per evaluator and per
+ * criterion; the case-level duration spans the whole evaluation.
+ */
+async function evaluateWithRubric(
+  caseData: Case,
+  sandbox: Sandbox,
+  _options: RunnerOptions,
+  _agentResult: AgentResult,
+  _agentFiles: { path: string; content: string; changed: boolean }[]
+): Promise<CaseResult> {
+  const registry = getRubricRegistry();
+  const rubric = registry.resolve(caseData.rubric);
+
+  const criteriaResults: CriterionResult[] = [];
+  let totalWeightedScore = 0;
+  let _totalWeight = 0;
+  const evalStartTime = Date.now();
+
+  // Evaluate each criterion in the rubric
+  for (const [criterionKey, criterion] of Object.entries(rubric.criteria)) {
+    const evaluatorResults: EvaluatorResult[] = [];
+    let criterionScore = 0;
+    let evaluatorCount = 0;
+    const criterionStartTime = Date.now();
+
+    for (const evaluator of criterion.evaluators) {
+      const evaluatorStartTime = Date.now(); // per-evaluator timer, not time since the case began
+      let evalResult: Omit<EvaluatorResult, 'name' | 'type' | 'durationMs'>;
+
+      if (evaluator.type === 'command') {
+        // Run command evaluator
+        const result = await sandbox.exec(evaluator.run, {
+          timeoutSeconds: 60,
+        });
+
+        const passed = result.exitCode === 0;
+        let score = passed ? 1.0 : 0.0;
+
+        // Handle partial credit
+        if (evaluator.partialCredit && !passed) {
+          // For test runners, try to parse pass/fail ratio
+          const testMatch = result.stdout.match(/(\d+) passed/);
+          const failMatch = result.stdout.match(/(\d+) failed/);
+          if (testMatch && failMatch) {
+            const passedTests = parseInt(testMatch[1], 10);
+            const failedTests = parseInt(failMatch[1], 10);
+            const total = passedTests + failedTests;
+            if (total > 0) {
+              score = passedTests / total;
+            }
+          }
+        }
+
+        evalResult = {
+          passed,
+          score,
+          evidence: (result.stdout + '\n' + result.stderr).trim(),
+          details: {
+            exitCode: result.exitCode,
+            timedOut: result.timedOut,
+          },
+        };
+      } else if (evaluator.type === 'pattern') {
+        // Run pattern evaluator (check for matches in files)
+        // Default to fail until fully implemented
+        evalResult = {
+          passed: false,
+          score: 0.0,
+          evidence: 'Pattern check not yet implemented',
+        };
+      } else if ((evaluator.type as EvaluatorType) === 'llm_judge' || (evaluator.type as EvaluatorType) === 'llm_judge_comparison') {
+        // LLM judge evaluators are placeholders for now.
+        // TODO: Implement baseline answer storage and comparison
+        evalResult = {
+          passed: false,
+          score: 0.0,
+          evidence: 'LLM judge comparison not yet fully implemented',
+        };
+      } else {
+        // Other evaluator types (llm_judge, benchmark, etc.) - not implemented
+        evalResult = {
+          passed: false,
+          score: 0.0,
+          evidence: `Evaluator type '${evaluator.type}' not yet implemented`,
+        };
+      }
+      evaluatorResults.push({
+        name: evaluator.name || evaluator.type,
+        type: evaluator.type as EvaluatorType,
+        durationMs: Date.now() - evaluatorStartTime,
+        ...evalResult,
+      });
+
+      if (!evaluator.optional) {
+        criterionScore += evalResult.score;
+        evaluatorCount++;
+      }
+    }
+
+    // Average over non-optional evaluators; a criterion with none does not participate in scoring
+    const hasRequiredEvaluators = evaluatorCount > 0;
+    const rawScore = hasRequiredEvaluators ? criterionScore / evaluatorCount : 0.0;
+    const weightedScore = hasRequiredEvaluators ? (rawScore * criterion.weight) / 100 : 0;
+    const allPassed = evaluatorResults.every((e) => e.passed);
+
+    criteriaResults.push({
+      name: criterionKey,
+      weight: criterion.weight,
+      score: rawScore,
+      passed: allPassed,
+      evidence: `Criterion: ${criterionKey}`,
+      evaluatorResults,
+      durationMs: Date.now() - criterionStartTime,
+    });
+
+    totalWeightedScore += weightedScore;
+    // Only count weight for criteria that had non-optional evaluators
+    if (hasRequiredEvaluators) {
+      _totalWeight += criterion.weight;
+    }
+  }
+
+  // totalWeightedScore is a fraction of 1.0 when all weights sum to 100. Rescale by the
+  // participating weight so excluded criteria do not shrink the 0-100% range.
+  const participatingFraction = _totalWeight / 100;
+  const overallScore = participatingFraction > 0 ? (totalWeightedScore / participatingFraction) * 100 : 0;
+
+  // Determine pass/fail (default threshold: 70%)
+  const passThreshold = 70;
+  const passed = overallScore >= passThreshold;
+
+  return {
+    id: caseData.id,
+    title: caseData.title,
+    score: overallScore,
+    passed,
+    evidence: `Overall score: ${overallScore.toFixed(2)}%`,
+    criteria: criteriaResults,
+    evaluators: [],
+    durationMs: Date.now() - evalStartTime,
+    timestamp: new Date(),
+  };
+}
+
+/**
+ * Install a case's dependencies in the sandbox; install failures are warned about, never thrown.
+ */
+async function installDependencies(
+  sandbox: Sandbox,
+  language: string,
+  options: RunnerOptions,
+  caseIndex: number,
+  totalCases: number,
+  caseId: string
+): Promise<void> {
+  const langLower = language.toLowerCase();
+
+  options.onProgress?.({
+    type: 'running',
+    caseId,
+    caseIndex,
+    totalCases,
+    message: 'Installing dependencies...',
+  });
+
+  if (langLower === 'python') {
+    // `if ...; fi` exits 0 when the manifest is absent but keeps pip's exit code, so the warning can fire
+    const result = await sandbox.exec('if [ -f requirements.txt ]; then pip install -r requirements.txt; fi');
+    if (result.exitCode !== 0 && result.stderr) {
+      console.warn('Warning: pip install failed:', result.stderr);
+    }
+    // Also install pytest if running tests
+    await sandbox.exec('pip install pytest --quiet 2>/dev/null || true');
+  } else if (langLower === 'javascript' || langLower === 'typescript' || langLower === 'node') {
+    // Same pattern for package.json: a missing file is fine, a failing install is reported
+    const result = await sandbox.exec('if [ -f package.json ]; then npm install --silent; fi');
+    if (result.exitCode !== 0 && result.stderr) {
+      console.warn('Warning: npm install failed:', result.stderr);
+    }
+  } else if (langLower === 'go' || langLower === 'golang') {
+    // Check for go.mod
+    await sandbox.exec('test -f go.mod && go mod download || true');
+  }
+}
+
+/**
+ * Snapshot the files in the workspace after the agent runs.
+ * Compares against the original case files to flag which ones changed.
+ * Reads directly from the host tempDir (bind-mounted into the sandbox).
+ * Skips dependency/VCS/cache directories, files over 100KB, and binary files.
+ */
+function snapshotFiles(
+  tempDir: string,
+  originalFiles?: CaseFile[]
+): { path: string; content: string; changed: boolean }[] {
+  const results: { path: string; content: string; changed: boolean }[] = [];
+  const origMap = new Map<string, string>();
+  const skippedDirs = ['node_modules', '.git', '__pycache__', '.pytest_cache', 'venv', '.venv'];
+  const maxBytes = 100_000; // files larger than this are left out of the snapshot
+
+  // Key originals by normalized '/'-separated path so a path like './a.ts' still matches the walk.
+  for (const f of originalFiles ?? []) {
+    if (f.content !== undefined) {
+      origMap.set(path.normalize(f.path).split(path.sep).join('/'), f.content);
+    }
+  }
+
+  // Depth-first walk; `prefix` is the '/'-joined path of `dir` relative to tempDir.
+  function walk(dir: string, prefix: string) {
+    let entries: fs.Dirent[];
+    try {
+      entries = fs.readdirSync(dir, { withFileTypes: true });
+    } catch {
+      return;
+    }
+    for (const entry of entries) {
+      const relPath = prefix ? `${prefix}/${entry.name}` : entry.name;
+      const fullPath = path.join(dir, entry.name);
+
+      if (entry.isDirectory()) {
+        // Skip common non-essential directories
+        if (!skippedDirs.includes(entry.name)) {
+          walk(fullPath, relPath);
+        }
+        continue;
+      }
+
+      if (!entry.isFile()) continue;
+
+      try {
+        // Skip large files
+        if (fs.statSync(fullPath).size > maxBytes) continue;
+
+        // Decoding as UTF-8 never throws on binary data, so a NUL byte is
+        // used as the binary marker instead of relying on a read error.
+        const raw = fs.readFileSync(fullPath);
+        if (raw.includes(0)) continue;
+
+        const content = raw.toString('utf-8');
+        const original = origMap.get(relPath);
+        // A file with no original counterpart counts as changed.
+        results.push({ path: relPath, content, changed: original !== content });
+      } catch {
+        // File vanished or is unreadable: leave it out of the snapshot.
+      }
+    }
+  }
+
+  walk(tempDir, '');
+  return results;
+}
diff --git a/src/evaluation/runner.ts.orig b/src/evaluation/runner.ts.orig
new file mode 100644
index 0000000..b4c67c6
--- /dev/null
+++ b/src/evaluation/runner.ts.orig
@@ -0,0 +1,566 @@
+/**
+ * Evaluation runner - executes cases in sandboxes and evaluates results
+ *
+ * This is the core evaluation engine that:
+ * 1. Sets up the sandbox environment
+ * 2. Runs the case (agent attempts to solve the problem)
+ * 3. Applies the rubric to evaluate the result
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import {
+  Case,
+  CaseFile,
+  CaseResult,
+  CriterionResult,
+  EvaluatorResult,
+  RunResult,
+  RunSummary,
+  EvaluatorType,
+} from '../cases/types';
+import { createSandboxManager, checkDocker, RECOMMENDED_IMAGES } from '../sandbox';
+import { Sandbox, SandboxConfig } from '../sandbox/types';
+import { getRubricRegistry } from '../rubrics/loader';
+import { getAgent } from '../agents/registry';
+import { runLLMJudgeEvaluator } from './llm-judge';
+import type { AgentResult } from '../agents/types';
+
+export interface RunnerOptions { // settings applied to every case in a run
+  /** Agent name: passed to getAgent() to select the agent, and copied onto the RunResult */
+  agent: string;
+
+  /** Model to use (passed to agent) */
+  model?: string;
+
+  /** Timeout per case in seconds; a missing or 0 value falls back to 300 */
+  timeoutSeconds?: number;
+
+  /** Enable network in sandbox; treated as false when omitted */
+  networkEnabled?: boolean;
+
+  /** Callback for progress updates */
+  onProgress?: (update: ProgressUpdate) => void;
+
+  /** Callback invoked with each case's result, including cases that ended in an error */
+  onCaseComplete?: (result: CaseResult) => void;
+}
+
+export interface ProgressUpdate { // payload handed to RunnerOptions.onProgress
+  type: 'starting' | 'running' | 'validating' | 'complete' | 'error'; // phase being reported
+  caseId: string; // id of the case the update refers to
+  caseIndex: number; // zero-based position of the case within the run
+  totalCases: number; // number of cases in the run
+  message?: string; // optional human-readable status text
+}
+
+/**
+ * Pick the Docker image for a case language (matched case-insensitively; 'node' and 'golang' aliases are accepted)
+ */
+function getImageForLanguage(language: string): string {
+  const langLower = language.toLowerCase();
+
+  if (langLower === 'javascript' || langLower === 'typescript' || langLower === 'node') { // JS and TS share the Node image
+    return RECOMMENDED_IMAGES.node.latest;
+  }
+  if (langLower === 'python') {
+    return RECOMMENDED_IMAGES.python.latest;
+  }
+  if (langLower === 'go' || langLower === 'golang') {
+    return RECOMMENDED_IMAGES.go.latest;
+  }
+  if (langLower === 'rust') {
+    return RECOMMENDED_IMAGES.rust.latest;
+  }
+  if (langLower === 'java') {
+    return RECOMMENDED_IMAGES.java.latest;
+  }
+
+  // Default to Node.js for unknown languages
+  return RECOMMENDED_IMAGES.node.latest;
+}
+
+/** Run a set of cases sequentially and return the results together with a summary.
+ * Throws before running anything if Docker is unavailable; a case that throws is recorded as an errored result.
+ */
+export async function runCases(cases: Case[], options: RunnerOptions): Promise<RunResult> {
+  const runId = `run-${Date.now()}-${Math.random().toString(36).substring(2, 8)}`;
+  const startedAt = new Date();
+  const results: CaseResult[] = [];
+
+  // Check Docker availability first
+  const dockerStatus = await checkDocker();
+  if (!dockerStatus.available) {
+    throw new Error(`Docker is not available: ${dockerStatus.error}\n${dockerStatus.suggestion}`);
+  }
+
+  const manager = createSandboxManager();
+  let rubricId = 'default'; // kept unless the first case completes without throwing
+
+  try {
+    for (let i = 0; i < cases.length; i++) {
+      const caseData = cases[i];
+
+      options.onProgress?.({
+        type: 'starting',
+        caseId: caseData.id,
+        caseIndex: i,
+        totalCases: cases.length,
+        message: `Starting ${caseData.title}`,
+      });
+
+      try {
+        const result = await runSingleCase(caseData, manager, options, i, cases.length);
+        results.push(result);
+        options.onCaseComplete?.(result);
+        // Rubric id comes from the first case only. NOTE(review): a throw here lands in the catch after the push above, recording the case twice — confirm
+        if (i === 0) {
+          const registry = getRubricRegistry();
+          const rubric = registry.resolve(caseData.rubric);
+          rubricId = rubric.id;
+        }
+      } catch (err) { // record the failure for this case and move on to the next one
+        const errorResult: CaseResult = {
+          id: caseData.id,
+          title: caseData.title,
+          score: 0,
+          passed: false,
+          evidence: (err as Error).message, // assumes the thrown value is an Error — TODO confirm
+          criteria: [],
+          evaluators: [],
+          durationMs: 0, // elapsed time is not measured for errored cases
+          error: (err as Error).message,
+          timestamp: new Date(),
+        };
+        results.push(errorResult);
+        options.onCaseComplete?.(errorResult);
+      }
+    }
+  } finally {
+    // Tear down every sandbox, even if the loop above throws
+    await manager.destroyAll();
+  }
+
+  const completedAt = new Date();
+  const totalDurationMs = completedAt.getTime() - startedAt.getTime();
+
+  // Summarize: the average covers every result, so errored cases contribute a score of 0
+  const scores = results.map((r) => r.score);
+  const averageScore = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0;
+
+  const summary: RunSummary = {
+    total: results.length,
+    passed: results.filter((r) => r.passed).length,
+    failed: results.filter((r) => !r.passed && !r.error).length, // errored cases are not counted as failed
+    skipped: 0,
+    timedOut: results.filter((r) => r.timedOut).length,
+    averageScore,
+    totalDurationMs,
+  };
+
+  return {
+    id: runId,
+    timestamp: startedAt,
+    cases: results,
+    summary,
+    durationMs: totalDurationMs,
+    agent: options.agent,
+    rubricId,
+  };
+}
+
+/** Run a single case in a sandbox: write its files, run the agent, snapshot the workspace, apply the rubric.
+ * Throws when the agent reports failure; the sandbox and the temp directory are cleaned up in every outcome.
+ */
+async function runSingleCase(
+  caseData: Case,
+  manager: ReturnType<typeof createSandboxManager>,
+  options: RunnerOptions,
+  caseIndex: number,
+  totalCases: number
+): Promise<CaseResult> {
+  const startTime = Date.now();
+
+  // Per-case scratch directory on the host; used as both the sandbox workdir and the agent cwd
+  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), `sniff-${caseData.id}-`));
+
+  try {
+    // Write case files to temp directory (if any)
+    if (caseData.files) {
+      for (const file of caseData.files) {
+        const filePath = path.join(tempDir, file.path);
+        const fileDir = path.dirname(filePath);
+
+        // Create parent directories; a file entry without content gets only its directory
+        fs.mkdirSync(fileDir, { recursive: true });
+        if (file.content !== undefined) {
+          fs.writeFileSync(filePath, file.content);
+        }
+      }
+    }
+
+    // Create sandbox
+    const sandboxConfig: SandboxConfig = {
+      workdir: tempDir,
+      image: getImageForLanguage(caseData.language),
+      timeoutSeconds: options.timeoutSeconds || 300, // missing or 0 falls back to 300 seconds
+      networkEnabled: options.networkEnabled || false,
+    };
+
+    options.onProgress?.({
+      type: 'running',
+      caseId: caseData.id,
+      caseIndex,
+      totalCases,
+      message: 'Creating sandbox...',
+    });
+
+    const sandbox = await manager.create(sandboxConfig);
+
+    try {
+      // Install dependencies if needed
+      await installDependencies(sandbox, caseData.language, options, caseIndex, totalCases, caseData.id);
+
+      // Run the agent to attempt to solve the case
+      options.onProgress?.({
+        type: 'running',
+        caseId: caseData.id,
+        caseIndex,
+        totalCases,
+        message: 'Running agent...',
+      });
+
+      const agent = getAgent(options.agent);
+      const agentResult: AgentResult = await agent.run(caseData.prompt, {
+        cwd: tempDir,
+        model: options.model,
+        timeoutMs: (options.timeoutSeconds || 300) * 1000, // same timeout as the sandbox, in milliseconds
+        permissionMode: 'acceptEdits',
+      });
+
+      if (!agentResult.success) { // an unsuccessful agent run aborts the case; no rubric evaluation happens
+        throw new Error(`Agent execution failed: ${agentResult.error}`);
+      }
+
+      // Snapshot files the agent produced (before rubric evaluation)
+      const agentFiles = snapshotFiles(tempDir, caseData.files);
+
+      // Evaluate using the rubric
+      options.onProgress?.({
+        type: 'validating',
+        caseId: caseData.id,
+        caseIndex,
+        totalCases,
+        message: 'Evaluating with rubric...',
+      });
+
+      const result = await evaluateWithRubric(caseData, sandbox, options, agentResult, agentFiles);
+      const durationMs = Date.now() - startTime;
+
+      options.onProgress?.({
+        type: 'complete',
+        caseId: caseData.id,
+        caseIndex,
+        totalCases,
+        message: result.passed ? `Passed (${Math.round(result.score)}%)` : `Failed (${Math.round(result.score)}%)`,
+      });
+
+      return {
+        ...result, // rubric outcome; durationMs and timestamp below replace the values it carries
+        agentResponse: agentResult.answer,
+        agentToolCalls: agentResult.toolCalls.map((t) => ({
+          name: t.name,
+          durationMs: t.durationMs || 0, // missing duration is reported as 0
+          success: t.success || false, // missing flag is reported as false
+        })),
+        agentModel: agentResult.model,
+        agentTokens: agentResult.tokens
+          ? {
+              input: agentResult.tokens.inputTokens,
+              output: agentResult.tokens.outputTokens,
+              total: agentResult.tokens.totalTokens,
+            }
+          : undefined,
+        agentFiles,
+        durationMs,
+        timestamp: new Date(),
+      };
+    } finally { // runs on success and on any throw above
+      await sandbox.destroy();
+    }
+  } finally {
+    // Always remove the temp directory, whatever the outcome of the case
+    try {
+      fs.rmSync(tempDir, { recursive: true, force: true });
+    } catch {
+      // Ignore cleanup errors
+    }
+  }
+}
+
+/** Evaluate a case using its rubric.
+ * Non-optional evaluator scores are averaged per criterion and combined by criterion weight; criteria with no
+ * non-optional evaluators are excluded. The case passes when the overall score reaches 70%. */
+async function evaluateWithRubric(
+  caseData: Case,
+  sandbox: Sandbox,
+  _options: RunnerOptions,
+  agentResult: AgentResult,
+  agentFiles: { path: string; content: string; changed: boolean }[]
+): Promise<CaseResult> {
+  const evaluationStartTime = Date.now(); // function-scoped so the returned durationMs can use it
+  const rubric = getRubricRegistry().resolve(caseData.rubric);
+
+  const criteriaResults: CriterionResult[] = [];
+  let totalWeightedScore = 0;
+  let participatingWeight = 0;
+
+  // Evaluate each criterion in the rubric
+  for (const [criterionKey, criterion] of Object.entries(rubric.criteria)) {
+    const criterionStartTime = Date.now(); // per-criterion clock; evaluator timings stay inside the inner loop
+    const evaluatorResults: EvaluatorResult[] = [];
+    let criterionScore = 0;
+    let evaluatorCount = 0;
+
+    for (const evaluator of criterion.evaluators) {
+      const evalStartTime = Date.now();
+      let evalResult: Omit<EvaluatorResult, 'name' | 'type' | 'durationMs'>;
+
+      if (evaluator.type === 'command') {
+        // Run command evaluator
+        const result = await sandbox.exec(evaluator.run, {
+          timeoutSeconds: 60,
+        });
+
+        const passed = result.exitCode === 0;
+        let score = passed ? 1.0 : 0.0;
+
+        // Handle partial credit
+        if (evaluator.partialCredit && !passed) {
+          // For test runners, try to parse pass/fail ratio
+          const testMatch = result.stdout.match(/(\d+) passed/);
+          const failMatch = result.stdout.match(/(\d+) failed/);
+          if (testMatch && failMatch) {
+            const passedTests = parseInt(testMatch[1], 10);
+            const failedTests = parseInt(failMatch[1], 10);
+            const total = passedTests + failedTests;
+            if (total > 0) {
+              score = passedTests / total;
+            }
+          }
+        }
+
+        evalResult = {
+          passed,
+          score,
+          evidence: (result.stdout + '\n' + result.stderr).trim(),
+          details: {
+            exitCode: result.exitCode,
+            timedOut: result.timedOut,
+          },
+        };
+      } else if (evaluator.type === 'pattern') {
+        // Run pattern evaluator (check for matches in files)
+        // Default to fail until fully implemented
+        evalResult = {
+          passed: false,
+          score: 0.0,
+          evidence: 'Pattern check not yet implemented',
+        };
+      } else if (evaluator.type === 'llm_judge') {
+        // Run LLM judge evaluator
+        const result = await runLLMJudgeEvaluator(evaluator, agentResult.answer, JSON.stringify(agentFiles));
+        evalResult = {
+          passed: result.passed,
+          score: result.score,
+          evidence: result.evidence,
+          details: result.details,
+        };
+      } else if ((evaluator.type as any) === 'llm_judge_comparison') {
+        // LLM judge comparison evaluator: placeholder that always fails for now
+        // TODO: Implement baseline answer storage and comparison
+        evalResult = {
+          passed: false,
+          score: 0.0,
+          evidence: 'LLM judge comparison not yet fully implemented',
+        };
+      } else {
+        // Any other evaluator type (e.g. benchmark) is not implemented: score it as a failure
+        evalResult = {
+          passed: false,
+          score: 0.0,
+          evidence: `Evaluator type '${evaluator.type}' not yet implemented`,
+        };
+      }
+
+      const evalDurationMs = Date.now() - evalStartTime;
+
+      evaluatorResults.push({
+        name: evaluator.name || evaluator.type,
+        type: evaluator.type as EvaluatorType,
+        durationMs: evalDurationMs,
+        ...evalResult,
+      });
+
+      if (!evaluator.optional) {
+        criterionScore += evalResult.score;
+        evaluatorCount++;
+      }
+    }
+
+    // Average score for this criterion
+    // If no non-optional evaluators ran, this criterion doesn't participate in scoring
+    const hasRequiredEvaluators = evaluatorCount > 0;
+    const rawScore = hasRequiredEvaluators ? criterionScore / evaluatorCount : 0.0;
+    const weightedScore = hasRequiredEvaluators ? (rawScore * criterion.weight) / 100 : 0;
+    const allPassed = evaluatorResults.filter((e) => !e.passed).length === 0;
+
+    criteriaResults.push({
+      name: criterionKey,
+      weight: criterion.weight,
+      score: rawScore,
+      passed: allPassed,
+      evidence: `Criterion: ${criterionKey}`,
+      evaluatorResults,
+      durationMs: Date.now() - criterionStartTime,
+    });
+
+    totalWeightedScore += weightedScore;
+    // Only count weight for criteria that had non-optional evaluators
+    if (hasRequiredEvaluators) {
+      participatingWeight += criterion.weight;
+    }
+  }
+
+  // Normalize score by participating weight (criteria with only optional evaluators are excluded)
+  // Each criterion's weightedScore = rawScore * weight / 100, so totalWeightedScore
+  // is a fraction of 1.0 when all weights sum to 100. When some criteria are excluded,
+  // rescale so the participating criteria fill the full 0-100% range.
+  const participatingFraction = participatingWeight / 100;
+  const overallScore = participatingFraction > 0 ? (totalWeightedScore / participatingFraction) * 100 : 0;
+
+  // Determine pass/fail (default threshold: 70%)
+  const passThreshold = 70;
+  const passed = overallScore >= passThreshold;
+
+  return {
+    id: caseData.id,
+    title: caseData.title,
+    score: overallScore,
+    passed,
+    evidence: `Overall score: ${overallScore.toFixed(2)}%`,
+    criteria: criteriaResults,
+    evaluators: [],
+    durationMs: Date.now() - evaluationStartTime,
+    timestamp: new Date(),
+  };
+}
+
+/** Install dependencies based on language (best effort).
+ * Handles python, javascript/typescript/node and go/golang; any other language only emits the progress update.
+ */
+async function installDependencies(
+  sandbox: Sandbox,
+  language: string,
+  options: RunnerOptions,
+  caseIndex: number,
+  totalCases: number,
+  caseId: string
+): Promise<void> {
+  const langLower = language.toLowerCase();
+
+  options.onProgress?.({
+    type: 'running',
+    caseId,
+    caseIndex,
+    totalCases,
+    message: 'Installing dependencies...',
+  });
+
+  if (langLower === 'python') {
+    // Install requirements.txt when present. NOTE(review): the trailing `|| true` looks like it forces exit code 0, so the warning below may be unreachable — confirm
+    const result = await sandbox.exec('test -f requirements.txt && pip install -r requirements.txt || true');
+    if (result.exitCode !== 0 && result.stderr) {
+      console.warn('Warning: pip install failed:', result.stderr);
+    }
+    // Install pytest unconditionally; stderr is discarded and a failure is ignored
+    await sandbox.exec('pip install pytest --quiet 2>/dev/null || true');
+  } else if (langLower === 'javascript' || langLower === 'typescript' || langLower === 'node') {
+    // Run npm install when package.json is present. NOTE(review): same `|| true` caveat applies to the warning below — confirm
+    const result = await sandbox.exec('test -f package.json && npm install --silent || true');
+    if (result.exitCode !== 0 && result.stderr) {
+      console.warn('Warning: npm install failed:', result.stderr);
+    }
+  } else if (langLower === 'go' || langLower === 'golang') {
+    // Download Go modules when go.mod is present; the result is not inspected
+    await sandbox.exec('test -f go.mod && go mod download || true');
+  }
+}
+
+/**
+ * Snapshot the text of every file under tempDir (ignored directories and files over 100,000 bytes are skipped).
+ * `changed` is true when the content differs from the original case file or the path was not among the originals.
+ * Reads directly from the host tempDir (bind-mounted into the sandbox).
+ */
+function snapshotFiles(
+  tempDir: string,
+  originalFiles?: CaseFile[]
+): { path: string; content: string; changed: boolean }[] {
+  const results: { path: string; content: string; changed: boolean }[] = [];
+  const origMap = new Map<string, string>();
+
+  // Index original contents by f.path (verbatim); case files without content are left out
+  if (originalFiles) {
+    for (const f of originalFiles) {
+      if (f.content !== undefined) {
+        origMap.set(f.path, f.content);
+      }
+    }
+  }
+
+  // Recursively collect files; `prefix` is the '/'-joined path of `dir` relative to tempDir
+  function walk(dir: string, prefix: string) {
+    let entries: fs.Dirent[];
+    try {
+      entries = fs.readdirSync(dir, { withFileTypes: true });
+    } catch { // directory could not be listed: it contributes no entries
+      return;
+    }
+    for (const entry of entries) {
+      const relPath = prefix ? `${prefix}/${entry.name}` : entry.name;
+      const fullPath = path.join(dir, entry.name);
+
+      // Do not descend into these directories (dependencies, VCS data, caches, virtualenvs)
+      if (entry.isDirectory()) {
+        if (['node_modules', '.git', '__pycache__', '.pytest_cache', 'venv', '.venv'].includes(entry.name)) {
+          continue;
+        }
+        walk(fullPath, relPath);
+        continue;
+      }
+
+      if (!entry.isFile()) continue; // entries that are neither directories nor files are ignored
+
+      // Skip large files. NOTE(review): only the size is checked; nothing here detects binary content
+      try {
+        const stat = fs.statSync(fullPath);
+        if (stat.size > 100_000) continue; // Skip files over 100,000 bytes
+      } catch { // stat failed: leave the file out
+        continue;
+      }
+
+      try {
+        const content = fs.readFileSync(fullPath, 'utf-8');
+        const original = origMap.get(relPath);
+        const changed = original === undefined || original !== content; // paths missing from origMap count as changed
+        results.push({ path: relPath, content, changed });
+      } catch {
+        // Read failed: leave the file out. NOTE(review): 'utf-8' decoding presumably does not throw on invalid bytes — confirm
+      }
+    }
+  }
+
+  walk(tempDir, '');
+  return results;
+}