diff --git a/HANDOFF.md b/HANDOFF.md
new file mode 100644
index 0000000..0502097
--- /dev/null
+++ b/HANDOFF.md
@@ -0,0 +1,72 @@
+# Handoff Notes
+
+## Current state
+
+The `add-glm-agent` branch (PR #48) adds an opencode agent integration to sniffbench. It works end-to-end: spawns an opencode server, sends prompts via the SDK, streams events via SSE, captures tool calls and file snapshots, evaluates with the rubric, and saves results.
+
+**To test:**
+```bash
+pnpm run build
+npx sniff run --agent opencode --cases bootstrap-005
+npx sniff closed-issues run --agent opencode --local
+```
+
+## Test results (GLM-4.7-4bit via opencode)
+
+### Bootstrap cases
+
+| Case | Score | Verdict | Notes |
+|------|-------|---------|-------|
+| bootstrap-003 (Python Unit Tests) | 100% | **False pass** | Starter code already passes all tests — no-op |
+| bootstrap-004 (Palindrome Checker) | 0% | Fail | Agent edited file but tests failed. JS test runner (`node *.test.js`) was missing from rubric command chain — now fixed |
+| bootstrap-005 (Binary Search) | 100% | **Legit pass** | Agent implemented full binary search from `pass` stub |
+| bootstrap-006 (Refactor Bad Code) | 0% | Fail | Agent refactored and renamed function `c`, breaking `require('./shipping_calculator').c` in test file. Also hit missing JS test runner issue |
+| bootstrap-007 (CSV Parser) | 100% | **Legit pass** | Agent implemented CSV parser from empty `return []` stub |
+| simple-001 (Simple math) | 0% | Fail | Starter code already passes, so this should be a free win. Failed only because the JS test runner was missing from the rubric |
+| fail-001 (Intentionally failing) | 0% | Expected fail | Case exists to verify failure reporting works |
+
+**Key fix:** Added `node *.test.js` to the test command chain in `defaults.ts`. All JS cases were failing because the rubric only tried `npm test` (no package.json) then fell through to Python/Go runners.
+
+### Closed-issues cases (real GitHub issues)
+
+| Case | Score | Notes |
+|------|-------|-------|
+| #12 (Add --compare flag) | 10/100 | 0 files changed. Agent explored but produced no edits. Hard task: 363 additions across 2 files |
+| #38 (Split variant/variants) | 10/100 | Created new `variants.ts` instead of modifying existing files. Understood intent but wrong approach |
+
+These are significantly harder than the bootstrap cases: they require understanding a real codebase and making coordinated multi-file changes, which is likely beyond the capability of a 4-bit quantized local model.
+
+## What's next
+
+### 1. Bootstrap test cases need work
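+- **bootstrap-003 (python-unit-test):** Recorded as a no-op — the starter code passed all tests in the run above. The starter must be actually buggy (e.g., `text.split(' ')` instead of `text.split()`); `cases/bootstrap/python-unit-test.yaml` in this branch already uses `text.split(' ')`, so re-run the case to confirm the starter now fails before the agent's fix.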
+- **bootstrap-006 (refactor):** Test file imports `c` by name. If the agent renames it (the whole point of refactoring), tests break. Either update tests to import by new name, or make the test more flexible.
+
+### 2. Re-run JS cases
+After the `node *.test.js` fix, bootstrap-004 and simple-001 should be re-run to get accurate scores.
+
+### 3. CodeRabbit review items
+- Remove unused `randombytes` dependency from package.json
+- Remove redundant `allowSyntheticDefaultImports` from tsconfig.json
+- Remove redundant `"dist/**/*"` from tsconfig exclude
+
+### 4. Hardcoded model config
+The local-glm provider config (baseURL, model path, API key) is hardcoded in `src/agents/opencode.ts` constructor default. Should be externalized — read from opencode config file or a sniffbench config file.
+
+### 5. Comprehension cases
+The 12 comp-* cases reference a `comprehension` rubric that doesn't exist yet. These are Q&A tasks that need LLM-judge or human evaluation, not test suites.
+
+## Key technical details
+
+- **ESM wrapper:** The `@opencode-ai/sdk` is ESM-only but the project is CommonJS. The `.mjs` wrapper in `src/agents/opencode-sdk.mjs` bridges this. tsc doesn't copy `.mjs` files, so the build script includes a manual `cp` step.
+- **Port management:** The module-level `nextPort` counter in `src/agents/opencode.ts` starts at 4097 and is incremented for every spawned server to avoid collisions between runs. It resets to 4097 on process restart.
+- **Server lifecycle:** Each agent run spawns a real opencode server process on a unique port, with the case's temp dir as cwd. The server is killed in the `finally` block.
+- **Event streaming:** Uses `client.event.subscribe()` (SSE) + `session.promptAsync()`. The stream object is at `sseResult.stream` (not `.data`). Events arrive as `message.part.updated` (tool calls, text, reasoning), `message.updated` (final tokens/cost), `session.status` (idle = done).
+- **File snapshots:** After agent runs, `snapshotFiles()` in runner.ts walks the host tempDir and captures all files with a `changed` flag vs originals. Skips node_modules, .git, __pycache__, files >100KB.
+- **Agent config:** `DEFAULT_AGENT` constant in `src/agents/registry.ts` is used across all CLI commands. No more hardcoded `'claude-code'` strings.
+- **Closed-issues runner:** Now routes through agent wrappers (supports `--agent` and `--model` flags) instead of shelling out directly to `claude` CLI.
+
+## Git conventions
+- Do NOT include `Co-Authored-By` or Anthropic email in commits
+- Include model version (e.g., `claude-opus-4-6`) in commit body if desired
+- Never amend commits — always create new ones
diff --git a/cases/bootstrap/binary-search.yaml b/cases/bootstrap/binary-search.yaml
new file mode 100644
index 0000000..1f53899
--- /dev/null
+++ b/cases/bootstrap/binary-search.yaml
@@ -0,0 +1,91 @@
+id: bootstrap-005
+title: "Binary Search Implementation"
+prompt: |
+  Complete the binary_search function implementation. The function
+  should find the index of a target value in a sorted array, or
+  return -1 if not found.
+
+  Binary search must:
+  - Run in O(log n) time complexity
+  - Handle empty arrays
+  - Handle values not present in the array
+  - Work with any comparable values
+
+  Run: python binary_search.test.py
+  Make all tests pass.
+
+source: bootstrap
+category: codefix
+language: python
+difficulty: medium
+
+tags:
+  - python
+  - algorithms
+  - binary-search
+
+files:
+  - path: binary_search.py
+    content: |
+      def binary_search(arr, target):
+          """
+          Perform binary search on a sorted array.
+
+          Args:
+              arr: Sorted list of comparable elements
+              target: Value to search for
+
+          Returns:
+              Index of target if found, -1 otherwise
+
+          Time complexity: O(log n)
+          """
+          # TODO: Implement binary search
+          pass
+
+  - path: binary_search.test.py
+    content: |
+      import unittest
+      from binary_search import binary_search
+
+      class TestBinarySearch(unittest.TestCase):
+
+          def test_found_elements(self):
+              arr = [1, 3, 5, 7, 9, 11, 13, 15]
+              self.assertEqual(binary_search(arr, 7), 3)
+              self.assertEqual(binary_search(arr, 1), 0)
+              self.assertEqual(binary_search(arr, 15), 7)
+              self.assertEqual(binary_search(arr, 9), 4)
+
+          def test_not_found(self):
+              arr = [1, 3, 5, 7, 9]
+              self.assertEqual(binary_search(arr, 2), -1)
+              self.assertEqual(binary_search(arr, 6), -1)
+              self.assertEqual(binary_search(arr, 10), -1)
+
+          def test_empty_array(self):
+              self.assertEqual(binary_search([], 5), -1)
+
+          def test_single_element(self):
+              arr = [42]
+              self.assertEqual(binary_search(arr, 42), 0)
+              self.assertEqual(binary_search(arr, 0), -1)
+
+          def test_two_elements(self):
+              arr = [1, 2]
+              self.assertEqual(binary_search(arr, 1), 0)
+              self.assertEqual(binary_search(arr, 2), 1)
+
+          def test_strings(self):
+              arr = ['apple', 'banana', 'cherry', 'date']
+              self.assertEqual(binary_search(arr, 'cherry'), 2)
+              self.assertEqual(binary_search(arr, 'grape'), -1)
+
+          def test_large_array(self):
+              arr = list(range(1000))
+              self.assertEqual(binary_search(arr, 42), 42)
+              self.assertEqual(binary_search(arr, 999), 999)
+              self.assertEqual(binary_search(arr, 1000), -1)
+
+      if __name__ == '__main__':
+          unittest.main()
\ No newline at end of file
diff --git a/cases/bootstrap/csv-parser.yaml b/cases/bootstrap/csv-parser.yaml
new file mode 100644
index 0000000..e71369a
--- /dev/null
+++ b/cases/bootstrap/csv-parser.yaml
@@ -0,0 +1,125 @@
+id: bootstrap-007
+title: "File Processing - CSV Parser"
+prompt: |
+  Implement a CSV parser that can read and parse a CSV file.
+  The implementation should handle:
+  - Basic comma-separated values
+  - Quoted fields containing commas
+  - Header row extraction
+  - Converting to array of objects
+
+  Run: python csv_parser.test.py
+  Make all tests pass.
+
+source: bootstrap
+category: codefix
+language: python
+difficulty: medium
+
+tags:
+  - python
+  - file-processing
+  - csv
+
+files:
+  - path: csv_parser.py
+    content: |
+      import csv
+
+      def parse_csv(filepath, has_header=True):
+          """
+          Parse a CSV file and return data as list of dicts (or lists).
+
+          Args:
+              filepath: Path to the CSV file
+              has_header: Whether the first row is a header row
+
+          Returns:
+              List of dictionaries (if has_header=True) or list of lists
+          """
+          # TODO: Implement this function
+          return []
+
+  - path: csv_parser.test.py
+    content: |
+      import unittest
+      import os
+      import tempfile
+      import textwrap
+      from csv_parser import parse_csv
+
+      class TestCSVParser(unittest.TestCase):
+
+          def test_simple_csv_with_header(self):
+              data = textwrap.dedent('''\
+                  name,age,city
+                  Alice,30,New York
+                  Bob,25,Los Angeles
+                  Charlie,35,Chicago''')
+
+              with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
+                  f.write(data)
+                  f.flush()
+
+                  result = parse_csv(f.name, has_header=True)
+
+                  self.assertEqual(len(result), 3)
+                  self.assertEqual(result[0]['name'], 'Alice')
+                  self.assertEqual(result[0]['age'], '30')
+                  self.assertEqual(result[1]['city'], 'Los Angeles')
+
+              os.unlink(f.name)
+
+          def test_csv_without_header(self):
+              data = textwrap.dedent('''\
+                  Alice,30,New York
+                  Bob,25,Los Angeles''')
+
+              with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
+                  f.write(data)
+                  f.flush()
+
+                  result = parse_csv(f.name, has_header=False)
+
+                  self.assertEqual(len(result), 2)
+                  self.assertEqual(result[0][0], 'Alice')
+                  self.assertEqual(result[1][2], 'Los Angeles')
+
+              os.unlink(f.name)
+
+          def test_quoted_fields(self):
+              data = textwrap.dedent('''\
+                  product,price,description
+                  Widget,10.00,"A widget, really."
+                  Gadget,15.00,"A device, good."''')
+
+              with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
+                  f.write(data)
+                  f.flush()
+
+                  result = parse_csv(f.name, has_header=True)
+
+                  self.assertEqual(len(result), 2)
+                  self.assertEqual(result[0]['description'], 'A widget, really.')
+                  self.assertEqual(result[1]['description'], 'A device, good.')
+
+              os.unlink(f.name)
+
+          def test_single_row(self):
+              data = textwrap.dedent('''\
+                  name,value
+                  test,123''')
+
+              with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
+                  f.write(data)
+                  f.flush()
+
+                  result = parse_csv(f.name, has_header=True)
+
+                  self.assertEqual(len(result), 1)
+                  self.assertEqual(result[0]['name'], 'test')
+
+              os.unlink(f.name)
+
+      if __name__ == '__main__':
+          unittest.main()
\ No newline at end of file
diff --git a/cases/bootstrap/palindrome-checker.yaml b/cases/bootstrap/palindrome-checker.yaml
new file mode 100644
index 0000000..ab47218
--- /dev/null
+++ b/cases/bootstrap/palindrome-checker.yaml
@@ -0,0 +1,97 @@
+id: bootstrap-004
+title: "Palindrome Checker"
+prompt: |
+  Implement a palindrome checker that works correctly across
+  different edge cases. The tests are already written - you need
+  to make them all pass.
+
+  A palindrome reads the same forwards and backwards.
+  You should:
+  - Ignore case
+  - Ignore non-alphanumeric characters
+  - Handle empty strings as valid palindromes
+
+  Run: node palindrome.test.js
+  Fix the implementation until all tests pass.
+
+source: bootstrap
+category: codefix
+language: javascript
+difficulty: easy
+
+tags:
+  - javascript
+  - algorithms
+  - string-manipulation
+
+files:
+  - path: palindrome.js
+    content: |
+      function isPalindrome(str) {
+        // TODO: Implement properly
+        return str === str.split('').reverse().join('');
+      }
+
+      module.exports = { isPalindrome };
+
+  - path: palindrome.test.js
+    content: |
+      const { isPalindrome } = require('./palindrome');
+
+      function test(name, fn) {
+        try {
+          fn();
+          console.log(`✓ ${name}`);
+        } catch (err) {
+          console.log(`✗ ${name}: ${err.message}`);
+          process.exit(1);
+        }
+      }
+
+      function assertEqual(actual, expected, message) {
+        if (actual !== expected) {
+          throw new Error(message || `Expected ${expected}, got ${actual}`);
+        }
+      }
+
+      // Basic palindromes
+      test('racecar is palindrome', () => {
+        assertEqual(isPalindrome('racecar'), true);
+      });
+
+      test('hello is not palindrome', () => {
+        assertEqual(isPalindrome('hello'), false);
+      });
+
+      // Case insensitive
+      test('RaceCar is palindrome', () => {
+        assertEqual(isPalindrome('RaceCar'), true);
+      });
+
+      test('A man a plan a canal Panama', () => {
+        assertEqual(isPalindrome('A man a plan a canal Panama'), true);
+      });
+
+      // With spaces and punctuation
+      test('Was it a car or a cat I saw', () => {
+        assertEqual(isPalindrome('Was it a car or a cat I saw'), true);
+      });
+
+      // Edge cases
+      test('empty string', () => {
+        assertEqual(isPalindrome(''), true);
+      });
+
+      test('single character', () => {
+        assertEqual(isPalindrome('a'), true);
+      });
+
+      test('numeric', () => {
+        assertEqual(isPalindrome('12321'), true);
+      });
+
+      test('numeric with letters', () => {
+        assertEqual(isPalindrome('1a2 3 2a1'), true);
+      });
+
+      console.log('All tests passed!');
\ No newline at end of file
diff --git a/cases/bootstrap/python-unit-test.yaml b/cases/bootstrap/python-unit-test.yaml
new file mode 100644
index 0000000..88153f2
--- /dev/null
+++ b/cases/bootstrap/python-unit-test.yaml
@@ -0,0 +1,64 @@
+id: bootstrap-003
+title: "Python Unit Tests"
+prompt: |
+  This Python file contains a simple text processing function
+  with failing unit tests. Fix the implementation so that all
+  tests pass.
+
+  The function should:
+  - Count the number of words in a given string
+  - Handle edge cases like empty strings, multiple spaces
+  - Handle punctuation properly
+
+  Run the tests with: python text_processor.test.py
+  Ensure all tests pass.
+
+source: bootstrap
+category: codefix
+language: python
+difficulty: easy
+
+tags:
+  - python
+  - unit-tests
+  - text-processing
+
+files:
+  - path: text_processor.py
+    content: |
+      def count_words(text):
+          """Count the number of words in a string."""
+          # TODO: This implementation is buggy. Fix it!
+          words = text.split(' ')
+          return len(words)
+
+  - path: text_processor.test.py
+    content: |
+      import unittest
+      from text_processor import count_words
+
+      class TestCountWords(unittest.TestCase):
+
+          def test_simple_sentence(self):
+              self.assertEqual(count_words("hello world"), 2)
+              self.assertEqual(count_words("one two three four"), 4)
+
+          def test_empty_string(self):
+              self.assertEqual(count_words(""), 0)
+              self.assertEqual(count_words("   "), 0)
+
+          def test_single_word(self):
+              self.assertEqual(count_words("hello"), 1)
+
+          def test_multiple_spaces(self):
+              self.assertEqual(count_words("hello    world"), 2)
+
+          def test_punctuation(self):
+              self.assertEqual(count_words("hello, world!"), 2)
+              self.assertEqual(count_words("test. another? yes."), 3)
+
+          def test_newlines(self):
+              self.assertEqual(count_words("line1\nline2\nline3"), 3)
+
+      if __name__ == '__main__':
+          unittest.main()
\ No newline at end of file
diff --git a/cases/bootstrap/refactor-shipping.yaml b/cases/bootstrap/refactor-shipping.yaml
new file mode 100644
index 0000000..618e895
--- /dev/null
+++ b/cases/bootstrap/refactor-shipping.yaml
@@ -0,0 +1,135 @@
+id: bootstrap-006
+title: "Refactor Bad Code"
+prompt: |
+  The following code works but has multiple issues:
+  - Poor naming
+  - Magic numbers
+  - No error handling
+  - Hard to test
+  - Code duplication
+
+  Refactor the code to be:
+  - Readable with clear naming
+  - Maintainable
+  - Well-tested with the provided test suite
+  - Handle edge cases properly
+
+  The tests describe the expected behavior - make them pass
+  while improving code quality.
+
+  Run: node shipping_calculator.test.js
+
+source: bootstrap
+category: refactoring
+language: javascript
+difficulty: medium
+
+tags:
+  - javascript
+  - refactoring
+  - code-quality
+
+files:
+  - path: shipping_calculator.js
+    content: |
+      function c(w, d, z) {
+        if (w <= 0) return 0;
+        if (z == 'domestic') {
+          if (w < 5) return 5;
+          if (w < 10) return 10;
+          if (w < 20) return 15;
+          return 25;
+        }
+        if (z == 'international') {
+          if (d == 'express') {
+            if (w < 5) return 20;
+            if (w < 10) return 35;
+            return 50;
+          }
+          if (w < 5) return 15;
+          if (w < 10) return 25;
+          return 40;
+        }
+        return null;
+      }
+
+      module.exports = { c };
+
+  - path: shipping_calculator.test.js
+    content: |
+      const { c } = require('./shipping_calculator');
+
+      function test(name, fn) {
+        try {
+          fn();
+          console.log(`✓ ${name}`);
+        } catch (err) {
+          console.log(`✗ ${name}: ${err.message}`);
+          process.exit(1);
+        }
+      }
+
+      // Note: These tests import the function by its exported name 'c'. You may
+      // rename it internally, but the module must keep exporting 'c' as well.
+
+      test('domestic under 5 lbs', () => {
+        const r = c(4, '', 'domestic');
+        if (r !== 5) throw new Error(`Expected 5, got ${r}`);
+      });
+
+      test('domestic 5-9 lbs', () => {
+        const r = c(7, '', 'domestic');
+        if (r !== 10) throw new Error(`Expected 10, got ${r}`);
+      });
+
+      test('domestic 10-19 lbs', () => {
+        const r = c(15, '', 'domestic');
+        if (r !== 15) throw new Error(`Expected 15, got ${r}`);
+      });
+
+      test('domestic 20+ lbs', () => {
+        const r = c(25, '', 'domestic');
+        if (r !== 25) throw new Error(`Expected 25, got ${r}`);
+      });
+
+      test('international standard under 5 lbs', () => {
+        const r = c(4, 'standard', 'international');
+        if (r !== 15) throw new Error(`Expected 15, got ${r}`);
+      });
+
+      test('international standard 5-9 lbs', () => {
+        const r = c(7, 'standard', 'international');
+        if (r !== 25) throw new Error(`Expected 25, got ${r}`);
+      });
+
+      test('international standard 10+ lbs', () => {
+        const r = c(12, 'standard', 'international');
+        if (r !== 40) throw new Error(`Expected 40, got ${r}`);
+      });
+
+      test('international express under 5 lbs', () => {
+        const r = c(3, 'express', 'international');
+        if (r !== 20) throw new Error(`Expected 20, got ${r}`);
+      });
+
+      test('international express 5-9 lbs', () => {
+        const r = c(8, 'express', 'international');
+        if (r !== 35) throw new Error(`Expected 35, got ${r}`);
+      });
+
+      test('international express 10+ lbs', () => {
+        const r = c(15, 'express', 'international');
+        if (r !== 50) throw new Error(`Expected 50, got ${r}`);
+      });
+
+      test('zero or negative weight', () => {
+        if (c(0, '', 'domestic') !== 0) throw new Error('Zero weight should be 0');
+        if (c(-1, '', 'domestic') !== 0) throw new Error('Negative weight should be 0');
+      });
+
+      test('invalid zone', () => {
+        const r = c(5, '', 'invalid');
+        if (r !== null) throw new Error('Invalid zone should return null');
+      });
+
+      console.log('All tests passed!');
\ No newline at end of file
diff --git a/package.json b/package.json
index 066e195..92cbafb 100644
--- a/package.json
+++ b/package.json
@@ -2,13 +2,14 @@
   "name": "sniffbench",
   "version": "0.1.1",
   "description": "A benchmark suite for coding agents. Think pytest, but for evaluating AI assistants.",
+  "type": "commonjs",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
   "bin": {
     "sniff": "dist/cli/index.js"
   },
   "scripts": {
-    "build": "tsc",
+    "build": "tsc && cp src/agents/opencode-sdk.mjs dist/agents/opencode-sdk.mjs",
     "dev": "tsc --watch",
     "prepublishOnly": "npm run build",
     "test": "jest",
@@ -43,10 +44,12 @@
   "dependencies": {
     "@anthropic-ai/claude-agent-sdk": "^0.1.61",
     "@anthropic-ai/claude-code": "^2.0.61",
+    "@opencode-ai/sdk": "^1.1.65",
     "chalk": "^5.3.0",
     "commander": "^12.0.0",
     "dockerode": "^4.0.2",
     "ora": "^8.0.0",
+
     "yaml": "^2.3.4",
     "zod": "^4.1.13"
   },
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 7fc12b5..1bd05a3 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -14,6 +14,9 @@ importers:
       '@anthropic-ai/claude-code':
         specifier: ^2.0.61
         version: 2.0.76
+      '@opencode-ai/sdk':
+        specifier: ^1.1.65
+        version: 1.2.1
       chalk:
         specifier: ^5.3.0
         version: 5.6.2
@@ -332,56 +335,66 @@ packages:
     resolution: {integrity: sha512-9B+taZ8DlyyqzZQnoeIvDVR/2F4EbMepXMc/NdVbkzsJbzkUjhXv/70GQJ7tdLA4YJgNP25zukcxpX2/SueNrA==}
     cpu: [arm64]
     os: [linux]
+    libc: [glibc]
 
   '@img/sharp-libvips-linux-arm@1.0.5':
     resolution: {integrity: sha512-gvcC4ACAOPRNATg/ov8/MnbxFDJqf/pDePbBnuBDcjsI8PssmjoKMAz4LtLaVi+OnSb5FK/yIOamqDwGmXW32g==}
     cpu: [arm]
     os: [linux]
+    libc: [glibc]
 
   '@img/sharp-libvips-linux-x64@1.0.4':
     resolution: {integrity: sha512-MmWmQ3iPFZr0Iev+BAgVMb3ZyC4KeFc3jFxnNbEPas60e1cIfevbtuyf9nDGIzOaW9PdnDciJm+wFFaTlj5xYw==}
     cpu: [x64]
     os: [linux]
+    libc: [glibc]
 
   '@img/sharp-libvips-linuxmusl-arm64@1.0.4':
     resolution: {integrity: sha512-9Ti+BbTYDcsbp4wfYib8Ctm1ilkugkA/uscUn6UXK1ldpC1JjiXbLfFZtRlBhjPZ5o1NCLiDbg8fhUPKStHoTA==}
     cpu: [arm64]
     os: [linux]
+    libc: [musl]
 
   '@img/sharp-libvips-linuxmusl-x64@1.0.4':
     resolution: {integrity: sha512-viYN1KX9m+/hGkJtvYYp+CCLgnJXwiQB39damAO7WMdKWlIhmYTfHjwSbQeUK/20vY154mwezd9HflVFM1wVSw==}
     cpu: [x64]
     os: [linux]
+    libc: [musl]
 
   '@img/sharp-linux-arm64@0.33.5':
     resolution: {integrity: sha512-JMVv+AMRyGOHtO1RFBiJy/MBsgz0x4AWrT6QoEVVTyh1E39TrCUpTRI7mx9VksGX4awWASxqCYLCV4wBZHAYxA==}
     engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0}
     cpu: [arm64]
     os: [linux]
+    libc: [glibc]
 
   '@img/sharp-linux-arm@0.33.5':
     resolution: {integrity: sha512-JTS1eldqZbJxjvKaAkxhZmBqPRGmxgu+qFKSInv8moZ2AmT5Yib3EQ1c6gp493HvrvV8QgdOXdyaIBrhvFhBMQ==}
     engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0}
     cpu: [arm]
     os: [linux]
+    libc: [glibc]
 
   '@img/sharp-linux-x64@0.33.5':
     resolution: {integrity: sha512-opC+Ok5pRNAzuvq1AG0ar+1owsu842/Ab+4qvU879ippJBHvyY5n2mxF1izXqkPYlGuP/M556uh53jRLJmzTWA==}
     engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0}
     cpu: [x64]
     os: [linux]
+    libc: [glibc]
 
   '@img/sharp-linuxmusl-arm64@0.33.5':
     resolution: {integrity: sha512-XrHMZwGQGvJg2V/oRSUfSAfjfPxO+4DkiRh6p2AFjLQztWUuY/o8Mq0eMQVIY7HJ1CDQUJlxGGZRw1a5bqmd1g==}
     engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0}
     cpu: [arm64]
     os: [linux]
+    libc: [musl]
 
   '@img/sharp-linuxmusl-x64@0.33.5':
     resolution: {integrity: sha512-WT+d/cgqKkkKySYmqoZ8y3pxx7lx9vVejxW/W4DOFMYVSkErR+w7mf2u8m/y4+xHe7yY9DAXQMWQhpnMuFfScw==}
     engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0}
     cpu: [x64]
     os: [linux]
+    libc: [musl]
 
   '@img/sharp-win32-x64@0.33.5':
     resolution: {integrity: sha512-MpY/o8/8kj+EcnxwvrP4aTJSWw/aZ7JIGR4aBeZkZw5B7/Jn+tY9/VNwtcoGmdT7GfggGIU4kygOMSbYnOrAbg==}
@@ -548,6 +561,9 @@ packages:
   '@octokit/types@16.0.0':
     resolution: {integrity: sha512-sKq+9r1Mm4efXW1FCk7hFSeJo4QKreL/tTbR0rz/qx/r1Oa2VV83LTA/H/MuCOX7uCIJmQVRKBcbmWoySjAnSg==}
 
+  '@opencode-ai/sdk@1.2.1':
+    resolution: {integrity: sha512-K5e15mIXTyAykBw0GX+8O28IJHlPMw1jI/m3SDu+hgUHjmg2refqLPqyuqv8hE2nRcuGi8HajhpDJjkO7H2S0A==}
+
   '@pnpm/config.env-replace@1.1.0':
     resolution: {integrity: sha512-htyl8TWnKL7K/ESFa1oW2UB5lVDxuF5DpM7tBi6Hu2LNL3mWkIzNLG6N4zoCUP1lCKNxWy/3iu8mS8MvToGd6w==}
     engines: {node: '>=12.22.0'}
@@ -3408,6 +3424,8 @@ snapshots:
     dependencies:
       '@octokit/openapi-types': 27.0.0
 
+  '@opencode-ai/sdk@1.2.1': {}
+
   '@pnpm/config.env-replace@1.1.0': {}
 
   '@pnpm/network.ca-file@1.0.2':
diff --git a/src/agents/opencode-sdk.mjs b/src/agents/opencode-sdk.mjs
new file mode 100644
index 0000000..a2bbe3e
--- /dev/null
+++ b/src/agents/opencode-sdk.mjs
@@ -0,0 +1,8 @@
+/**
+ * ESM wrapper for @opencode-ai/sdk.
+ * The SDK is ESM-only, so this .mjs module imports it natively and re-exports createOpencodeClient.
+ */
+
+import { createOpencodeClient } from '@opencode-ai/sdk';
+
+export { createOpencodeClient };
\ No newline at end of file
diff --git a/src/agents/opencode-sdk.mjs.d.ts b/src/agents/opencode-sdk.mjs.d.ts
new file mode 100644
index 0000000..f61c7aa
--- /dev/null
+++ b/src/agents/opencode-sdk.mjs.d.ts
@@ -0,0 +1,7 @@
+/**
+ * Type declarations for the opencode-sdk.mjs wrapper; the re-exported client factory is currently typed as `any`.
+ */
+
+declare const createOpencodeClient: any;
+
+export { createOpencodeClient };
diff --git a/src/agents/opencode.ts b/src/agents/opencode.ts
new file mode 100644
index 0000000..eb7d89e
--- /dev/null
+++ b/src/agents/opencode.ts
@@ -0,0 +1,426 @@
+/**
+ * Opencode agent wrapper using SDK
+ *
+ * Uses @opencode-ai/sdk for programmatic interaction with opencode.
+ * Spawns the opencode server with the correct working directory so
+ * the agent operates on the test case files.
+ */
+
+import { spawn, ChildProcess } from 'child_process';
+import {
+  AgentWrapper,
+  AgentResult,
+  AgentRunOptions,
+  ToolCall,
+  emptyAgentResult,
+} from './types.js';
+
+// Lazily import the SDK client factory through the ESM wrapper (the SDK is ESM-only) and cache it for later calls
+let _createOpencodeClient: any;
+const loadSDK = async () => {
+  if (!_createOpencodeClient) {
+    const sdkWrapper = await import('./opencode-sdk.mjs');
+    _createOpencodeClient = sdkWrapper.createOpencodeClient;
+  }
+  return _createOpencodeClient;
+};
+
+// Port counter: every spawnServer() call takes the next port so runs in this process do not share one (no check that the port is free)
+let nextPort = 4097;
+
+/**
+ * Spawn an opencode server process (`opencode serve`) in the given working directory and wait for its "listening" line.
+ * Resolves with the parsed server URL and the child process (the caller must kill it); rejects on timeout, exit, or spawn error.
+ */
+async function spawnServer(
+  cwd: string,
+  config: Record<string, any>,
+  timeoutMs: number,
+): Promise<{ url: string; proc: ChildProcess }> {
+  const port = nextPort++;
+  const proc = spawn('opencode', ['serve', `--hostname=127.0.0.1`, `--port=${port}`], { // NOTE(review): binary name is hard-coded here; OpencodeAgent.cliPath is not passed in
+    cwd,
+    env: {
+      ...process.env,
+      OPENCODE_CONFIG_CONTENT: JSON.stringify(config), // config is handed to the server as JSON through this env var
+    },
+  });
+
+  const url = await new Promise<string>((resolve, reject) => {
+    const id = setTimeout(() => {
+      proc.kill(); // do not leave a half-started server behind
+      reject(new Error(`Timeout waiting for opencode server after ${timeoutMs}ms`));
+    }, timeoutMs);
+
+    let output = '';
+    proc.stdout?.on('data', (chunk: Buffer) => {
+      output += chunk.toString();
+      for (const line of output.split('\n')) { // re-scan everything seen so far, so a line split across chunks still matches
+        if (line.startsWith('opencode server listening')) {
+          const match = line.match(/on\s+(https?:\/\/[^\s]+)/);
+          if (match) {
+            clearTimeout(id);
+            resolve(match[1]);
+            return;
+          }
+        }
+      }
+    });
+    proc.stderr?.on('data', (chunk: Buffer) => {
+      output += chunk.toString(); // stderr is only collected so it can be included in the exit error message
+    });
+    proc.on('exit', (code) => {
+      clearTimeout(id);
+      reject(new Error(`Server exited with code ${code}: ${output}`)); // has no effect if the promise already resolved
+    });
+    proc.on('error', (err) => {
+      clearTimeout(id);
+      reject(err);
+    });
+  });
+
+  return { url, proc };
+}
+
+/**
+ * AgentWrapper implementation that drives opencode through the SDK, spawning a fresh server for every run() call
+ */
+export class OpencodeAgent implements AgentWrapper {
+  name = 'opencode';
+  displayName = 'Opencode';
+
+  private cliPath: string;
+  private config: Record<string, any>;
+
+  constructor(cliPath: string = 'opencode', config?: Record<string, any>) {
+    this.cliPath = cliPath;
+    this.config = config || { // default config: one local provider with a single GLM model
+      model: 'local-glm/glm-4.7-local-4bit',
+      provider: {
+        'local-glm': {
+          api: 'openai',
+          options: {
+            baseURL: 'http://127.0.0.1:8081/v1',
+            apiKey: 'local-glm-key',
+          },
+          models: {
+            'glm-4.7-local-4bit': {
+              name: 'GLM-4.7 Local (4-bit)',
+              id: '/Users/studio/models/GLM-4.7-4bit', // NOTE(review): machine-specific absolute path baked into the default config
+              reasoning: false,
+              tool_call: true,
+              temperature: true,
+              limit: { context: 32768, output: 4096 },
+              cost: { input: 0, output: 0 },
+              modalities: { input: ['text'], output: ['text'] },
+            },
+          },
+        },
+      },
+    };
+  }
+
+  async isAvailable(): Promise<boolean> { // true when getVersion() yields a version string
+    try {
+      const version = await this.getVersion();
+      return version !== null;
+    } catch {
+      return false;
+    }
+  }
+
+  async getVersion(): Promise<string | null> { // trimmed stdout of `<cliPath> --version`; null on non-zero exit, empty output, or spawn error
+    return new Promise((resolve) => {
+      const proc = spawn(this.cliPath, ['--version'], { timeout: 5000 });
+      let stdout = '';
+      proc.stdout?.on('data', (data: Buffer) => {
+        stdout += data.toString();
+      });
+      proc.on('close', (code: number | null) => {
+        resolve(code === 0 && stdout.trim() ? stdout.trim() : null);
+      });
+      proc.on('error', () => resolve(null));
+    });
+  }
+
+  async run(prompt: string, options: AgentRunOptions): Promise<AgentResult> { // failures are reported via an 'error' event plus an emptyAgentResult-based result
+    const runStartTime = Date.now();
+    const timeoutMs = options.timeoutMs || 300000; // default: 5 minutes
+    const toolCalls: ToolCall[] = [];
+    let model = 'unknown';
+    let sessionId = '';
+    let serverProc: ChildProcess | null = null;
+
+    try {
+      // Spawn server in the case's working directory
+      const cwd = options.cwd || process.cwd();
+      const config = options.model
+        ? { ...this.config, model: options.model }
+        : this.config;
+      const { url, proc } = await spawnServer(cwd, config, 15000); // 15s startup timeout, separate from the run timeout
+      serverProc = proc;
+
+      const createClient = await loadSDK();
+      const client = createClient({ baseUrl: url });
+
+      const createResult = await client.session.create({});
+      if (createResult.error) {
+        throw new Error(`Failed to create session: ${JSON.stringify(createResult.error)}`);
+      }
+
+      const session = createResult.data;
+      sessionId = session.id;
+      model = options.model || session.version || 'unknown'; // NOTE(review): session.version may not be a model name — confirm; overwritten later by message.updated info
+
+      options.onEvent?.({ type: 'start', timestamp: runStartTime, model });
+
+      // Subscribe to SSE events BEFORE sending the prompt so we capture everything
+      // event.subscribe() returns ServerSentEventsResult directly (not { data, error })
+      const sseResult = await client.event.subscribe({}) as any;
+      const stream: AsyncIterable<any> | undefined =
+        sseResult?.stream || sseResult?.data?.stream || sseResult?.data;
+
+      if (!stream) {
+        throw new Error(
+          `Event stream not available — subscribe() returned: ${JSON.stringify(Object.keys(sseResult || {}))}`,
+        );
+      }
+
+      // Send prompt asynchronously (returns immediately, events stream the progress)
+      const asyncResult = await client.session.promptAsync({
+        path: { id: sessionId },
+        body: {
+          parts: [{ type: 'text', text: prompt }],
+        },
+      });
+
+      if (asyncResult.error) {
+        throw new Error(`Prompt failed: ${JSON.stringify(asyncResult.error)}`);
+      }
+
+      // Process SSE events until the session goes idle or the deadline passes (NOTE(review): the deadline is only checked when an event arrives)
+      let answer = '';
+      let numTurns = 0;
+      let totalTokens = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 };
+      let totalCost = 0;
+      const deadline = Date.now() + timeoutMs - 5000; // stop 5s before the caller's timeout
+
+      for await (const event of stream) {
+        if (Date.now() > deadline) {
+          options.onEvent?.({ type: 'status', message: 'Timed out waiting for agent' });
+          break;
+        }
+
+        const eventType = event?.type || event?.event;
+
+        if (eventType === 'message.part.updated') {
+          const props = event.properties || event.data;
+          if (!props) continue;
+          const part = props.part;
+          if (!part) continue;
+
+          if (part.type === 'text') {
+            // Streaming text delta: append to the answer and forward it to the listener
+            const delta = props.delta || '';
+            if (delta) {
+              answer += delta;
+              options.onEvent?.({ type: 'text_delta', text: delta });
+            }
+          } else if (part.type === 'tool') {
+            const status = part.state?.status;
+            const callID = part.callID || part.callId;
+            const toolName = part.tool || 'unknown';
+
+            if (status === 'running' || status === 'pending') {
+              // Only add if not already tracked
+              if (!toolCalls.find((t) => t.id === callID)) {
+                const toolCall: ToolCall = {
+                  id: callID,
+                  name: toolName,
+                  input: part.state?.input || {},
+                  timestamp: Date.now(),
+                };
+                toolCalls.push(toolCall);
+                options.onEvent?.({ type: 'tool_start', tool: toolCall });
+                options.onEvent?.({ type: 'status', message: `Tool: ${toolName}` });
+              }
+            } else if (status === 'completed') {
+              const existing = toolCalls.find((t) => t.id === callID);
+              if (existing) {
+                existing.durationMs = part.state?.time
+                  ? (part.state.time.end - part.state.time.start) * 1000 // NOTE(review): assumes time.start/end are in seconds — confirm against the SDK
+                  : Date.now() - existing.timestamp;
+                existing.success = true;
+                existing.result = part.state?.output
+                  ? String(part.state.output).substring(0, 500) // keep only the first 500 characters of tool output
+                  : undefined;
+              } else {
+                // Tool completed without a prior start event (can happen if subscription started late)
+                toolCalls.push({
+                  id: callID,
+                  name: toolName,
+                  input: part.state?.input || {},
+                  timestamp: Date.now(),
+                  durationMs: part.state?.time
+                    ? (part.state.time.end - part.state.time.start) * 1000
+                    : 0,
+                  success: true,
+                  result: part.state?.output
+                    ? String(part.state.output).substring(0, 500)
+                    : undefined,
+                });
+              }
+              options.onEvent?.({
+                type: 'tool_end',
+                toolId: callID,
+                success: true,
+                durationMs: toolCalls.find((t) => t.id === callID)?.durationMs || 0,
+              });
+            } else if (status === 'error') {
+              const existing = toolCalls.find((t) => t.id === callID); // an untracked failing tool is reported but not added to toolCalls
+              if (existing) {
+                existing.success = false;
+                existing.durationMs = Date.now() - existing.timestamp;
+              }
+              options.onEvent?.({
+                type: 'tool_end',
+                toolId: callID,
+                success: false,
+                durationMs: existing?.durationMs || 0,
+              });
+            }
+          } else if (part.type === 'reasoning') {
+            const text = props.delta || part.text || '';
+            if (text) {
+              options.onEvent?.({ type: 'thinking', text });
+            }
+          } else if (part.type === 'step-finish') {
+            numTurns++;
+            // Accumulate per-step tokens/cost
+            if (part.tokens) {
+              totalTokens.input += part.tokens.input || 0;
+              totalTokens.output += part.tokens.output || 0;
+              totalTokens.cacheRead += part.tokens.cache?.read || 0;
+              totalTokens.cacheWrite += part.tokens.cache?.write || 0;
+              totalTokens.total += part.tokens.total || 0;
+            }
+            if (part.cost) {
+              totalCost += part.cost;
+            }
+          }
+        } else if (eventType === 'message.updated') {
+          // A full message update — extract final info from here
+          const props = event.properties || event.data;
+          const info = props?.info;
+          if (info?.providerID && info?.modelID) {
+            model = `${info.providerID}/${info.modelID}`;
+          }
+          // Use message-level tokens as authoritative total if available
+          if (info?.tokens?.total) {
+            totalTokens = {
+              input: info.tokens.input || totalTokens.input,
+              output: info.tokens.output || totalTokens.output,
+              cacheRead: info.tokens.cache?.read || totalTokens.cacheRead,
+              cacheWrite: info.tokens.cache?.write || totalTokens.cacheWrite,
+              total: info.tokens.total,
+            };
+          }
+          if (info?.cost !== undefined) {
+            totalCost = info.cost;
+          }
+          // Extract final answer text from message parts if we haven't captured it via deltas
+          if (props?.parts && !answer) {
+            for (const p of props.parts) {
+              if (p.type === 'text' && p.text) {
+                answer += p.text;
+              }
+            }
+          }
+        } else if (eventType === 'session.status') {
+          const props = event.properties || event.data;
+          const status = props?.status;
+          if (status?.type === 'idle') {
+            // Agent finished processing
+            options.onEvent?.({ type: 'status', message: 'Session idle — agent finished' });
+            break;
+          } else if (status?.type === 'busy') {
+            options.onEvent?.({ type: 'status', message: 'Agent working...' });
+          } else if (status?.type === 'retry') {
+            options.onEvent?.({
+              type: 'status',
+              message: `Retrying (attempt ${status.attempt}): ${status.message}`,
+            });
+          }
+        } else if (eventType === 'session.error') {
+          const props = event.properties || event.data;
+          const errMsg = props?.error?.message || JSON.stringify(props?.error) || 'Unknown error';
+          options.onEvent?.({ type: 'error', message: errMsg, code: 'SESSION_ERROR' }); // NOTE(review): reported only; the loop keeps waiting and the final result still says success: true
+        }
+      }
+
+      // If answer is still empty, fetch the final messages from the session
+      if (!answer) {
+        const messagesResult = await client.session.messages({
+          path: { id: sessionId },
+        });
+        if (messagesResult.data) {
+          const messages = messagesResult.data as any[];
+          // Find the last assistant message
+          for (let i = messages.length - 1; i >= 0; i--) {
+            const msg = messages[i];
+            if (msg.role === 'assistant' && msg.parts) {
+              for (const p of msg.parts) {
+                if (p.type === 'text' && p.text) {
+                  answer += p.text;
+                }
+              }
+              break;
+            }
+          }
+        }
+      }
+
+      const result: AgentResult = {
+        answer,
+        success: true, // NOTE(review): set even when timedOut below is true
+        timedOut: Date.now() > deadline, // evaluated here, after the loop (and any messages fetch) has finished
+        durationMs: Date.now() - runStartTime,
+        tokens: {
+          inputTokens: totalTokens.input,
+          outputTokens: totalTokens.output,
+          cacheReadTokens: totalTokens.cacheRead,
+          cacheWriteTokens: totalTokens.cacheWrite,
+          totalTokens: totalTokens.total,
+        },
+        costUsd: totalCost,
+        numTurns: numTurns || 1, // report at least one turn even if no step-finish part was seen
+        toolCalls,
+        toolsUsed: [...new Set(toolCalls.map((t) => t.name))],
+        model,
+        raw: { sessionId },
+      };
+
+      options.onEvent?.({ type: 'complete', result });
+      return result;
+    } catch (error) {
+      const errorMessage = error instanceof Error ? error.message : String(error);
+
+      options.onEvent?.({ type: 'error', message: errorMessage, code: 'ERROR' });
+
+      const errorResult = emptyAgentResult(errorMessage); // keep whatever tool calls and model were observed before the failure
+      errorResult.durationMs = Date.now() - runStartTime;
+      errorResult.toolCalls = toolCalls;
+      errorResult.toolsUsed = [...new Set(toolCalls.map((t) => t.name))];
+      errorResult.model = model;
+
+      options.onEvent?.({ type: 'complete', result: errorResult });
+      return errorResult;
+    } finally {
+      serverProc?.kill(); // always stop the server spawned for this run
+    }
+  }
+}
+
+export function createOpencodeAgent(cliPath?: string): OpencodeAgent { // factory; passes no config, so the constructor's default config is used
+  return new OpencodeAgent(cliPath);
+}
diff --git a/src/agents/registry.ts b/src/agents/registry.ts
index 273aa38..828ce26 100644
--- a/src/agents/registry.ts
+++ b/src/agents/registry.ts
@@ -6,6 +6,10 @@
 
 import { AgentWrapper, AgentRegistry } from './types';
 import { createClaudeCodeAgent } from './claude-code';
+import { createOpencodeAgent } from './opencode';
+
+/** Default agent used when none is specified on the CLI */
+export const DEFAULT_AGENT = 'claude-code';
 
 /**
  * Default agent registry implementation
@@ -16,6 +20,7 @@ class DefaultAgentRegistry implements AgentRegistry {
   constructor() {
     // Register built-in agents
     this.register(createClaudeCodeAgent());
+    this.register(createOpencodeAgent());
   }
 
   get(name: string): AgentWrapper | undefined {
diff --git a/src/cases/loader.ts b/src/cases/loader.ts
index 587e116..928163b 100644
--- a/src/cases/loader.ts
+++ b/src/cases/loader.ts
@@ -147,23 +147,26 @@ export interface LoadOptions {
 }
 
 /**
- * Load all cases from a directory
+ * Load all cases from one or more directories
  */
-export async function loadCases(casesDir: string, options: LoadOptions = {}): Promise<Case[]> {
+export async function loadCases(casesDir: string | string[], options: LoadOptions = {}): Promise<Case[]> {
   const cases: Case[] = [];
-
-  // Check if directory exists
-  if (!fs.existsSync(casesDir)) {
-    return cases;
+  const dirs = Array.isArray(casesDir) ? casesDir : [casesDir];
+  const seenIds = new Set<string>();
+
+  // Collect YAML files from all directories
+  const yamlFiles: string[] = [];
+  for (const dir of dirs) {
+    if (fs.existsSync(dir)) {
+      yamlFiles.push(...findYamlFiles(dir));
+    }
   }
 
-  // Recursively find all YAML files
-  const yamlFiles = findYamlFiles(casesDir);
-
   for (const filePath of yamlFiles) {
     try {
       const result = await loadCaseFile(filePath, options);
-      if (result.case && matchesFilter(result.case, options)) {
+      if (result.case && matchesFilter(result.case, options) && !seenIds.has(result.case.id)) {
+        seenIds.add(result.case.id);
         cases.push(result.case);
       }
       // Log warnings
@@ -348,22 +351,41 @@ function matchesFilter(caseData: Case, options: LoadOptions): boolean {
 
 /**
  * Get the default cases directory for a project
+ * Returns the first existing cases directory, or the built-in cases path when none exists (never undefined).
+ * @deprecated Use getDefaultCasesDirs() instead — returns all case directories
  */
 export function getDefaultCasesDir(projectRoot: string = process.cwd()): string {
-  // Check for .sniffbench/cases first (project-specific)
+  return getDefaultCasesDirs(projectRoot)[0] ?? path.join(__dirname, '..', '..', 'cases'); // the list can be empty; keep the old always-a-path contract
+}
+
+/**
+ * Get all cases directories that exist on disk (project-specific + built-in); the result may be empty
+ *
+ * Project-specific cases (.sniffbench/cases) come first so they take
+ * precedence over built-in cases with the same ID.
+ */
+export function getDefaultCasesDirs(projectRoot: string = process.cwd()): string[] {
+  const dirs: string[] = [];
+
+  // Project-specific cases (first = higher priority for dedup)
   const projectCases = path.join(projectRoot, '.sniffbench', 'cases');
   if (fs.existsSync(projectCases)) {
-    return projectCases;
+    dirs.push(projectCases);
+  }
+
+  // Built-in cases shipped with sniffbench (resolved relative to this module, two levels up)
+  const builtInCases = path.join(__dirname, '..', '..', 'cases');
+  if (fs.existsSync(builtInCases)) {
+    dirs.push(builtInCases);
   }
 
-  // Fall back to cases/ in sniffbench installation
-  return path.join(__dirname, '..', '..', 'cases');
+  return dirs; // empty when neither directory exists
 }
 
 /**
  * List available case categories
  */
-export async function listCategories(casesDir: string): Promise<string[]> {
+export async function listCategories(casesDir: string | string[]): Promise<string[]> {
   const cases = await loadCases(casesDir);
   const categories = new Set(cases.map((c) => c.category));
   return Array.from(categories).sort();
@@ -372,7 +394,7 @@ export async function listCategories(casesDir: string): Promise<string[]> {
 /**
  * List available languages
  */
-export async function listLanguages(casesDir: string): Promise<string[]> {
+export async function listLanguages(casesDir: string | string[]): Promise<string[]> {
   const cases = await loadCases(casesDir);
   const languages = new Set(cases.map((c) => c.language));
   return Array.from(languages).sort();
@@ -381,7 +403,7 @@ export async function listLanguages(casesDir: string): Promise<string[]> {
 /**
  * Get a single case by ID
  */
-export async function getCaseById(casesDir: string, id: string): Promise<Case | null> {
+export async function getCaseById(casesDir: string | string[], id: string): Promise<Case | null> {
   const cases = await loadCases(casesDir, { ids: [id] });
   return cases[0] || null;
 }
diff --git a/src/cases/types.ts b/src/cases/types.ts
index f63fe20..aaaf1fe 100644
--- a/src/cases/types.ts
+++ b/src/cases/types.ts
@@ -460,6 +460,21 @@ export interface CaseResult {
   /** Agent behavior trace */
   agentTrace?: AgentTrace;
 
+  /** The agent's text response */
+  agentResponse?: string;
+
+  /** Tool calls the agent made */
+  agentToolCalls?: { name: string; durationMs?: number; success?: boolean }[];
+
+  /** Model used */
+  agentModel?: string;
+
+  /** Token usage */
+  agentTokens?: { input: number; output: number; total: number };
+
+  /** Files produced by the agent (snapshot of workspace after agent runs) */
+  agentFiles?: { path: string; content: string; changed: boolean }[];
+
   /** Total duration in milliseconds */
   durationMs: number;
 
diff --git a/src/cli/commands/cases.ts b/src/cli/commands/cases.ts
index f67e1e4..93a4b4d 100644
--- a/src/cli/commands/cases.ts
+++ b/src/cli/commands/cases.ts
@@ -8,7 +8,7 @@ import { spawn } from 'child_process';
 import { box } from '../../utils/ui';
 import {
   loadCases,
-  getDefaultCasesDir,
+  getDefaultCasesDirs,
   listCategories,
   listLanguages,
   getCaseById,
@@ -37,9 +37,9 @@ export async function casesListCommand(options: CasesListOptions) {
   const spinner = ora('Loading cases...').start();
 
   try {
-    const casesDir = getDefaultCasesDir();
+    const casesDirs = getDefaultCasesDirs();
 
-    const cases = await loadCases(casesDir, {
+    const cases = await loadCases(casesDirs, {
       category: options.category,
       language: options.language,
       difficulty: options.difficulty as CaseDifficulty | undefined,
@@ -57,7 +57,7 @@ export async function casesListCommand(options: CasesListOptions) {
     if (cases.length === 0) {
       console.log(chalk.yellow('No cases found matching the criteria.'));
       console.log(chalk.dim('\nTip: Try running without filters, or add cases to:'));
-      console.log(chalk.cyan(`  ${casesDir}`));
+      console.log(chalk.cyan(`  ${casesDirs.join(' or ')}`));
       return;
     }
 
@@ -97,8 +97,8 @@ export async function casesShowCommand(options: CasesShowOptions) {
   const spinner = ora('Loading case...').start();
 
   try {
-    const casesDir = getDefaultCasesDir();
-    const caseData = await getCaseById(casesDir, options.id);
+    const casesDirs = getDefaultCasesDirs();
+    const caseData = await getCaseById(casesDirs, options.id);
 
     spinner.stop();
 
@@ -141,8 +141,8 @@ export async function casesCategoriesCommand() {
   const spinner = ora('Loading categories...').start();
 
   try {
-    const casesDir = getDefaultCasesDir();
-    const categories = await listCategories(casesDir);
+    const casesDirs = getDefaultCasesDirs();
+    const categories = await listCategories(casesDirs);
 
     spinner.stop();
 
@@ -169,8 +169,8 @@ export async function casesLanguagesCommand() {
   const spinner = ora('Loading languages...').start();
 
   try {
-    const casesDir = getDefaultCasesDir();
-    const languages = await listLanguages(casesDir);
+    const casesDirs = getDefaultCasesDirs();
+    const languages = await listLanguages(casesDirs);
 
     spinner.stop();
 
diff --git a/src/cli/commands/closed-issues.ts b/src/cli/commands/closed-issues.ts
index c67b220..efdfe15 100644
--- a/src/cli/commands/closed-issues.ts
+++ b/src/cli/commands/closed-issues.ts
@@ -44,7 +44,7 @@ import {
   ClosedIssueCaseRun,
   Run,
 } from '../../runs';
-import { getAgent } from '../../agents';
+import { getAgent, DEFAULT_AGENT } from '../../agents';
 
 // =============================================================================
 // Command Interfaces
@@ -71,6 +71,8 @@ interface ListCommandOptions {
 
 interface RunCommandOptions {
   case?: string;
+  agent?: string;
+  model?: string;
   variant?: string;
   local?: boolean;
   timeout?: string;
@@ -431,6 +433,8 @@ export async function closedIssuesRunCommand(options: RunCommandOptions) {
 
       const result = await runClosedIssueCase({
         caseData: c,
+        agent: options.agent,
+        model: options.model,
         variant,
         projectRoot: process.cwd(),
         timeoutMs,
@@ -466,7 +470,7 @@ export async function closedIssuesRunCommand(options: RunCommandOptions) {
     }
 
     // Save run to store
-    const runId = await saveClosedIssuesRun(projectRoot, results, variant, options.run);
+    const runId = await saveClosedIssuesRun(projectRoot, results, variant, options.run, options.agent);
 
     // Output JSON if requested
     if (options.json) {
@@ -566,10 +570,11 @@ async function saveClosedIssuesRun(
   projectRoot: string,
   results: RunCaseResult[],
   variant: Variant | undefined,
-  label?: string
+  label?: string,
+  agentName?: string
 ): Promise<string> {
   // Capture agent config
-  const agent = getAgent('claude-code');
+  const agent = getAgent(agentName || DEFAULT_AGENT);
   const agentConfig = await capturePartialAgentConfig(agent, projectRoot);
 
   // Link to variant if used
diff --git a/src/cli/commands/interview.ts b/src/cli/commands/interview.ts
index bddcc38..cc6e12d 100644
--- a/src/cli/commands/interview.ts
+++ b/src/cli/commands/interview.ts
@@ -13,7 +13,7 @@ import * as fs from 'fs';
 import * as path from 'path';
 import * as readline from 'readline';
 import { box } from '../../utils/ui';
-import { loadCases, getDefaultCasesDir } from '../../cases';
+import { loadCases, getDefaultCasesDirs } from '../../cases';
 import { Case } from '../../cases/types';
 import { getAgent, AgentWrapper, AgentResult, AgentEvent } from '../../agents';
 import { computeBehaviorMetrics, formatBehaviorMetrics } from '../../metrics';
@@ -1003,9 +1003,9 @@ export async function interviewCommand(options: InterviewOptions) {
 
   // Load comprehension cases
   spinner.start('Loading comprehension cases...');
-  const casesDir = getDefaultCasesDir();
+  const casesDirs = getDefaultCasesDirs();
 
-  const cases = await loadCases(casesDir, {
+  const cases = await loadCases(casesDirs, {
     category: 'comprehension',
     ids: options.cases?.split(',').map(c => c.trim()),
   });
@@ -1013,7 +1013,7 @@ export async function interviewCommand(options: InterviewOptions) {
   if (cases.length === 0) {
     spinner.warn('No comprehension cases found');
     console.log(chalk.yellow('\nMake sure comprehension cases exist in:'));
-    console.log(chalk.cyan(`  ${casesDir}/comprehension/`));
+    console.log(chalk.cyan(`  ${casesDirs.join(' or ')}`));
     return;
   }
 
diff --git a/src/cli/commands/run.ts b/src/cli/commands/run.ts
index 498bb74..7921767 100644
--- a/src/cli/commands/run.ts
+++ b/src/cli/commands/run.ts
@@ -3,7 +3,7 @@ import ora from 'ora';
 import * as fs from 'fs';
 import * as path from 'path';
 import { box } from '../../utils/ui';
-import { loadCases, getDefaultCasesDir } from '../../cases';
+import { loadCases, getDefaultCasesDirs } from '../../cases';
 import { CaseResult } from '../../cases/types';
 import { runCases, ProgressUpdate } from '../../evaluation';
 import { checkDocker } from '../../sandbox';
@@ -14,6 +14,7 @@ interface RunOptions {
   output: string;
   timeout?: number;
   network?: boolean;
+  model?: string;
 }
 
 export async function runCommand(options: RunOptions) {
@@ -35,12 +36,12 @@ export async function runCommand(options: RunOptions) {
 
   // Load cases
   spinner.start('Loading test cases...');
-  const casesDir = getDefaultCasesDir();
+  const casesDirs = getDefaultCasesDirs();
 
   // Parse case filter if provided
   const caseIds = options.cases?.split(',').map((c) => c.trim());
 
-  const cases = await loadCases(casesDir, {
+  const cases = await loadCases(casesDirs, {
     ids: caseIds,
   });
 
@@ -48,7 +49,7 @@ export async function runCommand(options: RunOptions) {
     spinner.warn('No test cases found');
     console.log(
       chalk.yellow('\nTo add test cases, create YAML files in:\n') +
-        chalk.cyan(`  ${casesDir}\n\n`) +
+        chalk.cyan(`  ${casesDirs.join(' or ')}\n\n`) +
         chalk.dim('See cases/bootstrap/example-case-spec.yaml for format.')
     );
     return;
@@ -86,7 +87,7 @@ export async function runCommand(options: RunOptions) {
 
   const onCaseComplete = (result: CaseResult) => {
     if (currentSpinner) {
-      const scorePercent = Math.round(result.score * 100);
+      const scorePercent = Math.round(result.score);
       if (result.passed) {
         currentSpinner.succeed(`${result.caseId}: ${chalk.green('PASSED')} (${scorePercent}%, ${formatDuration(result.durationMs)})`);
       } else if (result.timedOut) {
@@ -103,6 +104,7 @@ export async function runCommand(options: RunOptions) {
   try {
     const result = await runCases(cases, {
       agent: options.agent,
+      model: options.model,
       timeoutSeconds: options.timeout || 300,
       networkEnabled: options.network || false,
       onProgress,
@@ -111,7 +113,7 @@ export async function runCommand(options: RunOptions) {
 
     // Display summary
     console.log('');
-    const averageScorePercent = Math.round(result.summary.averageScore * 100);
+    const averageScorePercent = Math.round(result.summary.averageScore);
     const summaryLines = [
       chalk.bold('Run Summary\n'),
       `Run ID: ${chalk.cyan(result.runId)}`,
diff --git a/src/cli/commands/variant.ts b/src/cli/commands/variant.ts
index 95056cb..3b545f5 100644
--- a/src/cli/commands/variant.ts
+++ b/src/cli/commands/variant.ts
@@ -18,7 +18,7 @@ import {
   hashAgentConfig,
   Variant,
 } from '../../variants';
-import { getAgent } from '../../agents';
+import { getAgent, DEFAULT_AGENT } from '../../agents';
 import {
   buildVariantImage,
   variantImageExists,
@@ -90,8 +90,7 @@ export async function variantRegisterCommand(
     console.log(chalk.dim(`  Replacing existing variant "${name}"...`));
   }
 
-  // Get the agent (defaults to claude-code)
-  const agentName = options.agent || 'claude-code';
+  const agentName = options.agent || DEFAULT_AGENT;
   const agent = getAgent(agentName);
 
   // Capture current ambient config with full MCP details
diff --git a/src/cli/index.ts b/src/cli/index.ts
index 466c33e..1e98886 100644
--- a/src/cli/index.ts
+++ b/src/cli/index.ts
@@ -42,6 +42,7 @@ import {
   closedIssuesRunCommand,
   closedIssuesCompareCommand,
 } from './commands/closed-issues';
+import { DEFAULT_AGENT } from '../agents/registry';
 
 const program = new Command();
 
@@ -59,11 +60,12 @@ program
 program
   .command('run')
   .description('Run evaluation suite on specified agent')
-  .option('--agent <name>', 'Agent to evaluate (claude-code, cursor, aider)', 'claude-code')
+  .option('--agent <name>', 'Agent to evaluate (claude-code, opencode, cursor, aider)', DEFAULT_AGENT)
   .option('--cases <cases>', 'Specific test cases to run (comma-separated)')
   .option('--output <dir>', 'Output directory for results', 'results')
   .option('--timeout <seconds>', 'Timeout per case in seconds', '300')
   .option('--network', 'Enable network access in sandbox (disabled by default)')
+  .option('--model <model>', 'Model to use (agent-specific, e.g. local-glm/glm-4.7-local-4bit)')
   .action((opts) => runCommand({ ...opts, timeout: parseInt(opts.timeout, 10) }));
 
 program
@@ -136,7 +138,7 @@ program
 program
   .command('interview')
   .description('Run comprehension interview to test agent understanding')
-  .option('--agent <name>', 'Agent to evaluate', 'claude-code')
+  .option('--agent <name>', 'Agent to evaluate', DEFAULT_AGENT)
   .option('--cases <cases>', 'Specific case IDs to run (comma-separated)')
   .option('--output <dir>', 'Output directory for results', 'results')
   .option('--compare', 'Compare new responses against existing baselines')
@@ -184,7 +186,7 @@ variantCmd
   .argument('<name>', 'Variant name (e.g., "control", "with-linear-mcp")')
   .option('-d, --description <text>', 'Description of the variant')
   .option('-c, --changes <changes...>', 'List of explicit changes in this variant')
-  .option('-a, --agent <name>', 'Agent type to capture config for', 'claude-code')
+  .option('-a, --agent <name>', 'Agent type to capture config for', DEFAULT_AGENT)
   .option('-b, --build', 'Build container image after registration')
   .option('-f, --force', 'Overwrite existing variant with same name')
   .action((name, opts) => variantRegisterCommand(name, opts));
@@ -315,6 +317,8 @@ closedIssuesCmd
   .command('run')
   .description('Run agent on closed-issue cases and compare to reference solutions')
   .option('-c, --case <id>', 'Specific case ID to run')
+  .option('--agent <name>', 'Agent to evaluate', DEFAULT_AGENT)
+  .option('--model <model>', 'Model to use (agent-specific)')
   .option('--variant <name>', 'Use a specific variant container (default: active variant)')
   .option('--local', 'Run with local claude command instead of variant container')
   .option('-t, --timeout <seconds>', 'Timeout per case in seconds', '600')
diff --git a/src/closed-issues/runner.ts b/src/closed-issues/runner.ts
index f3a3e12..b124c7a 100644
--- a/src/closed-issues/runner.ts
+++ b/src/closed-issues/runner.ts
@@ -5,7 +5,7 @@
  * to the reference PR that originally closed the issue.
  */
 
-import { execSync, spawn } from 'child_process';
+import { execSync } from 'child_process';
 import * as fs from 'fs';
 import * as path from 'path';
 import * as os from 'os';
@@ -19,6 +19,7 @@ import { Variant } from '../variants/types';
 import { runInVariant, RunOptions, VariantRunResult } from '../sandbox/variant-runner';
 import { collectRequiredEnvVars } from '../sandbox/variant-container';
 import { checkMissingEnvVars, getEnvVars, getEnvFilePath } from '../utils/env';
+import { getAgent, DEFAULT_AGENT } from '../agents/registry';
 
 // =============================================================================
 // Types
@@ -28,6 +29,12 @@ export interface RunCaseOptions {
   /** The closed issue case to run */
   caseData: ClosedIssueCase;
 
+  /** Agent name to use (default: from DEFAULT_AGENT) */
+  agent?: string;
+
+  /** Model to use (agent-specific) */
+  model?: string;
+
   /** Optional variant to use (runs in container) */
   variant?: Variant;
 
@@ -102,6 +109,8 @@ const DEFAULT_TIMEOUT_MS = 10 * 60 * 1000;
 export async function runClosedIssueCase(options: RunCaseOptions): Promise<RunCaseResult> {
   const {
     caseData,
+    agent: agentName = DEFAULT_AGENT,
+    model,
     variant,
     projectRoot = process.cwd(),
     timeoutMs = DEFAULT_TIMEOUT_MS,
@@ -163,19 +172,36 @@ export async function runClosedIssueCase(options: RunCaseOptions): Promise<RunCa
         return createErrorResult(caseData.id, 'Agent timed out', startTime);
       }
     } else {
-      // Run with local claude command
-      const result = await runAgentLocally({
-        prompt: caseData.prompt,
-        workdir: tempDir,
+      // Run through agent wrapper (supports opencode, claude-code, etc.)
+      const agent = getAgent(agentName);
+      const agentResult = await agent.run(caseData.prompt, {
+        cwd: tempDir,
+        model,
         timeoutMs,
-        stream,
-        onOutput,
+        permissionMode: 'acceptEdits',
+        onEvent: stream ? (event) => {
+          if (event.type === 'text_delta' && onOutput) {
+            onOutput('stdout', event.text);
+          } else if (event.type === 'status' && onOutput) {
+            onOutput('stderr', event.message + '\n');
+          }
+        } : undefined,
       });
 
-      agentOutput = result.output;
+      agentOutput = agentResult.answer;
+      if (agentResult.tokens) {
+        tokens = {
+          inputTokens: agentResult.tokens.inputTokens,
+          outputTokens: agentResult.tokens.outputTokens,
+          cacheReadTokens: agentResult.tokens.cacheReadTokens,
+          cacheWriteTokens: agentResult.tokens.cacheWriteTokens,
+          totalTokens: agentResult.tokens.totalTokens,
+        };
+      }
+      costUsd = agentResult.costUsd;
 
-      if (!result.success) {
-        return createErrorResult(caseData.id, result.error || 'Agent failed', startTime);
+      if (!agentResult.success) {
+        return createErrorResult(caseData.id, agentResult.error || 'Agent failed', startTime);
       }
     }
 
@@ -334,79 +360,6 @@ async function runAgentWithVariant(options: {
   return runInVariant(options.variant, options.prompt, runOptions);
 }
 
-/**
- * Run agent locally using claude command
- */
-async function runAgentLocally(options: {
-  prompt: string;
-  workdir: string;
-  timeoutMs: number;
-  stream?: boolean;
-  onOutput?: (type: 'stdout' | 'stderr', data: string) => void;
-}): Promise<{ success: boolean; output: string; error?: string }> {
-  return new Promise((resolve) => {
-    let output = '';
-    let stderr = '';
-    let timedOut = false;
-
-    const proc = spawn('claude', ['--print', '--dangerously-skip-permissions', options.prompt], {
-      cwd: options.workdir,
-      env: {
-        ...process.env,
-        // Set HOME to a temp location to avoid polluting user's config
-        HOME: options.workdir,
-      },
-    });
-
-    const timeoutId = setTimeout(() => {
-      timedOut = true;
-      proc.kill('SIGTERM');
-      setTimeout(() => proc.kill('SIGKILL'), 5000);
-    }, options.timeoutMs);
-
-    proc.stdout?.on('data', (data) => {
-      const str = data.toString();
-      output += str;
-      if (options.stream && options.onOutput) {
-        options.onOutput('stdout', str);
-      }
-    });
-
-    proc.stderr?.on('data', (data) => {
-      const str = data.toString();
-      stderr += str;
-      if (options.stream && options.onOutput) {
-        options.onOutput('stderr', str);
-      }
-    });
-
-    proc.on('close', (code) => {
-      clearTimeout(timeoutId);
-
-      if (timedOut) {
-        resolve({ success: false, output, error: 'Agent timed out' });
-        return;
-      }
-
-      if (code !== 0) {
-        resolve({
-          success: false,
-          output,
-          error: `Agent exited with code ${code}: ${stderr}`,
-        });
-        return;
-      }
-
-      resolve({ success: true, output });
-    });
-
-    proc.on('error', (error) => {
-      clearTimeout(timeoutId);
-      resolve({ success: false, output, error: error.message });
-    });
-  });
-}
-
 /**
  * Capture the agent's changes as a diff
  */
diff --git a/src/evaluation/runner.ts b/src/evaluation/runner.ts
index 31765f8..302c91b 100644
--- a/src/evaluation/runner.ts
+++ b/src/evaluation/runner.ts
@@ -12,6 +12,7 @@ import * as path from 'path';
 import * as os from 'os';
 import {
   Case,
+  CaseFile,
   CaseResult,
   CriterionResult,
   EvaluatorResult,
@@ -22,11 +23,16 @@ import {
 import { createSandboxManager, checkDocker, RECOMMENDED_IMAGES } from '../sandbox';
 import { Sandbox, SandboxConfig } from '../sandbox/types';
 import { getRubricRegistry } from '../rubrics/loader';
+import { getAgent } from '../agents/registry';
+import type { AgentResult } from '../agents/types';
 
 export interface RunnerOptions {
   /** Agent being evaluated (for logging) */
   agent: string;
 
+  /** Model to use (passed to agent) */
+  model?: string;
+
   /** Timeout per case in seconds */
   timeoutSeconds?: number;
 
@@ -213,6 +219,30 @@ async function runSingleCase(
       // Install dependencies if needed
       await installDependencies(sandbox, caseData.language, options, caseIndex, totalCases, caseData.id);
 
+      // Run the agent to attempt to solve the case
+      options.onProgress?.({
+        type: 'running',
+        caseId: caseData.id,
+        caseIndex,
+        totalCases,
+        message: 'Running agent...',
+      });
+
+      const agent = getAgent(options.agent);
+      const agentResult: AgentResult = await agent.run(caseData.prompt, {
+        cwd: tempDir,
+        model: options.model,
+        timeoutMs: (options.timeoutSeconds || 300) * 1000,
+        permissionMode: 'acceptEdits',
+      });
+
+      if (!agentResult.success) {
+        throw new Error(`Agent execution failed: ${agentResult.error}`);
+      }
+
+      // Snapshot files the agent produced (before rubric evaluation)
+      const agentFiles = snapshotFiles(tempDir, caseData.files);
+
       // Evaluate using the rubric
       options.onProgress?.({
         type: 'validating',
@@ -230,11 +260,26 @@ async function runSingleCase(
         caseId: caseData.id,
         caseIndex,
         totalCases,
-        message: result.passed ? `Passed (${(result.score * 100).toFixed(0)}%)` : `Failed (${(result.score * 100).toFixed(0)}%)`,
+        message: result.passed ? `Passed (${Math.round(result.score)}%)` : `Failed (${Math.round(result.score)}%)`,
       });
 
       return {
         ...result,
+        agentResponse: agentResult.answer,
+        agentToolCalls: agentResult.toolCalls.map((t) => ({
+          name: t.name,
+          durationMs: t.durationMs,
+          success: t.success,
+        })),
+        agentModel: agentResult.model,
+        agentTokens: agentResult.tokens
+          ? {
+              input: agentResult.tokens.inputTokens,
+              output: agentResult.tokens.outputTokens,
+              total: agentResult.tokens.totalTokens,
+            }
+          : undefined,
+        agentFiles,
         durationMs,
         timestamp: new Date(),
       };
@@ -311,18 +356,18 @@ async function evaluateWithRubric(
         };
       } else if (evaluator.type === 'pattern') {
         // Run pattern evaluator (check for matches in files)
-        // For now, just pass - full implementation will use grep/find
+        // Default to fail until fully implemented
         evalResult = {
-          passed: true,
-          score: 1.0,
-          evidence: 'Pattern check not fully implemented',
+          passed: false,
+          score: 0.0,
+          evidence: 'Pattern check not yet implemented',
         };
       } else {
-        // Other evaluator types (llm_judge, benchmark, etc.) - placeholder
+        // Other evaluator types (llm_judge, benchmark, etc.) - not implemented
         evalResult = {
-          passed: true,
-          score: 1.0,
-          evidence: 'Evaluator type not yet implemented',
+          passed: false,
+          score: 0.0,
+          evidence: `Evaluator type '${evaluator.type}' not yet implemented`,
         };
       }
 
@@ -342,8 +387,10 @@ async function evaluateWithRubric(
     }
 
     // Average score for this criterion
-    const rawScore = evaluatorCount > 0 ? criterionScore / evaluatorCount : 1.0;
-    const weightedScore = (rawScore * criterion.weight) / 100;
+    // If no non-optional evaluators ran, this criterion doesn't participate in scoring
+    const hasRequiredEvaluators = evaluatorCount > 0;
+    const rawScore = hasRequiredEvaluators ? criterionScore / evaluatorCount : 0.0;
+    const weightedScore = hasRequiredEvaluators ? (rawScore * criterion.weight) / 100 : 0;
     const allPassed = evaluatorResults.filter((e) => !e.passed).length === 0;
 
     criteriaResults.push({
@@ -356,11 +403,18 @@ async function evaluateWithRubric(
     });
 
     totalWeightedScore += weightedScore;
-    _totalWeight += criterion.weight;
+    // Only count weight for criteria that had non-optional evaluators
+    if (hasRequiredEvaluators) {
+      _totalWeight += criterion.weight;
+    }
   }
 
-  // Calculate overall score (sum of weighted scores, as percentage)
-  const overallScore = totalWeightedScore * 100;
+  // Normalize score by participating weight (criteria with only optional evaluators are excluded)
+  // Each criterion's weightedScore = rawScore * weight / 100, so totalWeightedScore
+  // is a fraction of 1.0 when all weights sum to 100. When some criteria are excluded,
+  // rescale so the participating criteria fill the full 0-100% range.
+  const participatingFraction = _totalWeight / 100;
+  const overallScore = participatingFraction > 0 ? (totalWeightedScore / participatingFraction) * 100 : 0;
 
   // Determine pass/fail (default threshold: 70%)
   const passThreshold = 70;
@@ -415,3 +469,70 @@ async function installDependencies(
     await sandbox.exec('test -f go.mod && go mod download || true');
   }
 }
+
+/**
+ * Snapshot the text files in the workspace after the agent runs.
+ * Reads directly from the host tempDir (bind-mounted into the sandbox),
+ * skipping dependency/cache directories, files over 100KB, and binary files
+ * (any file containing a NUL byte). A file is flagged as changed when its
+ * content differs from, or it is absent in, the original case files.
+ *
+ * @param tempDir - Workspace root to walk.
+ * @param originalFiles - Original case files, used for comparison.
+ * @returns One entry per kept file; `path` is relative to `tempDir`, '/'-separated.
+ */
+function snapshotFiles(
+  tempDir: string,
+  originalFiles?: CaseFile[]
+): { path: string; content: string; changed: boolean }[] {
+  const results: { path: string; content: string; changed: boolean }[] = [];
+  const origMap = new Map<string, string>();
+
+  // Original contents keyed by relative path; entries without inline content are ignored
+  for (const f of originalFiles ?? []) {
+    if (f.content !== undefined) {
+      origMap.set(f.path, f.content);
+    }
+  }
+
+  // Directories that hold dependencies or caches rather than workspace sources
+  const skippedDirs = ['node_modules', '.git', '__pycache__', '.pytest_cache', 'venv', '.venv'];
+  const maxFileBytes = 100_000;
+
+  function walk(dir: string, prefix: string) {
+    let entries: fs.Dirent[];
+    try {
+      entries = fs.readdirSync(dir, { withFileTypes: true });
+    } catch {
+      return; // Unreadable directory: the snapshot is best-effort, so skip it
+    }
+    for (const entry of entries) {
+      const relPath = prefix ? `${prefix}/${entry.name}` : entry.name;
+      const fullPath = path.join(dir, entry.name);
+
+      if (entry.isDirectory()) {
+        if (!skippedDirs.includes(entry.name)) {
+          walk(fullPath, relPath);
+        }
+        continue;
+      }
+
+      if (!entry.isFile()) continue;
+
+      try {
+        if (fs.statSync(fullPath).size > maxFileBytes) continue;
+        const raw = fs.readFileSync(fullPath);
+        // Decoding as UTF-8 never throws (invalid bytes become U+FFFD), so binary
+        // files must be detected explicitly; a NUL byte is the usual heuristic.
+        if (raw.includes(0)) continue;
+        const content = raw.toString('utf-8');
+        results.push({ path: relPath, content, changed: origMap.get(relPath) !== content });
+      } catch {
+        // File vanished or is unreadable: skip it
+      }
+    }
+  }
+
+  walk(tempDir, '');
+  return results;
+}
diff --git a/src/rubrics/defaults.ts b/src/rubrics/defaults.ts
index 5409e20..f678d9e 100644
--- a/src/rubrics/defaults.ts
+++ b/src/rubrics/defaults.ts
@@ -30,7 +30,7 @@ export const defaultRubric: Rubric = {
         {
           type: 'command',
           name: 'Tests pass',
-          run: 'npm test 2>/dev/null || pytest 2>/dev/null || go test ./... 2>/dev/null || echo "No test runner found"',
+          run: 'npm test 2>/dev/null || node *.test.js 2>/dev/null || python *.test.py 2>/dev/null || python *_test.py 2>/dev/null || pytest 2>/dev/null || go test ./... 2>/dev/null || exit 1',
           partialCredit: true,
           passThreshold: 1.0,
         },
@@ -88,7 +88,7 @@ export const defaultRubric: Rubric = {
           type: 'command',
           name: 'Reasonable file sizes',
           // Check no single file is > 1000 lines
-          run: 'find . -name "*.{js,ts,py}" -exec wc -l {} + 2>/dev/null | awk \'$1 > 1000 {exit 1}\' || true',
+          run: 'find . \\( -name "*.js" -o -name "*.ts" -o -name "*.py" \\) -exec wc -l {} + 2>/dev/null | awk \'$1 > 1000 {exit 1}\' || true',
           optional: true,
         },
       ],
@@ -114,7 +114,7 @@ export const minimalRubric: Rubric = {
         {
           type: 'command',
           name: 'Tests pass',
-          run: 'npm test 2>/dev/null || pytest 2>/dev/null || go test ./... 2>/dev/null || exit 1',
+          run: 'npm test 2>/dev/null || node *.test.js 2>/dev/null || python *.test.py 2>/dev/null || python *_test.py 2>/dev/null || pytest 2>/dev/null || go test ./... 2>/dev/null || exit 1',
           partialCredit: true,
         },
       ],
@@ -138,7 +138,7 @@ export const strictRubric: Rubric = {
         {
           type: 'command',
           name: 'Tests pass',
-          run: 'npm test || pytest || go test ./...',
+          run: 'npm test 2>/dev/null || node *.test.js 2>/dev/null || python *.test.py 2>/dev/null || python *_test.py 2>/dev/null || pytest 2>/dev/null || go test ./... 2>/dev/null || exit 1',
           partialCredit: true,
           passThreshold: 1.0,
         },
diff --git a/tsconfig.json b/tsconfig.json
index 39562be..a84a8ba 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -7,6 +7,7 @@
     "rootDir": "./src",
     "strict": true,
     "esModuleInterop": true,
+
     "skipLibCheck": true,
     "forceConsistentCasingInFileNames": true,
     "resolveJsonModule": true,