diff --git a/HANDOFF.md b/HANDOFF.md new file mode 100644 index 0000000..0502097 --- /dev/null +++ b/HANDOFF.md @@ -0,0 +1,72 @@ +# Handoff Notes + +## Current state + +The `add-glm-agent` branch (PR #48) adds an opencode agent integration to sniffbench. It works end-to-end: spawns an opencode server, sends prompts via the SDK, streams events via SSE, captures tool calls and file snapshots, evaluates with the rubric, and saves results. + +**To test:** +```bash +pnpm run build +npx sniff run --agent opencode --cases bootstrap-005 +npx sniff closed-issues run --agent opencode --local +``` + +## Test results (GLM-4.7-4bit via opencode) + +### Bootstrap cases + +| Case | Score | Verdict | Notes | +|------|-------|---------|-------| +| bootstrap-003 (Python Unit Tests) | 100% | **False pass** | Starter code already passes all tests — no-op | +| bootstrap-004 (Palindrome Checker) | 0% | Fail | Agent edited file but tests failed. JS test runner (`node *.test.js`) was missing from rubric command chain — now fixed | +| bootstrap-005 (Binary Search) | 100% | **Legit pass** | Agent implemented full binary search from `pass` stub | +| bootstrap-006 (Refactor Bad Code) | 0% | Fail | Agent refactored and renamed function `c`, breaking `require('./shipping_calculator').c` in test file. Also hit missing JS test runner issue | +| bootstrap-007 (CSV Parser) | 100% | **Legit pass** | Agent implemented CSV parser from empty `return []` stub | +| simple-001 (Simple math) | 0% | Fail | Code already passes — should be free win. Failed because JS test runner was missing from rubric | +| fail-001 (Intentionally failing) | 0% | Expected fail | Case exists to verify failure reporting works | + +**Key fix:** Added `node *.test.js` to the test command chain in `defaults.ts`. All JS cases were failing because the rubric only tried `npm test` (no package.json) then fell through to Python/Go runners. 
+ +### Closed-issues cases (real GitHub issues) + +| Case | Score | Notes | +|------|-------|-------| +| #12 (Add --compare flag) | 10/100 | 0 files changed. Agent explored but produced no edits. Hard task: 363 additions across 2 files | +| #38 (Split variant/variants) | 10/100 | Created new `variants.ts` instead of modifying existing files. Understood intent but wrong approach | + +These are significantly harder than bootstrap cases — they require understanding a real codebase and making coordinated multi-file changes. Likely beyond a 4-bit quantized local model's capability. + +## What's next + +### 1. Bootstrap test cases need work +- **bootstrap-003 (python-unit-test):** Scored as a no-op in the run above — the starter code already passed all tests. The case must ship a genuinely buggy implementation (e.g., `text.split(' ')` instead of `text.split()`); `cases/bootstrap/python-unit-test.yaml` in this branch uses `text.split(' ')`, so re-run it to confirm it is no longer a free pass. +- **bootstrap-006 (refactor):** Test file imports `c` by name. If the agent renames it (the whole point of refactoring), tests break. Either update tests to import by new name, or make the test more flexible. + +### 2. Re-run JS cases +After the `node *.test.js` fix, bootstrap-004 and simple-001 should be re-run to get accurate scores. + +### 3. CodeRabbit review items +- Remove unused `randombytes` dependency from package.json +- Remove redundant `allowSyntheticDefaultImports` from tsconfig.json +- Remove redundant `"dist/**/*"` from tsconfig exclude + +### 4. Hardcoded model config +The local-glm provider config (baseURL, model path, API key) is hardcoded in `src/agents/opencode.ts` constructor default. Should be externalized — read from opencode config file or a sniffbench config file. + +### 5. Comprehension cases +The 12 comp-* cases reference a `comprehension` rubric that doesn't exist yet. These are Q&A tasks that need LLM-judge or human evaluation, not test suites. + +## Key technical details + +- **ESM wrapper:** The `@opencode-ai/sdk` is ESM-only but the project is CommonJS. The `.mjs` wrapper in `src/agents/opencode-sdk.mjs` bridges this. 
tsc doesn't copy `.mjs` files, so the build script includes a manual `cp` step. +- **Port management:** `nextPort` counter in opencode.ts increments per run to avoid collisions. Resets on process restart. +- **Server lifecycle:** Each agent run spawns a real opencode server process on a unique port, with the case's temp dir as cwd. The server is killed in the `finally` block. +- **Event streaming:** Uses `client.event.subscribe()` (SSE) + `session.promptAsync()`. The stream object is at `sseResult.stream` (not `.data`). Events arrive as `message.part.updated` (tool calls, text, reasoning), `message.updated` (final tokens/cost), `session.status` (idle = done). +- **File snapshots:** After agent runs, `snapshotFiles()` in runner.ts walks the host tempDir and captures all files with a `changed` flag vs originals. Skips node_modules, .git, __pycache__, files >100KB. +- **Agent config:** `DEFAULT_AGENT` constant in `src/agents/registry.ts` is used across all CLI commands. No more hardcoded `'claude-code'` strings. +- **Closed-issues runner:** Now routes through agent wrappers (supports `--agent` and `--model` flags) instead of shelling out directly to `claude` CLI. + +## Git conventions +- Do NOT include `Co-Authored-By` or Anthropic email in commits +- Include model version (e.g., `claude-opus-4-6`) in commit body if desired +- Never amend commits — always create new ones diff --git a/cases/bootstrap/binary-search.yaml b/cases/bootstrap/binary-search.yaml new file mode 100644 index 0000000..1f53899 --- /dev/null +++ b/cases/bootstrap/binary-search.yaml @@ -0,0 +1,91 @@ +id: bootstrap-005 +title: "Binary Search Implementation" +prompt: | + Complete the binary_search function implementation. The function + should find the index of a target value in a sorted array, or + return -1 if not found. 
+ + Binary search must: + - Run in O(log n) time complexity + - Handle empty arrays + - Handle values not present in the array + - Work with any comparable values + + Run: python binary_search.test.py + Make all tests pass. + +source: bootstrap +category: codefix +language: python +difficulty: medium + +tags: + - python + - algorithms + - binary-search + +files: + - path: binary_search.py + content: | + def binary_search(arr, target): + """ + Perform binary search on a sorted array. + + Args: + arr: Sorted list of comparable elements + target: Value to search for + + Returns: + Index of target if found, -1 otherwise + + Time complexity: O(log n) + """ + # TODO: Implement binary search + pass + + - path: binary_search.test.py + content: | + import unittest + from binary_search import binary_search + + class TestBinarySearch(unittest.TestCase): + + def test_found_elements(self): + arr = [1, 3, 5, 7, 9, 11, 13, 15] + self.assertEqual(binary_search(arr, 7), 3) + self.assertEqual(binary_search(arr, 1), 0) + self.assertEqual(binary_search(arr, 15), 7) + self.assertEqual(binary_search(arr, 9), 4) + + def test_not_found(self): + arr = [1, 3, 5, 7, 9] + self.assertEqual(binary_search(arr, 2), -1) + self.assertEqual(binary_search(arr, 6), -1) + self.assertEqual(binary_search(arr, 10), -1) + + def test_empty_array(self): + self.assertEqual(binary_search([], 5), -1) + + def test_single_element(self): + arr = [42] + self.assertEqual(binary_search(arr, 42), 0) + self.assertEqual(binary_search(arr, 0), -1) + + def test_two_elements(self): + arr = [1, 2] + self.assertEqual(binary_search(arr, 1), 0) + self.assertEqual(binary_search(arr, 2), 1) + + def test_strings(self): + arr = ['apple', 'banana', 'cherry', 'date'] + self.assertEqual(binary_search(arr, 'cherry'), 2) + self.assertEqual(binary_search(arr, 'grape'), -1) + + def test_large_array(self): + arr = list(range(1000)) + self.assertEqual(binary_search(arr, 42), 42) + self.assertEqual(binary_search(arr, 999), 999) + 
self.assertEqual(binary_search(arr, 1000), -1) + + if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/cases/bootstrap/csv-parser.yaml b/cases/bootstrap/csv-parser.yaml new file mode 100644 index 0000000..e71369a --- /dev/null +++ b/cases/bootstrap/csv-parser.yaml @@ -0,0 +1,125 @@ +id: bootstrap-007 +title: "File Processing - CSV Parser" +prompt: | + Implement a CSV parser that can read and parse a CSV file. + The implementation should handle: + - Basic comma-separated values + - Quoted fields containing commas + - Header row extraction + - Converting to array of objects + + Run: python csv_parser.test.py + Make all tests pass. + +source: bootstrap +category: codefix +language: python +difficulty: medium + +tags: + - python + - file-processing + - csv + +files: + - path: csv_parser.py + content: | + import csv + + def parse_csv(filepath, has_header=True): + """ + Parse a CSV file and return data as list of dicts (or lists). + + Args: + filepath: Path to the CSV file + has_header: Whether the first row is a header row + + Returns: + List of dictionaries (if has_header=True) or list of lists + """ + # TODO: Implement this function + return [] + + - path: csv_parser.test.py + content: | + import unittest + import os + import tempfile + import textwrap + from csv_parser import parse_csv + + class TestCSVParser(unittest.TestCase): + + def test_simple_csv_with_header(self): + data = textwrap.dedent('''\ + name,age,city + Alice,30,New York + Bob,25,Los Angeles + Charlie,35,Chicago''') + + with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f: + f.write(data) + f.flush() + + result = parse_csv(f.name, has_header=True) + + self.assertEqual(len(result), 3) + self.assertEqual(result[0]['name'], 'Alice') + self.assertEqual(result[0]['age'], '30') + self.assertEqual(result[1]['city'], 'Los Angeles') + + os.unlink(f.name) + + def test_csv_without_header(self): + data = textwrap.dedent('''\ + Alice,30,New York + Bob,25,Los 
Angeles''') + + with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f: + f.write(data) + f.flush() + + result = parse_csv(f.name, has_header=False) + + self.assertEqual(len(result), 2) + self.assertEqual(result[0][0], 'Alice') + self.assertEqual(result[1][2], 'Los Angeles') + + os.unlink(f.name) + + def test_quoted_fields(self): + data = textwrap.dedent('''\ + product,price,description + Widget,10.00,"A widget, really." + Gadget,15.00,"A device, good."''') + + with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f: + f.write(data) + f.flush() + + result = parse_csv(f.name, has_header=True) + + self.assertEqual(len(result), 2) + self.assertEqual(result[0]['description'], 'A widget, really.') + self.assertEqual(result[1]['description'], 'A device, good.') + + os.unlink(f.name) + + def test_single_row(self): + data = textwrap.dedent('''\ + name,value + test,123''') + + with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f: + f.write(data) + f.flush() + + result = parse_csv(f.name, has_header=True) + + self.assertEqual(len(result), 1) + self.assertEqual(result[0]['name'], 'test') + + os.unlink(f.name) + + if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/cases/bootstrap/palindrome-checker.yaml b/cases/bootstrap/palindrome-checker.yaml new file mode 100644 index 0000000..ab47218 --- /dev/null +++ b/cases/bootstrap/palindrome-checker.yaml @@ -0,0 +1,97 @@ +id: bootstrap-004 +title: "Palindrome Checker" +prompt: | + Implement a palindrome checker that works correctly across + different edge cases. The tests are already written - you need + to make them all pass. + + A palindrome reads the same forwards and backwards. + You should: + - Ignore case + - Ignore non-alphanumeric characters + - Handle empty strings as valid palindromes + + Run: node palindrome.test.js + Fix the implementation until all tests pass. 
+ +source: bootstrap +category: codefix +language: javascript +difficulty: easy + +tags: + - javascript + - algorithms + - string-manipulation + +files: + - path: palindrome.js + content: | + function isPalindrome(str) { + // TODO: Implement properly + return str === str.split('').reverse().join(''); + } + + module.exports = { isPalindrome }; + + - path: palindrome.test.js + content: | + const { isPalindrome } = require('./palindrome'); + + function test(name, fn) { + try { + fn(); + console.log(`✓ ${name}`); + } catch (err) { + console.log(`✗ ${name}: ${err.message}`); + process.exit(1); + } + } + + function assertEqual(actual, expected, message) { + if (actual !== expected) { + throw new Error(message || `Expected ${expected}, got ${actual}`); + } + } + + // Basic palindromes + test('racecar is palindrome', () => { + assertEqual(isPalindrome('racecar'), true); + }); + + test('hello is not palindrome', () => { + assertEqual(isPalindrome('hello'), false); + }); + + // Case insensitive + test('RaceCar is palindrome', () => { + assertEqual(isPalindrome('RaceCar'), true); + }); + + test('A man a plan a canal Panama', () => { + assertEqual(isPalindrome('A man a plan a canal Panama'), true); + }); + + // With spaces and punctuation + test('Was it a car or a cat I saw', () => { + assertEqual(isPalindrome('Was it a car or a cat I saw'), true); + }); + + // Edge cases + test('empty string', () => { + assertEqual(isPalindrome(''), true); + }); + + test('single character', () => { + assertEqual(isPalindrome('a'), true); + }); + + test('numeric', () => { + assertEqual(isPalindrome('12321'), true); + }); + + test('numeric with letters', () => { + assertEqual(isPalindrome('1a2 3 2a1'), true); + }); + + console.log('All tests passed!'); \ No newline at end of file diff --git a/cases/bootstrap/python-unit-test.yaml b/cases/bootstrap/python-unit-test.yaml new file mode 100644 index 0000000..88153f2 --- /dev/null +++ b/cases/bootstrap/python-unit-test.yaml @@ -0,0 +1,64 @@ +id: 
bootstrap-003 +title: "Python Unit Tests" +prompt: | + This Python file contains a simple text processing function + with failing unit tests. Fix the implementation so that all + tests pass. + + The function should: + - Count the number of words in a given string + - Handle edge cases like empty strings, multiple spaces + - Handle punctuation properly + + Run the tests with: python text_processor.test.py + Ensure all tests pass. + +source: bootstrap +category: codefix +language: python +difficulty: easy + +tags: + - python + - unit-tests + - text-processing + +files: + - path: text_processor.py + content: | + def count_words(text): + """Count the number of words in a string.""" + # TODO: This implementation is buggy. Fix it! + words = text.split(' ') + return len(words) + + - path: text_processor.test.py + content: | + import unittest + from text_processor import count_words + + class TestCountWords(unittest.TestCase): + + def test_simple_sentence(self): + self.assertEqual(count_words("hello world"), 2) + self.assertEqual(count_words("one two three four"), 4) + + def test_empty_string(self): + self.assertEqual(count_words(""), 0) + self.assertEqual(count_words(" "), 0) + + def test_single_word(self): + self.assertEqual(count_words("hello"), 1) + + def test_multiple_spaces(self): + self.assertEqual(count_words("hello world"), 2) + + def test_punctuation(self): + self.assertEqual(count_words("hello, world!"), 2) + self.assertEqual(count_words("test. another? 
yes."), 3) + + def test_newlines(self): + self.assertEqual(count_words("line1\nline2\nline3"), 3) + + if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/cases/bootstrap/refactor-shipping.yaml b/cases/bootstrap/refactor-shipping.yaml new file mode 100644 index 0000000..618e895 --- /dev/null +++ b/cases/bootstrap/refactor-shipping.yaml @@ -0,0 +1,135 @@ +id: bootstrap-006 +title: "Refactor Bad Code" +prompt: | + The following code works but has multiple issues: + - Poor naming + - Magic numbers + - No error handling + - Hard to test + - Code duplication + + Refactor the code to be: + - Readable with clear naming + - Maintainable + - Well-tested with the provided test suite + - Handle edge cases properly + + The tests describe the expected behavior - make them pass + while improving code quality. + + Run: node shipping_calculator.test.js + +source: bootstrap +category: refactoring +language: javascript +difficulty: medium + +tags: + - javascript + - refactoring + - code-quality + +files: + - path: shipping_calculator.js + content: | + function c(w, d, z) { + if (w <= 0) return 0; + if (z == 'domestic') { + if (w < 5) return 5; + if (w < 10) return 10; + if (w < 20) return 15; + return 25; + } + if (z == 'international') { + if (d == 'express') { + if (w < 5) return 20; + if (w < 10) return 35; + return 50; + } + if (w < 5) return 15; + if (w < 10) return 25; + return 40; + } + return null; + } + + module.exports = { c }; + + - path: shipping_calculator.test.js + content: | + const { c } = require('./shipping_calculator'); + + function test(name, fn) { + try { + fn(); + console.log(`✓ ${name}`); + } catch (err) { + console.log(`✗ ${name}: ${err.message}`); + process.exit(1); + } + } + + // Note: The exported function is named 'c'. This is part of what + // needs to be refactored. For now, we use 'c' in tests. 
+ + test('domestic under 5 lbs', () => { + const r = c(4, '', 'domestic'); + if (r !== 5) throw new Error(`Expected 5, got ${r}`); + }); + + test('domestic 5-9 lbs', () => { + const r = c(7, '', 'domestic'); + if (r !== 10) throw new Error(`Expected 10, got ${r}`); + }); + + test('domestic 10-19 lbs', () => { + const r = c(15, '', 'domestic'); + if (r !== 15) throw new Error(`Expected 15, got ${r}`); + }); + + test('domestic 20+ lbs', () => { + const r = c(25, '', 'domestic'); + if (r !== 25) throw new Error(`Expected 25, got ${r}`); + }); + + test('international standard under 5 lbs', () => { + const r = c(4, 'standard', 'international'); + if (r !== 15) throw new Error(`Expected 15, got ${r}`); + }); + + test('international standard 5-9 lbs', () => { + const r = c(7, 'standard', 'international'); + if (r !== 25) throw new Error(`Expected 25, got ${r}`); + }); + + test('international standard 10+ lbs', () => { + const r = c(12, 'standard', 'international'); + if (r !== 40) throw new Error(`Expected 40, got ${r}`); + }); + + test('international express under 5 lbs', () => { + const r = c(3, 'express', 'international'); + if (r !== 20) throw new Error(`Expected 20, got ${r}`); + }); + + test('international express 5-9 lbs', () => { + const r = c(8, 'express', 'international'); + if (r !== 35) throw new Error(`Expected 35, got ${r}`); + }); + + test('international express 10+ lbs', () => { + const r = c(15, 'express', 'international'); + if (r !== 50) throw new Error(`Expected 50, got ${r}`); + }); + + test('zero or negative weight', () => { + if (c(0, '', 'domestic') !== 0) throw new Error('Zero weight should be 0'); + if (c(-1, '', 'domestic') !== 0) throw new Error('Negative weight should be 0'); + }); + + test('invalid zone', () => { + const r = c(5, '', 'invalid'); + if (r !== null) throw new Error('Invalid zone should return null'); + }); + + console.log('All tests passed!'); \ No newline at end of file diff --git a/package.json b/package.json index 
066e195..92cbafb 100644 --- a/package.json +++ b/package.json @@ -2,13 +2,14 @@ "name": "sniffbench", "version": "0.1.1", "description": "A benchmark suite for coding agents. Think pytest, but for evaluating AI assistants.", + "type": "commonjs", "main": "dist/index.js", "types": "dist/index.d.ts", "bin": { "sniff": "dist/cli/index.js" }, "scripts": { - "build": "tsc", + "build": "tsc && cp src/agents/opencode-sdk.mjs dist/agents/opencode-sdk.mjs", "dev": "tsc --watch", "prepublishOnly": "npm run build", "test": "jest", @@ -43,10 +44,12 @@ "dependencies": { "@anthropic-ai/claude-agent-sdk": "^0.1.61", "@anthropic-ai/claude-code": "^2.0.61", + "@opencode-ai/sdk": "^1.1.65", "chalk": "^5.3.0", "commander": "^12.0.0", "dockerode": "^4.0.2", "ora": "^8.0.0", + "yaml": "^2.3.4", "zod": "^4.1.13" }, diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 7fc12b5..1bd05a3 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -14,6 +14,9 @@ importers: '@anthropic-ai/claude-code': specifier: ^2.0.61 version: 2.0.76 + '@opencode-ai/sdk': + specifier: ^1.1.65 + version: 1.2.1 chalk: specifier: ^5.3.0 version: 5.6.2 @@ -332,56 +335,66 @@ packages: resolution: {integrity: sha512-9B+taZ8DlyyqzZQnoeIvDVR/2F4EbMepXMc/NdVbkzsJbzkUjhXv/70GQJ7tdLA4YJgNP25zukcxpX2/SueNrA==} cpu: [arm64] os: [linux] + libc: [glibc] '@img/sharp-libvips-linux-arm@1.0.5': resolution: {integrity: sha512-gvcC4ACAOPRNATg/ov8/MnbxFDJqf/pDePbBnuBDcjsI8PssmjoKMAz4LtLaVi+OnSb5FK/yIOamqDwGmXW32g==} cpu: [arm] os: [linux] + libc: [glibc] '@img/sharp-libvips-linux-x64@1.0.4': resolution: {integrity: sha512-MmWmQ3iPFZr0Iev+BAgVMb3ZyC4KeFc3jFxnNbEPas60e1cIfevbtuyf9nDGIzOaW9PdnDciJm+wFFaTlj5xYw==} cpu: [x64] os: [linux] + libc: [glibc] '@img/sharp-libvips-linuxmusl-arm64@1.0.4': resolution: {integrity: sha512-9Ti+BbTYDcsbp4wfYib8Ctm1ilkugkA/uscUn6UXK1ldpC1JjiXbLfFZtRlBhjPZ5o1NCLiDbg8fhUPKStHoTA==} cpu: [arm64] os: [linux] + libc: [musl] '@img/sharp-libvips-linuxmusl-x64@1.0.4': resolution: {integrity: 
sha512-viYN1KX9m+/hGkJtvYYp+CCLgnJXwiQB39damAO7WMdKWlIhmYTfHjwSbQeUK/20vY154mwezd9HflVFM1wVSw==} cpu: [x64] os: [linux] + libc: [musl] '@img/sharp-linux-arm64@0.33.5': resolution: {integrity: sha512-JMVv+AMRyGOHtO1RFBiJy/MBsgz0x4AWrT6QoEVVTyh1E39TrCUpTRI7mx9VksGX4awWASxqCYLCV4wBZHAYxA==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm64] os: [linux] + libc: [glibc] '@img/sharp-linux-arm@0.33.5': resolution: {integrity: sha512-JTS1eldqZbJxjvKaAkxhZmBqPRGmxgu+qFKSInv8moZ2AmT5Yib3EQ1c6gp493HvrvV8QgdOXdyaIBrhvFhBMQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm] os: [linux] + libc: [glibc] '@img/sharp-linux-x64@0.33.5': resolution: {integrity: sha512-opC+Ok5pRNAzuvq1AG0ar+1owsu842/Ab+4qvU879ippJBHvyY5n2mxF1izXqkPYlGuP/M556uh53jRLJmzTWA==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [x64] os: [linux] + libc: [glibc] '@img/sharp-linuxmusl-arm64@0.33.5': resolution: {integrity: sha512-XrHMZwGQGvJg2V/oRSUfSAfjfPxO+4DkiRh6p2AFjLQztWUuY/o8Mq0eMQVIY7HJ1CDQUJlxGGZRw1a5bqmd1g==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm64] os: [linux] + libc: [musl] '@img/sharp-linuxmusl-x64@0.33.5': resolution: {integrity: sha512-WT+d/cgqKkkKySYmqoZ8y3pxx7lx9vVejxW/W4DOFMYVSkErR+w7mf2u8m/y4+xHe7yY9DAXQMWQhpnMuFfScw==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [x64] os: [linux] + libc: [musl] '@img/sharp-win32-x64@0.33.5': resolution: {integrity: sha512-MpY/o8/8kj+EcnxwvrP4aTJSWw/aZ7JIGR4aBeZkZw5B7/Jn+tY9/VNwtcoGmdT7GfggGIU4kygOMSbYnOrAbg==} @@ -548,6 +561,9 @@ packages: '@octokit/types@16.0.0': resolution: {integrity: sha512-sKq+9r1Mm4efXW1FCk7hFSeJo4QKreL/tTbR0rz/qx/r1Oa2VV83LTA/H/MuCOX7uCIJmQVRKBcbmWoySjAnSg==} + '@opencode-ai/sdk@1.2.1': + resolution: {integrity: sha512-K5e15mIXTyAykBw0GX+8O28IJHlPMw1jI/m3SDu+hgUHjmg2refqLPqyuqv8hE2nRcuGi8HajhpDJjkO7H2S0A==} + '@pnpm/config.env-replace@1.1.0': resolution: {integrity: sha512-htyl8TWnKL7K/ESFa1oW2UB5lVDxuF5DpM7tBi6Hu2LNL3mWkIzNLG6N4zoCUP1lCKNxWy/3iu8mS8MvToGd6w==} engines: 
{node: '>=12.22.0'} @@ -3408,6 +3424,8 @@ snapshots: dependencies: '@octokit/openapi-types': 27.0.0 + '@opencode-ai/sdk@1.2.1': {} + '@pnpm/config.env-replace@1.1.0': {} '@pnpm/network.ca-file@1.0.2': diff --git a/src/agents/opencode-sdk.mjs b/src/agents/opencode-sdk.mjs new file mode 100644 index 0000000..a2bbe3e --- /dev/null +++ b/src/agents/opencode-sdk.mjs @@ -0,0 +1,8 @@ +/** + * ESM wrapper for @opencode-ai/sdk + * This file is ESM and can properly import the SDK which is ESM-only + */ + +import { createOpencodeClient } from '@opencode-ai/sdk'; + +export { createOpencodeClient }; \ No newline at end of file diff --git a/src/agents/opencode-sdk.mjs.d.ts b/src/agents/opencode-sdk.mjs.d.ts new file mode 100644 index 0000000..f61c7aa --- /dev/null +++ b/src/agents/opencode-sdk.mjs.d.ts @@ -0,0 +1,7 @@ +/** + * Type declarations for opencode-sdk.mjs wrapper + */ + +declare const createOpencodeClient: any; + +export { createOpencodeClient }; diff --git a/src/agents/opencode.ts b/src/agents/opencode.ts new file mode 100644 index 0000000..eb7d89e --- /dev/null +++ b/src/agents/opencode.ts @@ -0,0 +1,426 @@ +/** + * Opencode agent wrapper using SDK + * + * Uses @opencode-ai/sdk for programmatic interaction with opencode. + * Spawns the opencode server with the correct working directory so + * the agent operates on the test case files. 
/**
 * Spawn an opencode server process with the given working directory.
 *
 * The server is started as `<command> serve` bound to 127.0.0.1 on the next
 * value of the module-level `nextPort` counter, and receives `config` as JSON
 * through the `OPENCODE_CONFIG_CONTENT` environment variable. The returned
 * promise settles once the server prints a complete
 * "opencode server listening on <url>" line on stdout.
 *
 * @param cwd - Working directory for the server process.
 * @param config - Opencode configuration, serialized into the environment.
 * @param timeoutMs - How long to wait for the listening line before the process is killed.
 * @param command - Executable to launch; defaults to `opencode` resolved via PATH.
 * @returns The URL announced by the server and the child process. The caller owns
 *   the process and is responsible for killing it.
 * @throws If the process cannot be spawned, exits before announcing a URL, or
 *   does not announce one within `timeoutMs`.
 */
async function spawnServer(
  cwd: string,
  config: Record<string, unknown>,
  timeoutMs: number,
  command: string = 'opencode',
): Promise<{ url: string; proc: ChildProcess }> {
  const port = nextPort++;
  const proc = spawn(command, ['serve', '--hostname=127.0.0.1', `--port=${port}`], {
    cwd,
    env: {
      ...process.env,
      OPENCODE_CONFIG_CONTENT: JSON.stringify(config),
    },
  });

  const url = await new Promise<string>((resolve, reject) => {
    // Everything the server printed so far (stdout + stderr, arrival order);
    // only used to make the "exited early" error message diagnosable.
    let output = '';
    // Trailing stdout text that has not been terminated by a newline yet.
    let partialLine = '';

    function onStdout(chunk: Buffer): void {
      const text = chunk.toString();
      output += text;
      partialLine += text;

      // Only inspect complete lines: a chunk boundary in the middle of the
      // URL would otherwise resolve with a truncated address.
      const lines = partialLine.split('\n');
      partialLine = lines.pop() ?? '';

      for (const line of lines) {
        if (!line.startsWith('opencode server listening')) continue;
        const match = line.match(/on\s+(https?:\/\/[^\s]+)/);
        if (match) {
          stopCapturing();
          resolve(match[1]);
          return;
        }
      }
    }

    function onStderr(chunk: Buffer): void {
      output += chunk.toString();
    }

    /**
     * Stop buffering server output once startup has been decided, so a
     * long-running server does not grow `output` without bound. The streams
     * are kept flowing so the child never blocks on a full pipe. The `exit`
     * and `error` listeners stay attached on purpose: they are no-ops after
     * settlement, and an `error` event without a listener would throw.
     */
    function stopCapturing(): void {
      clearTimeout(timer);
      proc.stdout?.off('data', onStdout);
      proc.stderr?.off('data', onStderr);
      proc.stdout?.resume();
      proc.stderr?.resume();
    }

    const timer = setTimeout(() => {
      stopCapturing();
      proc.kill();
      reject(new Error(`Timeout waiting for opencode server after ${timeoutMs}ms`));
    }, timeoutMs);

    proc.stdout?.on('data', onStdout);
    proc.stderr?.on('data', onStderr);
    proc.on('exit', (code) => {
      stopCapturing();
      reject(new Error(`Server exited with code ${code}: ${output}`));
    });
    proc.on('error', (err) => {
      stopCapturing();
      reject(err);
    });
  });

  return { url, proc };
}
name = 'opencode'; + displayName = 'Opencode'; + + private cliPath: string; + private config: Record; + + constructor(cliPath: string = 'opencode', config?: Record) { + this.cliPath = cliPath; + this.config = config || { + model: 'local-glm/glm-4.7-local-4bit', + provider: { + 'local-glm': { + api: 'openai', + options: { + baseURL: 'http://127.0.0.1:8081/v1', + apiKey: 'local-glm-key', + }, + models: { + 'glm-4.7-local-4bit': { + name: 'GLM-4.7 Local (4-bit)', + id: '/Users/studio/models/GLM-4.7-4bit', + reasoning: false, + tool_call: true, + temperature: true, + limit: { context: 32768, output: 4096 }, + cost: { input: 0, output: 0 }, + modalities: { input: ['text'], output: ['text'] }, + }, + }, + }, + }, + }; + } + + async isAvailable(): Promise { + try { + const version = await this.getVersion(); + return version !== null; + } catch { + return false; + } + } + + async getVersion(): Promise { + return new Promise((resolve) => { + const proc = spawn(this.cliPath, ['--version'], { timeout: 5000 }); + let stdout = ''; + proc.stdout?.on('data', (data: Buffer) => { + stdout += data.toString(); + }); + proc.on('close', (code: number | null) => { + resolve(code === 0 && stdout.trim() ? stdout.trim() : null); + }); + proc.on('error', () => resolve(null)); + }); + } + + async run(prompt: string, options: AgentRunOptions): Promise { + const runStartTime = Date.now(); + const timeoutMs = options.timeoutMs || 300000; + const toolCalls: ToolCall[] = []; + let model = 'unknown'; + let sessionId = ''; + let serverProc: ChildProcess | null = null; + + try { + // Spawn server in the case's working directory + const cwd = options.cwd || process.cwd(); + const config = options.model + ? 
{ ...this.config, model: options.model } + : this.config; + const { url, proc } = await spawnServer(cwd, config, 15000); + serverProc = proc; + + const createClient = await loadSDK(); + const client = createClient({ baseUrl: url }); + + const createResult = await client.session.create({}); + if (createResult.error) { + throw new Error(`Failed to create session: ${JSON.stringify(createResult.error)}`); + } + + const session = createResult.data; + sessionId = session.id; + model = options.model || session.version || 'unknown'; + + options.onEvent?.({ type: 'start', timestamp: runStartTime, model }); + + // Subscribe to SSE events BEFORE sending the prompt so we capture everything + // event.subscribe() returns ServerSentEventsResult directly (not { data, error }) + const sseResult = await client.event.subscribe({}) as any; + const stream: AsyncIterable | undefined = + sseResult?.stream || sseResult?.data?.stream || sseResult?.data; + + if (!stream) { + throw new Error( + `Event stream not available — subscribe() returned: ${JSON.stringify(Object.keys(sseResult || {}))}`, + ); + } + + // Send prompt asynchronously (returns immediately, events stream the progress) + const asyncResult = await client.session.promptAsync({ + path: { id: sessionId }, + body: { + parts: [{ type: 'text', text: prompt }], + }, + }); + + if (asyncResult.error) { + throw new Error(`Prompt failed: ${JSON.stringify(asyncResult.error)}`); + } + + // Process SSE events until the session goes idle or we time out + let answer = ''; + let numTurns = 0; + let totalTokens = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 }; + let totalCost = 0; + const deadline = Date.now() + timeoutMs - 5000; + + for await (const event of stream) { + if (Date.now() > deadline) { + options.onEvent?.({ type: 'status', message: 'Timed out waiting for agent' }); + break; + } + + const eventType = event?.type || event?.event; + + if (eventType === 'message.part.updated') { + const props = event.properties || 
event.data; + if (!props) continue; + const part = props.part; + if (!part) continue; + + if (part.type === 'text') { + // Streaming text delta + const delta = props.delta || ''; + if (delta) { + answer += delta; + options.onEvent?.({ type: 'text_delta', text: delta }); + } + } else if (part.type === 'tool') { + const status = part.state?.status; + const callID = part.callID || part.callId; + const toolName = part.tool || 'unknown'; + + if (status === 'running' || status === 'pending') { + // Only add if not already tracked + if (!toolCalls.find((t) => t.id === callID)) { + const toolCall: ToolCall = { + id: callID, + name: toolName, + input: part.state?.input || {}, + timestamp: Date.now(), + }; + toolCalls.push(toolCall); + options.onEvent?.({ type: 'tool_start', tool: toolCall }); + options.onEvent?.({ type: 'status', message: `Tool: ${toolName}` }); + } + } else if (status === 'completed') { + const existing = toolCalls.find((t) => t.id === callID); + if (existing) { + existing.durationMs = part.state?.time + ? (part.state.time.end - part.state.time.start) * 1000 + : Date.now() - existing.timestamp; + existing.success = true; + existing.result = part.state?.output + ? String(part.state.output).substring(0, 500) + : undefined; + } else { + // Tool completed without a prior start event (can happen if subscription started late) + toolCalls.push({ + id: callID, + name: toolName, + input: part.state?.input || {}, + timestamp: Date.now(), + durationMs: part.state?.time + ? (part.state.time.end - part.state.time.start) * 1000 + : 0, + success: true, + result: part.state?.output + ? 
String(part.state.output).substring(0, 500) + : undefined, + }); + } + options.onEvent?.({ + type: 'tool_end', + toolId: callID, + success: true, + durationMs: toolCalls.find((t) => t.id === callID)?.durationMs || 0, + }); + } else if (status === 'error') { + const existing = toolCalls.find((t) => t.id === callID); + if (existing) { + existing.success = false; + existing.durationMs = Date.now() - existing.timestamp; + } + options.onEvent?.({ + type: 'tool_end', + toolId: callID, + success: false, + durationMs: existing?.durationMs || 0, + }); + } + } else if (part.type === 'reasoning') { + const text = props.delta || part.text || ''; + if (text) { + options.onEvent?.({ type: 'thinking', text }); + } + } else if (part.type === 'step-finish') { + numTurns++; + // Accumulate per-step tokens/cost + if (part.tokens) { + totalTokens.input += part.tokens.input || 0; + totalTokens.output += part.tokens.output || 0; + totalTokens.cacheRead += part.tokens.cache?.read || 0; + totalTokens.cacheWrite += part.tokens.cache?.write || 0; + totalTokens.total += part.tokens.total || 0; + } + if (part.cost) { + totalCost += part.cost; + } + } + } else if (eventType === 'message.updated') { + // A full message update — extract final info from here + const props = event.properties || event.data; + const info = props?.info; + if (info?.providerID && info?.modelID) { + model = `${info.providerID}/${info.modelID}`; + } + // Use message-level tokens as authoritative total if available + if (info?.tokens?.total) { + totalTokens = { + input: info.tokens.input || totalTokens.input, + output: info.tokens.output || totalTokens.output, + cacheRead: info.tokens.cache?.read || totalTokens.cacheRead, + cacheWrite: info.tokens.cache?.write || totalTokens.cacheWrite, + total: info.tokens.total, + }; + } + if (info?.cost !== undefined) { + totalCost = info.cost; + } + // Extract final answer text from message parts if we haven't captured it via deltas + if (props?.parts && !answer) { + for (const p of 
props.parts) { + if (p.type === 'text' && p.text) { + answer += p.text; + } + } + } + } else if (eventType === 'session.status') { + const props = event.properties || event.data; + const status = props?.status; + if (status?.type === 'idle') { + // Agent finished processing + options.onEvent?.({ type: 'status', message: 'Session idle — agent finished' }); + break; + } else if (status?.type === 'busy') { + options.onEvent?.({ type: 'status', message: 'Agent working...' }); + } else if (status?.type === 'retry') { + options.onEvent?.({ + type: 'status', + message: `Retrying (attempt ${status.attempt}): ${status.message}`, + }); + } + } else if (eventType === 'session.error') { + const props = event.properties || event.data; + const errMsg = props?.error?.message || JSON.stringify(props?.error) || 'Unknown error'; + options.onEvent?.({ type: 'error', message: errMsg, code: 'SESSION_ERROR' }); + } + } + + // If answer is still empty, fetch the final messages from the session + if (!answer) { + const messagesResult = await client.session.messages({ + path: { id: sessionId }, + }); + if (messagesResult.data) { + const messages = messagesResult.data as any[]; + // Find the last assistant message + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === 'assistant' && msg.parts) { + for (const p of msg.parts) { + if (p.type === 'text' && p.text) { + answer += p.text; + } + } + break; + } + } + } + } + + const result: AgentResult = { + answer, + success: true, + timedOut: Date.now() > deadline, + durationMs: Date.now() - runStartTime, + tokens: { + inputTokens: totalTokens.input, + outputTokens: totalTokens.output, + cacheReadTokens: totalTokens.cacheRead, + cacheWriteTokens: totalTokens.cacheWrite, + totalTokens: totalTokens.total, + }, + costUsd: totalCost, + numTurns: numTurns || 1, + toolCalls, + toolsUsed: [...new Set(toolCalls.map((t) => t.name))], + model, + raw: { sessionId }, + }; + + options.onEvent?.({ type: 'complete', 
/**
 * Factory for the opencode agent wrapper.
 *
 * @param cliPath Optional value forwarded unchanged to the OpencodeAgent
 *   constructor; when omitted the constructor's own default applies.
 * @returns A newly constructed OpencodeAgent.
 */
export function createOpencodeAgent(cliPath?: string): OpencodeAgent {
  const agent = new OpencodeAgent(cliPath);
  return agent;
}
/**
 * Get the default cases directory for a project.
 *
 * Returns the first (highest-priority) directory reported by
 * getDefaultCasesDirs(). When none of the candidate directories exist, falls
 * back to the built-in cases path so the result is always a string; the
 * returned path is not guaranteed to exist in that case.
 *
 * @param projectRoot Project root to look under (defaults to process.cwd()).
 * @returns Path of the preferred cases directory.
 * @deprecated Use getDefaultCasesDirs() instead — returns all case directories
 */
export function getDefaultCasesDir(projectRoot: string = process.cwd()): string {
  const [preferred] = getDefaultCasesDirs(projectRoot);
  // An empty list would otherwise leak `undefined` through a `string` return type.
  return preferred ?? path.join(__dirname, '..', '..', 'cases');
}

/**
 * Get all cases directories (project-specific + built-in).
 *
 * Project-specific cases (.sniffbench/cases) come first so they take
 * precedence over built-in cases with the same ID. Directories that do not
 * exist on disk are left out, so the result may be empty.
 *
 * @param projectRoot Project root to look under (defaults to process.cwd()).
 * @returns Existing case directories, highest priority first.
 */
export function getDefaultCasesDirs(projectRoot: string = process.cwd()): string[] {
  const candidates = [
    // Project-specific cases (first = higher priority for dedup)
    path.join(projectRoot, '.sniffbench', 'cases'),
    // Built-in cases shipped with sniffbench
    path.join(__dirname, '..', '..', 'cases'),
  ];
  return candidates.filter((dir) => fs.existsSync(dir));
}
+ agentResponse?: string; + + /** Tool calls the agent made */ + agentToolCalls?: { name: string; durationMs?: number; success?: boolean }[]; + + /** Model used */ + agentModel?: string; + + /** Token usage */ + agentTokens?: { input: number; output: number; total: number }; + + /** Files produced by the agent (snapshot of workspace after agent runs) */ + agentFiles?: { path: string; content: string; changed: boolean }[]; + /** Total duration in milliseconds */ durationMs: number; diff --git a/src/cli/commands/cases.ts b/src/cli/commands/cases.ts index f67e1e4..93a4b4d 100644 --- a/src/cli/commands/cases.ts +++ b/src/cli/commands/cases.ts @@ -8,7 +8,7 @@ import { spawn } from 'child_process'; import { box } from '../../utils/ui'; import { loadCases, - getDefaultCasesDir, + getDefaultCasesDirs, listCategories, listLanguages, getCaseById, @@ -37,9 +37,9 @@ export async function casesListCommand(options: CasesListOptions) { const spinner = ora('Loading cases...').start(); try { - const casesDir = getDefaultCasesDir(); + const casesDirs = getDefaultCasesDirs(); - const cases = await loadCases(casesDir, { + const cases = await loadCases(casesDirs, { category: options.category, language: options.language, difficulty: options.difficulty as CaseDifficulty | undefined, @@ -57,7 +57,7 @@ export async function casesListCommand(options: CasesListOptions) { if (cases.length === 0) { console.log(chalk.yellow('No cases found matching the criteria.')); console.log(chalk.dim('\nTip: Try running without filters, or add cases to:')); - console.log(chalk.cyan(` ${casesDir}`)); + console.log(chalk.cyan(` ${casesDirs.join(' or ')}`)); return; } @@ -97,8 +97,8 @@ export async function casesShowCommand(options: CasesShowOptions) { const spinner = ora('Loading case...').start(); try { - const casesDir = getDefaultCasesDir(); - const caseData = await getCaseById(casesDir, options.id); + const casesDirs = getDefaultCasesDirs(); + const caseData = await getCaseById(casesDirs, options.id); 
spinner.stop(); @@ -141,8 +141,8 @@ export async function casesCategoriesCommand() { const spinner = ora('Loading categories...').start(); try { - const casesDir = getDefaultCasesDir(); - const categories = await listCategories(casesDir); + const casesDirs = getDefaultCasesDirs(); + const categories = await listCategories(casesDirs); spinner.stop(); @@ -169,8 +169,8 @@ export async function casesLanguagesCommand() { const spinner = ora('Loading languages...').start(); try { - const casesDir = getDefaultCasesDir(); - const languages = await listLanguages(casesDir); + const casesDirs = getDefaultCasesDirs(); + const languages = await listLanguages(casesDirs); spinner.stop(); diff --git a/src/cli/commands/closed-issues.ts b/src/cli/commands/closed-issues.ts index c67b220..efdfe15 100644 --- a/src/cli/commands/closed-issues.ts +++ b/src/cli/commands/closed-issues.ts @@ -44,7 +44,7 @@ import { ClosedIssueCaseRun, Run, } from '../../runs'; -import { getAgent } from '../../agents'; +import { getAgent, DEFAULT_AGENT } from '../../agents'; // ============================================================================= // Command Interfaces @@ -71,6 +71,8 @@ interface ListCommandOptions { interface RunCommandOptions { case?: string; + agent?: string; + model?: string; variant?: string; local?: boolean; timeout?: string; @@ -431,6 +433,8 @@ export async function closedIssuesRunCommand(options: RunCommandOptions) { const result = await runClosedIssueCase({ caseData: c, + agent: options.agent, + model: options.model, variant, projectRoot: process.cwd(), timeoutMs, @@ -466,7 +470,7 @@ export async function closedIssuesRunCommand(options: RunCommandOptions) { } // Save run to store - const runId = await saveClosedIssuesRun(projectRoot, results, variant, options.run); + const runId = await saveClosedIssuesRun(projectRoot, results, variant, options.run, options.agent); // Output JSON if requested if (options.json) { @@ -566,10 +570,11 @@ async function saveClosedIssuesRun( 
projectRoot: string, results: RunCaseResult[], variant: Variant | undefined, - label?: string + label?: string, + agentName?: string ): Promise { // Capture agent config - const agent = getAgent('claude-code'); + const agent = getAgent(agentName || DEFAULT_AGENT); const agentConfig = await capturePartialAgentConfig(agent, projectRoot); // Link to variant if used diff --git a/src/cli/commands/interview.ts b/src/cli/commands/interview.ts index bddcc38..cc6e12d 100644 --- a/src/cli/commands/interview.ts +++ b/src/cli/commands/interview.ts @@ -13,7 +13,7 @@ import * as fs from 'fs'; import * as path from 'path'; import * as readline from 'readline'; import { box } from '../../utils/ui'; -import { loadCases, getDefaultCasesDir } from '../../cases'; +import { loadCases, getDefaultCasesDirs } from '../../cases'; import { Case } from '../../cases/types'; import { getAgent, AgentWrapper, AgentResult, AgentEvent } from '../../agents'; import { computeBehaviorMetrics, formatBehaviorMetrics } from '../../metrics'; @@ -1003,9 +1003,9 @@ export async function interviewCommand(options: InterviewOptions) { // Load comprehension cases spinner.start('Loading comprehension cases...'); - const casesDir = getDefaultCasesDir(); + const casesDirs = getDefaultCasesDirs(); - const cases = await loadCases(casesDir, { + const cases = await loadCases(casesDirs, { category: 'comprehension', ids: options.cases?.split(',').map(c => c.trim()), }); @@ -1013,7 +1013,7 @@ export async function interviewCommand(options: InterviewOptions) { if (cases.length === 0) { spinner.warn('No comprehension cases found'); console.log(chalk.yellow('\nMake sure comprehension cases exist in:')); - console.log(chalk.cyan(` ${casesDir}/comprehension/`)); + console.log(chalk.cyan(` ${casesDirs.join(' or ')}`)); return; } diff --git a/src/cli/commands/run.ts b/src/cli/commands/run.ts index 498bb74..7921767 100644 --- a/src/cli/commands/run.ts +++ b/src/cli/commands/run.ts @@ -3,7 +3,7 @@ import ora from 'ora'; import * 
as fs from 'fs'; import * as path from 'path'; import { box } from '../../utils/ui'; -import { loadCases, getDefaultCasesDir } from '../../cases'; +import { loadCases, getDefaultCasesDirs } from '../../cases'; import { CaseResult } from '../../cases/types'; import { runCases, ProgressUpdate } from '../../evaluation'; import { checkDocker } from '../../sandbox'; @@ -14,6 +14,7 @@ interface RunOptions { output: string; timeout?: number; network?: boolean; + model?: string; } export async function runCommand(options: RunOptions) { @@ -35,12 +36,12 @@ export async function runCommand(options: RunOptions) { // Load cases spinner.start('Loading test cases...'); - const casesDir = getDefaultCasesDir(); + const casesDirs = getDefaultCasesDirs(); // Parse case filter if provided const caseIds = options.cases?.split(',').map((c) => c.trim()); - const cases = await loadCases(casesDir, { + const cases = await loadCases(casesDirs, { ids: caseIds, }); @@ -48,7 +49,7 @@ export async function runCommand(options: RunOptions) { spinner.warn('No test cases found'); console.log( chalk.yellow('\nTo add test cases, create YAML files in:\n') + - chalk.cyan(` ${casesDir}\n\n`) + + chalk.cyan(` ${casesDirs.join(' or ')}\n\n`) + chalk.dim('See cases/bootstrap/example-case-spec.yaml for format.') ); return; @@ -86,7 +87,7 @@ export async function runCommand(options: RunOptions) { const onCaseComplete = (result: CaseResult) => { if (currentSpinner) { - const scorePercent = Math.round(result.score * 100); + const scorePercent = Math.round(result.score); if (result.passed) { currentSpinner.succeed(`${result.caseId}: ${chalk.green('PASSED')} (${scorePercent}%, ${formatDuration(result.durationMs)})`); } else if (result.timedOut) { @@ -103,6 +104,7 @@ export async function runCommand(options: RunOptions) { try { const result = await runCases(cases, { agent: options.agent, + model: options.model, timeoutSeconds: options.timeout || 300, networkEnabled: options.network || false, onProgress, @@ -111,7 
+113,7 @@ export async function runCommand(options: RunOptions) { // Display summary console.log(''); - const averageScorePercent = Math.round(result.summary.averageScore * 100); + const averageScorePercent = Math.round(result.summary.averageScore); const summaryLines = [ chalk.bold('Run Summary\n'), `Run ID: ${chalk.cyan(result.runId)}`, diff --git a/src/cli/commands/variant.ts b/src/cli/commands/variant.ts index 95056cb..3b545f5 100644 --- a/src/cli/commands/variant.ts +++ b/src/cli/commands/variant.ts @@ -18,7 +18,7 @@ import { hashAgentConfig, Variant, } from '../../variants'; -import { getAgent } from '../../agents'; +import { getAgent, DEFAULT_AGENT } from '../../agents'; import { buildVariantImage, variantImageExists, @@ -90,8 +90,7 @@ export async function variantRegisterCommand( console.log(chalk.dim(` Replacing existing variant "${name}"...`)); } - // Get the agent (defaults to claude-code) - const agentName = options.agent || 'claude-code'; + const agentName = options.agent || DEFAULT_AGENT; const agent = getAgent(agentName); // Capture current ambient config with full MCP details diff --git a/src/cli/index.ts b/src/cli/index.ts index 466c33e..1e98886 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -42,6 +42,7 @@ import { closedIssuesRunCommand, closedIssuesCompareCommand, } from './commands/closed-issues'; +import { DEFAULT_AGENT } from '../agents/registry'; const program = new Command(); @@ -59,11 +60,12 @@ program program .command('run') .description('Run evaluation suite on specified agent') - .option('--agent ', 'Agent to evaluate (claude-code, cursor, aider)', 'claude-code') + .option('--agent ', 'Agent to evaluate (claude-code, opencode, cursor, aider)', DEFAULT_AGENT) .option('--cases ', 'Specific test cases to run (comma-separated)') .option('--output ', 'Output directory for results', 'results') .option('--timeout ', 'Timeout per case in seconds', '300') .option('--network', 'Enable network access in sandbox (disabled by default)') + 
.option('--model ', 'Model to use (agent-specific, e.g. local-glm/glm-4.7-local-4bit)') .action((opts) => runCommand({ ...opts, timeout: parseInt(opts.timeout, 10) })); program @@ -136,7 +138,7 @@ program program .command('interview') .description('Run comprehension interview to test agent understanding') - .option('--agent ', 'Agent to evaluate', 'claude-code') + .option('--agent ', 'Agent to evaluate', DEFAULT_AGENT) .option('--cases ', 'Specific case IDs to run (comma-separated)') .option('--output ', 'Output directory for results', 'results') .option('--compare', 'Compare new responses against existing baselines') @@ -184,7 +186,7 @@ variantCmd .argument('', 'Variant name (e.g., "control", "with-linear-mcp")') .option('-d, --description ', 'Description of the variant') .option('-c, --changes ', 'List of explicit changes in this variant') - .option('-a, --agent ', 'Agent type to capture config for', 'claude-code') + .option('-a, --agent ', 'Agent type to capture config for', DEFAULT_AGENT) .option('-b, --build', 'Build container image after registration') .option('-f, --force', 'Overwrite existing variant with same name') .action((name, opts) => variantRegisterCommand(name, opts)); @@ -315,6 +317,8 @@ closedIssuesCmd .command('run') .description('Run agent on closed-issue cases and compare to reference solutions') .option('-c, --case ', 'Specific case ID to run') + .option('--agent ', 'Agent to evaluate', DEFAULT_AGENT) + .option('--model ', 'Model to use (agent-specific)') .option('--variant ', 'Use a specific variant container (default: active variant)') .option('--local', 'Run with local claude command instead of variant container') .option('-t, --timeout ', 'Timeout per case in seconds', '600') diff --git a/src/closed-issues/runner.ts b/src/closed-issues/runner.ts index f3a3e12..b124c7a 100644 --- a/src/closed-issues/runner.ts +++ b/src/closed-issues/runner.ts @@ -5,7 +5,7 @@ * to the reference PR that originally closed the issue. 
*/ -import { execSync, spawn } from 'child_process'; +import { execSync } from 'child_process'; import * as fs from 'fs'; import * as path from 'path'; import * as os from 'os'; @@ -19,6 +19,7 @@ import { Variant } from '../variants/types'; import { runInVariant, RunOptions, VariantRunResult } from '../sandbox/variant-runner'; import { collectRequiredEnvVars } from '../sandbox/variant-container'; import { checkMissingEnvVars, getEnvVars, getEnvFilePath } from '../utils/env'; +import { getAgent, DEFAULT_AGENT } from '../agents/registry'; // ============================================================================= // Types @@ -28,6 +29,12 @@ export interface RunCaseOptions { /** The closed issue case to run */ caseData: ClosedIssueCase; + /** Agent name to use (default: from DEFAULT_AGENT) */ + agent?: string; + + /** Model to use (agent-specific) */ + model?: string; + /** Optional variant to use (runs in container) */ variant?: Variant; @@ -102,6 +109,8 @@ const DEFAULT_TIMEOUT_MS = 10 * 60 * 1000; export async function runClosedIssueCase(options: RunCaseOptions): Promise { const { caseData, + agent: agentName = DEFAULT_AGENT, + model, variant, projectRoot = process.cwd(), timeoutMs = DEFAULT_TIMEOUT_MS, @@ -163,19 +172,36 @@ export async function runClosedIssueCase(options: RunCaseOptions): Promise { + if (event.type === 'text_delta' && onOutput) { + onOutput('stdout', event.text); + } else if (event.type === 'status' && onOutput) { + onOutput('stderr', event.message + '\n'); + } + } : undefined, }); - agentOutput = result.output; + agentOutput = agentResult.answer; + if (agentResult.tokens) { + tokens = { + inputTokens: agentResult.tokens.inputTokens, + outputTokens: agentResult.tokens.outputTokens, + cacheReadTokens: agentResult.tokens.cacheReadTokens, + cacheWriteTokens: agentResult.tokens.cacheWriteTokens, + totalTokens: agentResult.tokens.totalTokens, + }; + } + costUsd = agentResult.costUsd; - if (!result.success) { - return 
createErrorResult(caseData.id, result.error || 'Agent failed', startTime); + if (!agentResult.success) { + return createErrorResult(caseData.id, agentResult.error || 'Agent failed', startTime); } } @@ -334,79 +360,6 @@ async function runAgentWithVariant(options: { return runInVariant(options.variant, options.prompt, runOptions); } -/** - * Run agent locally using claude command - */ -async function runAgentLocally(options: { - prompt: string; - workdir: string; - timeoutMs: number; - stream?: boolean; - onOutput?: (type: 'stdout' | 'stderr', data: string) => void; -}): Promise<{ success: boolean; output: string; error?: string }> { - return new Promise((resolve) => { - let output = ''; - let stderr = ''; - let timedOut = false; - - const proc = spawn('claude', ['--print', '--dangerously-skip-permissions', options.prompt], { - cwd: options.workdir, - env: { - ...process.env, - // Set HOME to a temp location to avoid polluting user's config - HOME: options.workdir, - }, - }); - - const timeoutId = setTimeout(() => { - timedOut = true; - proc.kill('SIGTERM'); - setTimeout(() => proc.kill('SIGKILL'), 5000); - }, options.timeoutMs); - - proc.stdout?.on('data', (data) => { - const str = data.toString(); - output += str; - if (options.stream && options.onOutput) { - options.onOutput('stdout', str); - } - }); - - proc.stderr?.on('data', (data) => { - const str = data.toString(); - stderr += str; - if (options.stream && options.onOutput) { - options.onOutput('stderr', str); - } - }); - - proc.on('close', (code) => { - clearTimeout(timeoutId); - - if (timedOut) { - resolve({ success: false, output, error: 'Agent timed out' }); - return; - } - - if (code !== 0) { - resolve({ - success: false, - output, - error: `Agent exited with code ${code}: ${stderr}`, - }); - return; - } - - resolve({ success: true, output }); - }); - - proc.on('error', (error) => { - clearTimeout(timeoutId); - resolve({ success: false, output, error: error.message }); - }); - }); -} - /** * Capture the 
agent's changes as a diff */ diff --git a/src/evaluation/runner.ts b/src/evaluation/runner.ts index 31765f8..302c91b 100644 --- a/src/evaluation/runner.ts +++ b/src/evaluation/runner.ts @@ -12,6 +12,7 @@ import * as path from 'path'; import * as os from 'os'; import { Case, + CaseFile, CaseResult, CriterionResult, EvaluatorResult, @@ -22,11 +23,16 @@ import { import { createSandboxManager, checkDocker, RECOMMENDED_IMAGES } from '../sandbox'; import { Sandbox, SandboxConfig } from '../sandbox/types'; import { getRubricRegistry } from '../rubrics/loader'; +import { getAgent } from '../agents/registry'; +import type { AgentResult } from '../agents/types'; export interface RunnerOptions { /** Agent being evaluated (for logging) */ agent: string; + /** Model to use (passed to agent) */ + model?: string; + /** Timeout per case in seconds */ timeoutSeconds?: number; @@ -213,6 +219,30 @@ async function runSingleCase( // Install dependencies if needed await installDependencies(sandbox, caseData.language, options, caseIndex, totalCases, caseData.id); + // Run the agent to attempt to solve the case + options.onProgress?.({ + type: 'running', + caseId: caseData.id, + caseIndex, + totalCases, + message: 'Running agent...', + }); + + const agent = getAgent(options.agent); + const agentResult: AgentResult = await agent.run(caseData.prompt, { + cwd: tempDir, + model: options.model, + timeoutMs: (options.timeoutSeconds || 300) * 1000, + permissionMode: 'acceptEdits', + }); + + if (!agentResult.success) { + throw new Error(`Agent execution failed: ${agentResult.error}`); + } + + // Snapshot files the agent produced (before rubric evaluation) + const agentFiles = snapshotFiles(tempDir, caseData.files); + // Evaluate using the rubric options.onProgress?.({ type: 'validating', @@ -230,11 +260,26 @@ async function runSingleCase( caseId: caseData.id, caseIndex, totalCases, - message: result.passed ? 
`Passed (${(result.score * 100).toFixed(0)}%)` : `Failed (${(result.score * 100).toFixed(0)}%)`, + message: result.passed ? `Passed (${Math.round(result.score)}%)` : `Failed (${Math.round(result.score)}%)`, }); return { ...result, + agentResponse: agentResult.answer, + agentToolCalls: agentResult.toolCalls.map((t) => ({ + name: t.name, + durationMs: t.durationMs, + success: t.success, + })), + agentModel: agentResult.model, + agentTokens: agentResult.tokens + ? { + input: agentResult.tokens.inputTokens, + output: agentResult.tokens.outputTokens, + total: agentResult.tokens.totalTokens, + } + : undefined, + agentFiles, durationMs, timestamp: new Date(), }; @@ -311,18 +356,18 @@ async function evaluateWithRubric( }; } else if (evaluator.type === 'pattern') { // Run pattern evaluator (check for matches in files) - // For now, just pass - full implementation will use grep/find + // Default to fail until fully implemented evalResult = { - passed: true, - score: 1.0, - evidence: 'Pattern check not fully implemented', + passed: false, + score: 0.0, + evidence: 'Pattern check not yet implemented', }; } else { - // Other evaluator types (llm_judge, benchmark, etc.) - placeholder + // Other evaluator types (llm_judge, benchmark, etc.) - not implemented evalResult = { - passed: true, - score: 1.0, - evidence: 'Evaluator type not yet implemented', + passed: false, + score: 0.0, + evidence: `Evaluator type '${evaluator.type}' not yet implemented`, }; } @@ -342,8 +387,10 @@ async function evaluateWithRubric( } // Average score for this criterion - const rawScore = evaluatorCount > 0 ? criterionScore / evaluatorCount : 1.0; - const weightedScore = (rawScore * criterion.weight) / 100; + // If no non-optional evaluators ran, this criterion doesn't participate in scoring + const hasRequiredEvaluators = evaluatorCount > 0; + const rawScore = hasRequiredEvaluators ? criterionScore / evaluatorCount : 0.0; + const weightedScore = hasRequiredEvaluators ? 
/** Directory names that are never descended into when snapshotting a workspace. */
const SNAPSHOT_SKIPPED_DIRS: ReadonlySet<string> = new Set([
  'node_modules',
  '.git',
  '__pycache__',
  '.pytest_cache',
  'venv',
  '.venv',
]);

/** Files larger than this many bytes are left out of the snapshot. */
const SNAPSHOT_MAX_FILE_BYTES = 100_000;

/**
 * Snapshot the text files in the workspace after the agent runs.
 *
 * Walks `tempDir` recursively and returns one entry per regular file, with its
 * path relative to `tempDir` (always '/'-separated), its UTF-8 content, and a
 * `changed` flag. A file is flagged as changed when it was not among
 * `originalFiles` or its content differs from the original.
 *
 * Left out of the snapshot:
 * - anything under the directories in SNAPSHOT_SKIPPED_DIRS,
 * - entries that are not regular files,
 * - files larger than SNAPSHOT_MAX_FILE_BYTES,
 * - files containing a NUL byte (treated as binary; this also excludes
 *   UTF-16 encoded text),
 * - files or directories that cannot be read.
 *
 * Entries are visited in name order so the result is deterministic.
 *
 * NOTE(review): reads straight from the host filesystem, which assumes tempDir
 * is the same directory the sandbox works in — confirm against the caller.
 *
 * @param tempDir Root directory to snapshot.
 * @param originalFiles Original case files; only those with inline `content`
 *   take part in the comparison.
 * @returns Snapshot entries; empty when `tempDir` cannot be read.
 */
function snapshotFiles(
  tempDir: string,
  originalFiles?: CaseFile[]
): { path: string; content: string; changed: boolean }[] {
  const results: { path: string; content: string; changed: boolean }[] = [];

  // Original contents keyed by normalised relative path, so './a.py' and
  // 'a.py' (or back-slashed paths) compare equal to the walked 'a.py'.
  const origMap = new Map<string, string>();
  for (const file of originalFiles ?? []) {
    if (file.content !== undefined) {
      origMap.set(path.posix.normalize(file.path.replace(/\\/g, '/')), file.content);
    }
  }

  const walk = (dir: string, prefix: string): void => {
    let entries: fs.Dirent[];
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch {
      // Unreadable or missing directory: nothing to record.
      return;
    }

    // readdir order is filesystem-dependent; sort for a stable snapshot.
    entries.sort((a, b) => (a.name < b.name ? -1 : a.name > b.name ? 1 : 0));

    for (const entry of entries) {
      const relPath = prefix ? `${prefix}/${entry.name}` : entry.name;
      const fullPath = path.join(dir, entry.name);

      if (entry.isDirectory()) {
        if (!SNAPSHOT_SKIPPED_DIRS.has(entry.name)) {
          walk(fullPath, relPath);
        }
        continue;
      }

      if (!entry.isFile()) continue;

      let raw: Buffer;
      try {
        // Check the size first so oversized files are never loaded into memory.
        if (fs.statSync(fullPath).size > SNAPSHOT_MAX_FILE_BYTES) continue;
        raw = fs.readFileSync(fullPath);
      } catch {
        continue;
      }

      // Decoding as UTF-8 never throws on invalid bytes, so binary content has
      // to be detected explicitly; a NUL byte is the usual heuristic.
      if (raw.includes(0)) continue;

      const content = raw.toString('utf-8');
      // A missing original yields `undefined`, which never equals a string.
      results.push({ path: relPath, content, changed: origMap.get(relPath) !== content });
    }
  };

  walk(tempDir, '');
  return results;
}
2>/dev/null || echo "No test runner found"', + run: 'npm test 2>/dev/null || node *.test.js 2>/dev/null || python *.test.py 2>/dev/null || python *_test.py 2>/dev/null || pytest 2>/dev/null || go test ./... 2>/dev/null || exit 1', partialCredit: true, passThreshold: 1.0, }, @@ -88,7 +88,7 @@ export const defaultRubric: Rubric = { type: 'command', name: 'Reasonable file sizes', // Check no single file is > 1000 lines - run: 'find . -name "*.{js,ts,py}" -exec wc -l {} + 2>/dev/null | awk \'$1 > 1000 {exit 1}\' || true', + run: 'find . \\( -name "*.js" -o -name "*.ts" -o -name "*.py" \\) -exec wc -l {} + 2>/dev/null | awk \'$1 > 1000 {exit 1}\' || true', optional: true, }, ], @@ -114,7 +114,7 @@ export const minimalRubric: Rubric = { { type: 'command', name: 'Tests pass', - run: 'npm test 2>/dev/null || pytest 2>/dev/null || go test ./... 2>/dev/null || exit 1', + run: 'npm test 2>/dev/null || node *.test.js 2>/dev/null || python *.test.py 2>/dev/null || python *_test.py 2>/dev/null || pytest 2>/dev/null || go test ./... 2>/dev/null || exit 1', partialCredit: true, }, ], @@ -138,7 +138,7 @@ export const strictRubric: Rubric = { { type: 'command', name: 'Tests pass', - run: 'npm test || pytest || go test ./...', + run: 'npm test 2>/dev/null || node *.test.js 2>/dev/null || python *.test.py 2>/dev/null || python *_test.py 2>/dev/null || pytest 2>/dev/null || go test ./... 2>/dev/null || exit 1', partialCredit: true, passThreshold: 1.0, }, diff --git a/tsconfig.json b/tsconfig.json index 39562be..a84a8ba 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -7,6 +7,7 @@ "rootDir": "./src", "strict": true, "esModuleInterop": true, + "skipLibCheck": true, "forceConsistentCasingInFileNames": true, "resolveJsonModule": true,