From 710e17f9746b1433bb60eb82e86a77056844b625 Mon Sep 17 00:00:00 2001
From: edgarpavlovsky
Date: Fri, 7 Nov 2025 19:12:02 -0700
Subject: [PATCH] Fix terminal-bench integration test in CI

- Fix pyproject.toml: Add [tool.hatch.build.targets.wheel] packages specification
- Enable terminal-bench test in CI workflow with proper conditions
- Add PATH fixes for uv and terminal-bench binaries
- Add timeouts to prevent hanging (20min job, 15min step)
- Add .actrc for local GitHub Actions testing with act
- Add .secrets to .gitignore

The terminal-bench adapter package was failing to build because hatchling
didn't know which files to include. Now it correctly includes the adapters/
directory and the test can run in CI.
---
Review note: added lines in .actrc and test.yml were edited after export
(comment wording, quoting of "$GITHUB_PATH"); the index blob ids below still
refer to the original commit.

 .actrc                     | 12 ++++++++++++
 .github/workflows/test.yml | 24 +++++++++++++++++++-----
 .gitignore                 |  1 +
 benchmark/pyproject.toml   |  3 +++
 4 files changed, 35 insertions(+), 5 deletions(-)
 create mode 100644 .actrc

diff --git a/.actrc b/.actrc
new file mode 100644
index 0000000..36fedd5
--- /dev/null
+++ b/.actrc
@@ -0,0 +1,12 @@
+# Act configuration for running GitHub Actions locally
+# Uses a medium-sized Docker image to support common Ubuntu functionality
+
+# Use medium-sized Ubuntu image (ubuntu-latest equivalent)
+-P ubuntu-latest=catthehacker/ubuntu:act-latest
+
+# Enable verbose output for debugging
+--verbose
+
+# Container architecture (pinned explicitly, not auto-detected)
+--container-architecture linux/amd64
+
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 6d82292..718ea31 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -99,8 +99,12 @@ jobs:
   integration-tests:
     name: Terminal-bench Integration
     runs-on: ubuntu-latest
-    # Temporarily disabled - needs debugging
-    if: false
+    timeout-minutes: 20  # Fail fast if tests hang
+    # Run on main branch and e/* branches for testing
+    if: |
+      github.ref == 'refs/heads/main' ||
+      startsWith(github.ref, 'refs/heads/e/') ||
+      startsWith(github.head_ref, 'e/')
 
     steps:
       - uses: actions/checkout@v4
@@ -114,10 +118,15 @@ jobs:
         uses: docker/setup-buildx-action@v3
 
       - name: Install uv
-        run: curl -LsSf https://astral.sh/uv/install.sh | sh
+        run: |
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
 
       - name: Install terminal-bench
-        run: uv tool install terminal-bench
+        run: |
+          export PATH="$HOME/.local/bin:$PATH"
+          uv tool install terminal-bench
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
 
       - name: Create virtual environment
         run: uv venv
@@ -134,9 +143,14 @@ jobs:
           uv pip install -e .
 
       - name: Run terminal-bench integration test
+        timeout-minutes: 15  # Per-step timeout
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          PYTHONUNBUFFERED: "1"  # Force immediate output
         run: |
           source .venv/bin/activate
-          pytest tests/ -m "integration" -v --tb=short
+          export PATH="$HOME/.local/bin:$PATH"
+          echo "Starting terminal-bench integration tests at $(date)"
+          pytest tests/ -m "integration" -v --tb=short -s --log-cli-level=INFO
+          echo "Terminal-bench tests completed at $(date)"
 
diff --git a/.gitignore b/.gitignore
index 0d1da53..7ccd72a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,3 +37,4 @@ logs/
 
 # Benchmark runs
 runs/
+.secrets
diff --git a/benchmark/pyproject.toml b/benchmark/pyproject.toml
index 2c995ac..0675483 100644
--- a/benchmark/pyproject.toml
+++ b/benchmark/pyproject.toml
@@ -12,6 +12,9 @@ dependencies = [
 requires = ["hatchling"]
 build-backend = "hatchling.build"
 
+[tool.hatch.build.targets.wheel]
+packages = ["adapters"]
+
 [dependency-groups]
 dev = []
 