Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
77 commits
Select commit Hold shift + click to select a range
970d681
feat: add autonomous test-runner agent for E2E validation
sonesuke Feb 22, 2026
48d8d43
test(test-runner): add progress tools, specification fixture, and fun…
sonesuke Feb 22, 2026
ad81717
test: refine test strategy to include n-trials, mock interactions, an…
sonesuke Feb 22, 2026
9ec80ac
chore: ignore test runner reports directory
sonesuke Feb 22, 2026
7be39e6
test: refactor prompt.txt to use sub-agent architecture
sonesuke Feb 22, 2026
1938ce2
test: implement true parallel execution and token tracking for sub-ag…
sonesuke Feb 22, 2026
e50cb7a
feat(e2e): improve test runner, evaluation logic and devcontainer setup
sonesuke Feb 22, 2026
9c659bb
test(e2e): add init validation and improve evaluation checks
sonesuke Feb 23, 2026
97e7503
refactor(e2e): reorganize test structure with skill/test-type subfolders
sonesuke Feb 23, 2026
da49185
test(e2e): add three comprehensive test cases for targeting skill
sonesuke Feb 23, 2026
a6271ea
refactor(skills): apply progressive disclosure to targeting skill str…
sonesuke Feb 23, 2026
144a1c5
refactor(skills): apply progressive disclosure to constitution, conce…
sonesuke Feb 23, 2026
6e058f1
test(e2e): fix evaluation jq filters and test prompts for new skills
sonesuke Feb 23, 2026
f1bdf1c
refactor(e2e): convert evaluation files from JSON to TOML
sonesuke Feb 23, 2026
5217246
refactor(e2e): flatten test structure and unify to TOML format
sonesuke Feb 23, 2026
1e05a61
refactor(e2e): change report output directory from e2e/reports to out
sonesuke Feb 23, 2026
81086ad
refactor(test-runner): extract summary generation and remove unused t…
sonesuke Feb 23, 2026
33f0a34
feat(test-runner): add statistics to summary and flatten log file str…
sonesuke Feb 23, 2026
4761a8c
feat(test-runner): add ability to run specific test case
sonesuke Feb 23, 2026
25b8e0c
refactor(test-runner): change from positional args to glob pattern ma…
sonesuke Feb 23, 2026
ceab820
fix(test-runner): correct test-summary.sh to match runner.sh interface
sonesuke Feb 23, 2026
85f7943
fix(test-runner): restore statistics display in test-summary.sh
sonesuke Feb 23, 2026
3e7d9e6
refactor(test-runner): simplify test-summary input using result files
sonesuke Feb 23, 2026
2f5da63
feat(test-runner): organize outputs in skill-specific subdirectories
sonesuke Feb 23, 2026
ae65f85
fix(e2e): add template read check and fix template paths
sonesuke Feb 23, 2026
b391ee5
fix(skills): add MCP server error handling to troubleshooting docs
sonesuke Feb 23, 2026
151a6f3
fix(devcontainer): set CI=1 to enable Chrome in Docker
sonesuke Feb 23, 2026
e127662
test(e2e): add template read checks to targeting tests
sonesuke Feb 23, 2026
2266aa7
test(e2e): add MCP success checks to all relevant test cases
sonesuke Feb 23, 2026
8e17712
fix(devcontainer): configure chrome_args for Docker via config files
sonesuke Feb 23, 2026
4d5ef98
fix(devcontainer): add browser_path to MCP tool configs
sonesuke Feb 23, 2026
53984d2
fix(e2e): fix search_patents_called jq pattern
sonesuke Feb 23, 2026
f42a06a
fix(e2e): use isError field for MCP success checks
sonesuke Feb 23, 2026
6e5e7a0
fix(e2e): fix jq pattern to check isError at correct level
sonesuke Feb 23, 2026
73d461a
fix(e2e): remove any() wrapper from jq invocation
sonesuke Feb 23, 2026
3371413
test: replace jq one-liner with shell script for MCP success checks
sonesuke Feb 23, 2026
2d11649
fix: correct jq patterns for array-level operations in test checks
sonesuke Feb 23, 2026
c119746
fix: correct jq patterns for targeting test cases
sonesuke Feb 23, 2026
46a00ae
fix: correct jq patterns for concept-interview/functional-with-spec
sonesuke Feb 23, 2026
6987790
refactor: consolidate MCP success check scripts into one with --optio…
sonesuke Feb 23, 2026
9735c7d
feat: add cache token breakdown to test reports
sonesuke Feb 23, 2026
c858f20
feat: add timestamp to each log entry for performance analysis
sonesuke Feb 23, 2026
b64fabb
refactor: use jq now() function for timestamp generation
sonesuke Feb 23, 2026
673b004
docs: update constitution instructions for current architecture
sonesuke Feb 23, 2026
7c084f1
feat: add legal-checker skill and update constitution
sonesuke Feb 23, 2026
8492da6
feat: integrate legal-checker skill into analysis workflows
sonesuke Feb 23, 2026
bb8e867
fix: add YAML frontmatter to legal-checker SKILL.md
sonesuke Feb 23, 2026
32115cc
fix: update legal-checker test to check for skill loading message
sonesuke Feb 23, 2026
be02141
refactor: rename all skills to gerund form (-ing)
sonesuke Feb 23, 2026
95fcf58
refactor: decentralize rules from constitution to individual skills
sonesuke Feb 23, 2026
a55cf26
fix: correct skill name references and improve test runner
sonesuke Feb 23, 2026
30e08a8
fix: remove CLI-era syntax and improve assignee verification
sonesuke Feb 23, 2026
9b44ff2
fix: update concept-interviewing test jq patterns
sonesuke Feb 23, 2026
7770d51
fix: remove CLI-era --query syntax from targeting instructions
sonesuke Feb 23, 2026
f2396c7
fix: targeting should not auto-run concept-interviewing
sonesuke Feb 23, 2026
185c23e
fix: update concept-interview to concept-interviewing in targeting
sonesuke Feb 23, 2026
fd1ff14
fix: update targeting test jq patterns
sonesuke Feb 23, 2026
40bd16d
refactor: introduce check-skill-invoked.sh script
sonesuke Feb 23, 2026
e0f252d
refactor: convert all skill invocation checks from jq to script-based
sonesuke Feb 23, 2026
390768f
fix: use grep-based check for skill invocation in JSONL logs
sonesuke Feb 23, 2026
1e28149
fix: use absolute path for test file and improve grep pattern
sonesuke Feb 23, 2026
a8d18f4
fix: correct grep pattern for skill invocation check
sonesuke Feb 23, 2026
02c11b8
fix: correct argument order in check-skill-invoked.sh
sonesuke Feb 23, 2026
ded713c
fix: prepend ./ to script commands in test-check.sh
sonesuke Feb 23, 2026
ecaad0e
fix: use absolute path for test-check.sh and improve SCRIPT_DIR detec…
sonesuke Feb 23, 2026
9a19614
feat: add timeout support to test runner
sonesuke Feb 23, 2026
d6017f3
feat: split targeting test into with-spec and with-data variants
sonesuke Feb 23, 2026
e58d56e
fix: increase targeting test timeout to 600 seconds
sonesuke Feb 23, 2026
b73ca5d
fix: align specification with CSV data and improve prompt
sonesuke Feb 23, 2026
5592d47
fix: change competitors to Google, Microsoft, OpenAI
sonesuke Feb 23, 2026
d1db8e5
refactor: simplify functional-with-data test to only check merge exec…
sonesuke Feb 23, 2026
debd02b
fix: add Target Release Date and Cutoff Date to specification
sonesuke Feb 23, 2026
4291849
refactor: remove merge_executed check, only verify target.jsonl exists
sonesuke Feb 23, 2026
1913351
fix: explicitly instruct to run merge.sh and skip search steps
sonesuke Feb 23, 2026
d4722da
fix: detect existing CSV files and run merge.sh directly
sonesuke Feb 23, 2026
ba17357
fix: explicitly instruct to run merge.sh immediately when CSV files e…
sonesuke Feb 23, 2026
6d646bd
fix: update targeting skill to trigger when CSV files are detected
sonesuke Feb 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,13 @@ RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
RUN curl https://mise.run | sh
ENV PATH="/root/.local/bin:$PATH"

# Install Node.js LTS manually as a fallback, though mise will handle the project version
# Install Node.js LTS manually
RUN curl -fsSL https://deb.nodesource.com/setup_lts.x | bash - \
&& apt-get install -y nodejs \
&& apt-get clean -y && rm -rf /var/lib/apt/lists/*

# Install Chromium and dependencies for browser-based MCP
RUN apt-get update && apt-get install -y \
chromium \
chromium-common \
&& apt-get clean -y && rm -rf /var/lib/apt/lists/*
33 changes: 32 additions & 1 deletion .devcontainer/post-create.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,43 @@ EOF
echo "[Devcontainer Setup] WARNING: mise is not installed."
fi

echo "[Devcontainer Setup] Authenticating claude..."
if [ -n "$Z_AI_API_KEY" ]; then
npx -y @z_ai/coding-helper auth glm_coding_plan_global "$Z_AI_API_KEY"
npx -y @z_ai/coding-helper auth reload claude
fi

echo "[Devcontainer Setup] Installing MCP tools..."
curl -fsSL https://raw.githubusercontent.com/sonesuke/google-patent-cli/main/install.sh | bash
curl -fsSL https://raw.githubusercontent.com/sonesuke/arxiv-cli/main/install.sh | bash

echo "[Devcontainer Setup] Configuring google-patent-cli for Docker..."
mkdir -p ~/.config/google-patent-cli
cat > ~/.config/google-patent-cli/config.toml << 'EOF'
# Chrome browser path
browser_path = "/usr/bin/chromium"

# Chrome arguments for Docker/DevContainer environment
chrome_args = [
"--no-sandbox",
"--disable-setuid-sandbox",
"--disable-gpu"
]
EOF

echo "[Devcontainer Setup] Configuring arxiv-cli for Docker..."
mkdir -p ~/.config/arxiv-cli
cat > ~/.config/arxiv-cli/config.toml << 'EOF'
# Chrome browser path
browser_path = "/usr/bin/chromium"

# Chrome arguments for Docker/DevContainer environment
chrome_args = [
"--no-sandbox",
"--disable-setuid-sandbox",
"--disable-gpu"
]
EOF

echo "[Devcontainer Setup] Complete!"
else
echo "Running in CI environment, skipping development setup..."
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ investigations/
.venv/
__pycache__/
/target/
out/
Cargo.lock
9 changes: 9 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,12 @@ An autonomous daemon that checks for failing GitHub Actions CI checks on open Pu

- **Workflow**: Finds failing PRs → Runs `claude` inside the Dev Container (`devcontainer exec`) → Analyzes the failure (typically using `mise run pre-commit`) → Commits the fix and replies to the PR.
- **Requirements**: Requires Docker, GitHub CLI (`gh`), `devcontainer` CLI, and `jq` installed on the host machine.

### Test-Runner (`agents/test-runner/runner.sh`)

An autonomous daemon that runs End-to-End (E2E) triggering and functional tests for the `patent-kit` skills using **parallel Claude CLI** processes.

- **Architecture**: For each test case × trial, the runner spawns an independent `claude -p` process inside the Dev Container. All trials run concurrently and results are aggregated into a summary report.
- **Workflow**: Reads test cases from `cases/<skill>/<test>.toml` → Launches one parallel `devcontainer exec claude -p` process per trial → Waits for all to complete → Generates summary in `out/<report_id>/summary.md`.
- **Usage**: `bash agents/test-runner/runner.sh [N_TRIALS] [PATTERN]` (default: 1 trial per test case, all tests matching `cases/*/*.toml`).
- **Requirements**: Requires Docker, `devcontainer` CLI, `jq`, and `yq` installed on the host machine.
227 changes: 227 additions & 0 deletions agents/test-runner/runner.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
#!/bin/bash
# agents/test-runner/runner.sh (Host side)
# Parallel Claude CLI test runner.
# Orchestrates test execution: manages processes, collects results, generates reports.
# All display/output is delegated to test-setup.sh and test-check.sh.
#
# Usage: runner.sh <n_trials> [pattern]
#   n_trials: Number of trials per test case (default: 1)
#   pattern: Glob pattern to match test files (default: "cases/*/*.toml")
#   Examples:
#     "cases/*/*.toml" - all tests
#     "cases/c*/*.toml" - skills starting with 'c'
#     "cases/concept-interview/*.toml" - all concept-interview tests
#     "cases/concept-interview/func*.toml" - tests starting with 'func'
#     "cases/concept-interview/functional-with-spec.toml" - specific test

set -o pipefail

# --- Pre-flight Checks ---
# Return 0 if the given command is on PATH, otherwise print an
# installation hint to stderr and return 1.
check_command() {
  local cmd="$1"
  command -v "$cmd" >/dev/null 2>&1 && return 0
  echo "[Error] Required command '$cmd' not found. Please install it." >&2
  return 1
}

# Abort early if any host-side dependency is missing.
for required_cmd in devcontainer jq yq; do
  check_command "$required_cmd" || exit 1
done

# Docker must be reachable before we try to bring up the dev container.
if ! docker info >/dev/null 2>&1; then
  echo "[Error] Docker is not running or accessible. Please start Docker Desktop." >&2
  exit 1
fi

# Configuration: workspace root, trial count, and test-file glob pattern.
WORKSPACE_FOLDER="${WORKSPACE_FOLDER:-$(pwd)}"
N_TRIALS="${1:-1}"
TARGET_PATTERN="${2:-cases/*/*.toml}"

echo "[Host] Ensuring dev container is up for $WORKSPACE_FOLDER..."
devcontainer up --workspace-folder "$WORKSPACE_FOLDER"

# --- Prepare report directory ---
# One timestamped directory per run under out/ (mkdir -p creates out/ too).
REPORT_ID="$(date +%Y%m%d_%H%M%S)"
REPORT_DIR="$WORKSPACE_FOLDER/out/$REPORT_ID"
mkdir -p "$REPORT_DIR"

echo "=================================================="
echo "[Host] Starting Parallel Claude CLI Test-Runner"
echo "[Host] Trials per test case: $N_TRIALS"
echo "[Host] Pattern: $TARGET_PATTERN"
echo "=================================================="

TOTAL_CASES=0
TOTAL_PASS=0
TOTAL_FAIL=0

# Track all log files for summary
declare -a ALL_LOG_FILES=()

# --- Collect test files matching pattern ---
TEST_FILES=()
for TEST_FILE in $TARGET_PATTERN; do
# Skip if no matches
[ -f "$TEST_FILE" ] || continue

# Extract skill and test names from path
# Expected format: cases/<skill>/<test>.toml
TEST_FILE_REL="${TEST_FILE#$WORKSPACE_FOLDER/}"
SKILL_NAME=$(basename "$(dirname "$TEST_FILE_REL")")
TEST_NAME=$(basename "$TEST_FILE" .toml)

TEST_FILES+=("$TEST_FILE")
TEST_SKILLS+=("$SKILL_NAME")
TEST_NAMES+=("$TEST_NAME")
done

# --- Process each test file ---
for IDX in "${!TEST_FILES[@]}"; do
  TEST_FILE="${TEST_FILES[$IDX]}"
  SKILL_NAME="${TEST_SKILLS[$IDX]}"
  TEST_NAME="${TEST_NAMES[$IDX]}"
  TEST_CASE_NAME="${SKILL_NAME}/${TEST_NAME}"
  TOTAL_CASES=$((TOTAL_CASES + 1))

  # Read test configuration (TOML parsed via yq)
  TEST_PROMPT=$(yq eval '.test_prompt' "$TEST_FILE")
  TEST_TIMEOUT=$(yq eval '.timeout // 300' "$TEST_FILE") # Default 300 seconds

  echo ""
  echo "──────────────────────────────────────────────────"
  echo "[Host] Test Case: $TEST_CASE_NAME"
  echo "──────────────────────────────────────────────────"

  # --- Phase 1: Execute N trials in parallel ---
  PIDS=()
  TRIAL_DIRS=()
  TRIAL_START_TIMES=()
  TRIAL_LOG_FILES=()

  # Create skill-specific log directory
  LOG_DIR="$REPORT_DIR/${SKILL_NAME}"
  mkdir -p "$LOG_DIR"

  # Resolve the timeout wrapper once per test case (it is loop-invariant).
  # gtimeout on macOS with GNU coreutils, timeout elsewhere; an empty prefix
  # means "run without a timeout wrapper" (the host-side watchdog below still
  # applies).
  TIMEOUT_PREFIX=()
  TIMEOUT_CMD=$(command -v gtimeout || command -v timeout || echo "")
  if [ -n "$TIMEOUT_CMD" ]; then
    TIMEOUT_PREFIX=("$TIMEOUT_CMD" "${TEST_TIMEOUT}s")
  fi

  # Command run inside the container. "$1"/"$2" are WORK_DIR and the prompt,
  # passed as positional parameters so prompt text cannot inject shell syntax.
  # jq stamps each JSONL record with a wall-clock timestamp.
  REMOTE_CMD='cd "$1" && claude -p \
    --dangerously-skip-permissions \
    --verbose \
    --output-format stream-json \
    --plugin-dir ./plugin \
    -- "$2" < /dev/null | jq -c "(. + {timestamp: now})"'

  for TRIAL in $(seq 1 "$N_TRIALS"); do
    LABEL="${TEST_CASE_NAME}_trial-${TRIAL}"
    LOG_FILE="$LOG_DIR/${TEST_NAME}-${TRIAL}.log"
    WORK_DIR="/tmp/e2e-${LABEL}"
    TRIAL_LOG_FILES+=("$LOG_FILE")
    TRIAL_DIRS+=("$WORK_DIR")
    TRIAL_START_TIMES+=("$(date +%s)")

    # Setup workspace (delegated to test-setup.sh)
    "$(dirname "$0")/tools/test-setup.sh" "$WORKSPACE_FOLDER" "$WORK_DIR" "$TEST_FILE"

    # Launch trial in background, optionally under timeout(1).
    # FIX: the previous version duplicated this whole invocation in an
    # if/else on TIMEOUT_CMD; the empty-array prefix collapses both branches.
    echo "[Host] Launching trial $TRIAL → $LOG_FILE"
    "${TIMEOUT_PREFIX[@]}" devcontainer exec \
      --workspace-folder "$WORKSPACE_FOLDER" \
      bash -c "$REMOTE_CMD" -- "$WORK_DIR" "$TEST_PROMPT" \
      >"$LOG_FILE" 2>&1 &

    PIDS+=($!)
  done

  # Wait for all trials to complete
  echo "[Host] Waiting for ${#PIDS[@]} trial(s) to complete..."
  TRIAL_DURATIONS=()
  for i in "${!PIDS[@]}"; do
    # Host-side watchdog in addition to timeout(1): poll every 5 seconds.
    ELAPSED=0
    while kill -0 "${PIDS[$i]}" 2>/dev/null; do
      if [ "$ELAPSED" -ge "$TEST_TIMEOUT" ]; then
        echo "[Host] ⚠️ Trial $((i + 1)) timeout after ${TEST_TIMEOUT}s, killing..."
        # NOTE(review): SIGKILL gives the claude process no chance to flush
        # output; consider sending TERM first if partial logs matter.
        kill -9 "${PIDS[$i]}" 2>/dev/null
        break
      fi
      sleep 5
      ELAPSED=$((ELAPSED + 5))
    done

    if wait "${PIDS[$i]}" 2>/dev/null; then
      echo "[Host] ✅ Trial $((i + 1)) finished"
    else
      echo "[Host] ⚠️ Trial $((i + 1)) exited with non-zero (may still be valid)"
    fi
    END_TIME=$(date +%s)
    DURATION=$(( END_TIME - TRIAL_START_TIMES[i] ))
    TRIAL_DURATIONS+=("$DURATION")
    echo "[Host] ⏱️ Trial $((i + 1)) took ${DURATION}s"
  done

  # --- Phase 2: Evaluate trials (delegated to test-check.sh) ---
  echo "[Host] Running evaluation..."

  CASE_PASS=true
  RESULT_FILE="$LOG_DIR/${TEST_NAME}.results"
  : > "$RESULT_FILE" # Create/clear result file

  for TRIAL_IDX in $(seq 0 $((N_TRIALS - 1))); do
    TRIAL_NUM=$((TRIAL_IDX + 1))
    WORK_DIR="${TRIAL_DIRS[$TRIAL_IDX]}"
    LOG_FILE="${TRIAL_LOG_FILES[$TRIAL_IDX]}"

    # Run checks using test-check.sh and capture output
    # Convert TEST_FILE to absolute path for test-check.sh (which runs from tools/ directory)
    TEST_FILE_ABSOLUTE=$(cd "$WORKSPACE_FOLDER" && realpath "$TEST_FILE")
    CHECK_SCRIPT="$(realpath "$(dirname "$0")/tools/test-check.sh")"
    CHECK_OUTPUT=$("$CHECK_SCRIPT" "$WORKSPACE_FOLDER" "$TEST_FILE_ABSOLUTE" "$LOG_FILE" "$WORK_DIR" "$TRIAL_NUM" 2>&1)
    CHECK_EXIT_CODE=$?

    # Display output
    echo "$CHECK_OUTPUT"

    # Extract token usage from the "📊 Tokens:" line.
    # FIX: the old `grep … | sed 's/…/\1/' || echo "0"` never fell back to 0:
    # sed exits 0 regardless, and without -n it echoes the whole line when the
    # pattern does not match — so a missing Tokens line produced an empty (or
    # garbage) value that blew up the arithmetic below. Use `sed -n …p` and
    # explicit ${var:-0} defaults instead.
    TOKENS_LINE=$(echo "$CHECK_OUTPUT" | grep -m1 "📊 Tokens:" || true)
    TRIAL_INPUT=$(echo "$TOKENS_LINE" | sed -nE 's/.*in=([0-9]+) .*/\1/p')
    TRIAL_INPUT=${TRIAL_INPUT:-0}
    TRIAL_CACHE_READ=$(echo "$TOKENS_LINE" | sed -nE 's/.*cache=([0-9]+).*/\1/p')
    TRIAL_CACHE_READ=${TRIAL_CACHE_READ:-0}
    TRIAL_TOTAL_INPUT=$((TRIAL_INPUT + TRIAL_CACHE_READ))
    TRIAL_OUTPUT=$(echo "$TOKENS_LINE" | sed -nE 's/.*out=([0-9]+)/\1/p')
    TRIAL_OUTPUT=${TRIAL_OUTPUT:-0}

    # Store trial result for summary (pipe-delimited, one line per trial)
    TRIAL_STATUS="true"
    if [ "$CHECK_EXIT_CODE" -ne 0 ]; then
      CASE_PASS=false
      TRIAL_STATUS="false"
    fi
    echo "${TRIAL_STATUS}|${TRIAL_DURATIONS[$TRIAL_IDX]}|${TRIAL_INPUT}|${TRIAL_CACHE_READ}|${TRIAL_TOTAL_INPUT}|${TRIAL_OUTPUT}" >> "$RESULT_FILE"

    # Display duration
    echo "[Host] ⏱️ Trial $TRIAL_NUM took ${TRIAL_DURATIONS[$TRIAL_IDX]}s"
  done

  # Display case result
  if [ "$CASE_PASS" = true ]; then
    echo "[Host] ✅ $TEST_CASE_NAME: PASS"
    TOTAL_PASS=$((TOTAL_PASS + 1))
  else
    echo "[Host] ❌ $TEST_CASE_NAME: FAIL"
    TOTAL_FAIL=$((TOTAL_FAIL + 1))
  fi
done

# --- Generate and display summary (delegated to test-summary.sh) ---
"$(dirname "$0")/tools/test-summary.sh" "$REPORT_DIR" "$TOTAL_CASES" "$TOTAL_PASS" "$TOTAL_FAIL" "$N_TRIALS"

# Exit with the failure count so callers can see it, clamped to 255:
# FIX: shell exit codes are 8-bit, so e.g. 256 failures would previously wrap
# around to exit status 0 and look like success.
if [ "$TOTAL_FAIL" -gt 255 ]; then
  exit 255
fi
exit "$TOTAL_FAIL"
68 changes: 68 additions & 0 deletions agents/test-runner/tools/check-mcp-success.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/bin/bash
# Check if MCP tool calls succeeded in a log file
# Usage: check-mcp-success.sh <log_file> <mcp_tool_name> [--optional]
#   --optional: If no MCP calls are made, return success (default: fail)
# Returns: 0 if all MCP calls succeeded (or none made with --optional),
#          1 if any failed, 2 on usage error

LOG_FILE="$1"
MCP_TOOL_NAME="$2"
OPTIONAL_FLAG="${3:-}"

if [[ -z "$LOG_FILE" ]] || [[ -z "$MCP_TOOL_NAME" ]]; then
  echo "Usage: $0 <log_file> <mcp_tool_name> [--optional]" >&2
  exit 2
fi

if [[ ! -f "$LOG_FILE" ]]; then
  echo "Log file not found: $LOG_FILE" >&2
  exit 2
fi

# Extract tool_use IDs for the specified MCP tool from assistant messages.
# The log is JSONL — one JSON object per line (runner.sh pipes stream-json
# through `jq -c`), with stderr merged into the same file, so pre-filter to
# lines that look like JSON objects.
# FIX 1: the previous filter started with `.[]`, which iterates the *values*
# of each object — wrong for JSONL, where jq already iterates line by line.
# FIX 2: `select(A and B and (.name? // "") | test(p))` parsed as
# `select((A and B and …) | test(p))` (jq's `|` binds looser than `and`),
# feeding a boolean into test() and producing no IDs at all. Parenthesize the
# test() term and pass the tool name via --arg instead of string splicing.
TOOL_USE_IDS=$(grep '^{' -- "$LOG_FILE" | jq -r --arg name "$MCP_TOOL_NAME" '
  select(.type? == "assistant")
  | (.message.content? // [])
  | select(type == "array")
  | .[]
  | select(type == "object" and .type? == "tool_use" and ((.name? // "") | test($name)))
  | .id
' 2>/dev/null)

# If no MCP calls were made.
# FIX 3: the old count `grep -c "^\w*$"` also matched the single empty line
# that echo produces for an empty variable, so ID_COUNT was never 0 and this
# branch could never fire — the check silently passed everything.
if [[ -z "$TOOL_USE_IDS" ]]; then
  if [[ "$OPTIONAL_FLAG" == "--optional" ]]; then
    # Optional check: return success if no calls were made
    exit 0
  else
    # Required check: return failure if no calls were made
    echo "No $MCP_TOOL_NAME tool calls found in log" >&2
    exit 1
  fi
fi

# Check if any of the corresponding tool_results have is_error: true.
# NOTE(review): this reads `is_error` (snake_case, as in API tool_result
# blocks); an earlier commit mentions `isError` — verify against actual logs.
while IFS= read -r tool_id; do
  [[ -z "$tool_id" ]] && continue
  ERROR_CHECK=$(grep '^{' -- "$LOG_FILE" | jq -r --arg id "$tool_id" '
    select(.type? == "user")
    | (.message.content? // [])
    | select(type == "array")
    | .[]
    | select(type == "object" and .type? == "tool_result" and .tool_use_id? == $id)
    | .is_error // false
  ' 2>/dev/null)

  if [[ "$ERROR_CHECK" == *true* ]]; then
    echo "MCP tool $MCP_TOOL_NAME (tool_use_id: $tool_id) returned an error" >&2
    exit 1
  fi
done <<< "$TOOL_USE_IDS"

# All MCP calls succeeded
exit 0
18 changes: 18 additions & 0 deletions agents/test-runner/tools/check-skill-invoked.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
# check-skill-invoked.sh - Check if a specific skill was invoked
# Usage: check-skill-invoked.sh <skill_name> <log_file> [<mcp_tool>] [<optional_flag>]
# Note: Called from test-check.sh as: $CHECK_CMD "$LOG_FILE" "$MCP_TOOL" "$OPTIONAL_FLAG"
#       where $CHECK_CMD = "check-skill-invoked.sh constitution-reminding"
#       So actual arguments are: $1=skill_name, $2=log_file, $3=mcp_tool, $4=optional_flag
# Exit status: 0 if the skill appears to have been invoked, 1 otherwise.

SKILL_NAME="${1:-}"
LOG_FILE="${2:-}"

if [ -z "$LOG_FILE" ] || [ -z "$SKILL_NAME" ]; then
  echo "[Error] Usage: $0 <skill_name> <log_file>" >&2
  exit 1
fi

# The log is JSONL containing "name":"Skill" tool uses whose input carries
# "skill":"patent-kit:<skill-name>". Two deliberately loose greps:
# a Skill tool use must exist AND the skill name must appear in a "skill"
# field somewhere in the log.
if ! grep -q '"Skill"' "$LOG_FILE"; then
  exit 1
fi
grep -q '"skill":".*'"$SKILL_NAME" "$LOG_FILE"
Loading