Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
77 commits
Select commit Hold shift + click to select a range
970d681
feat: add autonomous test-runner agent for E2E validation
sonesuke Feb 22, 2026
48d8d43
test(test-runner): add progress tools, specification fixture, and fun…
sonesuke Feb 22, 2026
ad81717
test: refine test strategy to include n-trials, mock interactions, an…
sonesuke Feb 22, 2026
9ec80ac
chore: ignore test runner reports directory
sonesuke Feb 22, 2026
7be39e6
test: refactor prompt.txt to use sub-agent architecture
sonesuke Feb 22, 2026
1938ce2
test: implement true parallel execution and token tracking for sub-ag…
sonesuke Feb 22, 2026
e50cb7a
feat(e2e): improve test runner, evaluation logic and devcontainer setup
sonesuke Feb 22, 2026
9c659bb
test(e2e): add init validation and improve evaluation checks
sonesuke Feb 23, 2026
97e7503
refactor(e2e): reorganize test structure with skill/test-type subfolders
sonesuke Feb 23, 2026
da49185
test(e2e): add three comprehensive test cases for targeting skill
sonesuke Feb 23, 2026
a6271ea
refactor(skills): apply progressive disclosure to targeting skill str…
sonesuke Feb 23, 2026
144a1c5
refactor(skills): apply progressive disclosure to constitution, conce…
sonesuke Feb 23, 2026
6e058f1
test(e2e): fix evaluation jq filters and test prompts for new skills
sonesuke Feb 23, 2026
f1bdf1c
refactor(e2e): convert evaluation files from JSON to TOML
sonesuke Feb 23, 2026
5217246
refactor(e2e): flatten test structure and unify to TOML format
sonesuke Feb 23, 2026
1e05a61
refactor(e2e): change report output directory from e2e/reports to out
sonesuke Feb 23, 2026
81086ad
refactor(test-runner): extract summary generation and remove unused t…
sonesuke Feb 23, 2026
33f0a34
feat(test-runner): add statistics to summary and flatten log file str…
sonesuke Feb 23, 2026
4761a8c
feat(test-runner): add ability to run specific test case
sonesuke Feb 23, 2026
25b8e0c
refactor(test-runner): change from positional args to glob pattern ma…
sonesuke Feb 23, 2026
ceab820
fix(test-runner): correct test-summary.sh to match runner.sh interface
sonesuke Feb 23, 2026
85f7943
fix(test-runner): restore statistics display in test-summary.sh
sonesuke Feb 23, 2026
3e7d9e6
refactor(test-runner): simplify test-summary input using result files
sonesuke Feb 23, 2026
2f5da63
feat(test-runner): organize outputs in skill-specific subdirectories
sonesuke Feb 23, 2026
ae65f85
fix(e2e): add template read check and fix template paths
sonesuke Feb 23, 2026
b391ee5
fix(skills): add MCP server error handling to troubleshooting docs
sonesuke Feb 23, 2026
151a6f3
fix(devcontainer): set CI=1 to enable Chrome in Docker
sonesuke Feb 23, 2026
e127662
test(e2e): add template read checks to targeting tests
sonesuke Feb 23, 2026
2266aa7
test(e2e): add MCP success checks to all relevant test cases
sonesuke Feb 23, 2026
8e17712
fix(devcontainer): configure chrome_args for Docker via config files
sonesuke Feb 23, 2026
4d5ef98
fix(devcontainer): add browser_path to MCP tool configs
sonesuke Feb 23, 2026
53984d2
fix(e2e): fix search_patents_called jq pattern
sonesuke Feb 23, 2026
f42a06a
fix(e2e): use isError field for MCP success checks
sonesuke Feb 23, 2026
6e5e7a0
fix(e2e): fix jq pattern to check isError at correct level
sonesuke Feb 23, 2026
73d461a
fix(e2e): remove any() wrapper from jq invocation
sonesuke Feb 23, 2026
3371413
test: replace jq one-liner with shell script for MCP success checks
sonesuke Feb 23, 2026
2d11649
fix: correct jq patterns for array-level operations in test checks
sonesuke Feb 23, 2026
c119746
fix: correct jq patterns for targeting test cases
sonesuke Feb 23, 2026
46a00ae
fix: correct jq patterns for concept-interview/functional-with-spec
sonesuke Feb 23, 2026
6987790
refactor: consolidate MCP success check scripts into one with --optio…
sonesuke Feb 23, 2026
9735c7d
feat: add cache token breakdown to test reports
sonesuke Feb 23, 2026
c858f20
feat: add timestamp to each log entry for performance analysis
sonesuke Feb 23, 2026
b64fabb
refactor: use jq now() function for timestamp generation
sonesuke Feb 23, 2026
673b004
docs: update constitution instructions for current architecture
sonesuke Feb 23, 2026
7c084f1
feat: add legal-checker skill and update constitution
sonesuke Feb 23, 2026
8492da6
feat: integrate legal-checker skill into analysis workflows
sonesuke Feb 23, 2026
bb8e867
fix: add YAML frontmatter to legal-checker SKILL.md
sonesuke Feb 23, 2026
32115cc
fix: update legal-checker test to check for skill loading message
sonesuke Feb 23, 2026
be02141
refactor: rename all skills to gerund form (-ing)
sonesuke Feb 23, 2026
95fcf58
refactor: decentralize rules from constitution to individual skills
sonesuke Feb 23, 2026
a55cf26
fix: correct skill name references and improve test runner
sonesuke Feb 23, 2026
30e08a8
fix: remove CLI-era syntax and improve assignee verification
sonesuke Feb 23, 2026
9b44ff2
fix: update concept-interviewing test jq patterns
sonesuke Feb 23, 2026
7770d51
fix: remove CLI-era --query syntax from targeting instructions
sonesuke Feb 23, 2026
f2396c7
fix: targeting should not auto-run concept-interviewing
sonesuke Feb 23, 2026
185c23e
fix: update concept-interview to concept-interviewing in targeting
sonesuke Feb 23, 2026
fd1ff14
fix: update targeting test jq patterns
sonesuke Feb 23, 2026
40bd16d
refactor: introduce check-skill-invoked.sh script
sonesuke Feb 23, 2026
e0f252d
refactor: convert all skill invocation checks from jq to script-based
sonesuke Feb 23, 2026
390768f
fix: use grep-based check for skill invocation in JSONL logs
sonesuke Feb 23, 2026
1e28149
fix: use absolute path for test file and improve grep pattern
sonesuke Feb 23, 2026
a8d18f4
fix: correct grep pattern for skill invocation check
sonesuke Feb 23, 2026
02c11b8
fix: correct argument order in check-skill-invoked.sh
sonesuke Feb 23, 2026
ded713c
fix: prepend ./ to script commands in test-check.sh
sonesuke Feb 23, 2026
ecaad0e
fix: use absolute path for test-check.sh and improve SCRIPT_DIR detec…
sonesuke Feb 23, 2026
9a19614
feat: add timeout support to test runner
sonesuke Feb 23, 2026
d6017f3
feat: split targeting test into with-spec and with-data variants
sonesuke Feb 23, 2026
e58d56e
fix: increase targeting test timeout to 600 seconds
sonesuke Feb 23, 2026
b73ca5d
fix: align specification with CSV data and improve prompt
sonesuke Feb 23, 2026
5592d47
fix: change competitors to Google, Microsoft, OpenAI
sonesuke Feb 23, 2026
d1db8e5
refactor: simplify functional-with-data test to only check merge exec…
sonesuke Feb 23, 2026
debd02b
fix: add Target Release Date and Cutoff Date to specification
sonesuke Feb 23, 2026
4291849
refactor: remove merge_executed check, only verify target.jsonl exists
sonesuke Feb 23, 2026
1913351
fix: explicitly instruct to run merge.sh and skip search steps
sonesuke Feb 23, 2026
d4722da
fix: detect existing CSV files and run merge.sh directly
sonesuke Feb 23, 2026
ba17357
fix: explicitly instruct to run merge.sh immediately when CSV files e…
sonesuke Feb 23, 2026
6d646bd
fix: update targeting skill to trigger when CSV files are detected
sonesuke Feb 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,13 @@ RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
RUN curl https://mise.run | sh
ENV PATH="/root/.local/bin:$PATH"

# Install Node.js LTS manually as a fallback, though mise will handle the project version
# Install Node.js LTS manually
RUN curl -fsSL https://deb.nodesource.com/setup_lts.x | bash - \
&& apt-get install -y nodejs \
&& apt-get clean -y && rm -rf /var/lib/apt/lists/*

# Install Chromium and dependencies for browser-based MCP
RUN apt-get update && apt-get install -y \
chromium \
chromium-common \
&& apt-get clean -y && rm -rf /var/lib/apt/lists/*
33 changes: 32 additions & 1 deletion .devcontainer/post-create.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,43 @@ EOF
echo "[Devcontainer Setup] WARNING: mise is not installed."
fi

echo "[Devcontainer Setup] Authenticating claude..."
if [ -n "$Z_AI_API_KEY" ]; then
npx -y @z_ai/coding-helper auth glm_coding_plan_global "$Z_AI_API_KEY"
npx -y @z_ai/coding-helper auth reload claude
fi

echo "[Devcontainer Setup] Installing MCP tools..."
curl -fsSL https://raw.githubusercontent.com/sonesuke/google-patent-cli/main/install.sh | bash
curl -fsSL https://raw.githubusercontent.com/sonesuke/arxiv-cli/main/install.sh | bash

echo "[Devcontainer Setup] Configuring google-patent-cli for Docker..."
mkdir -p ~/.config/google-patent-cli
cat > ~/.config/google-patent-cli/config.toml << 'EOF'
# Chrome browser path
browser_path = "/usr/bin/chromium"

# Chrome arguments for Docker/DevContainer environment
chrome_args = [
"--no-sandbox",
"--disable-setuid-sandbox",
"--disable-gpu"
]
EOF

echo "[Devcontainer Setup] Configuring arxiv-cli for Docker..."
mkdir -p ~/.config/arxiv-cli
cat > ~/.config/arxiv-cli/config.toml << 'EOF'
# Chrome browser path
browser_path = "/usr/bin/chromium"

# Chrome arguments for Docker/DevContainer environment
chrome_args = [
"--no-sandbox",
"--disable-setuid-sandbox",
"--disable-gpu"
]
EOF

echo "[Devcontainer Setup] Complete!"
else
echo "Running in CI environment, skipping development setup..."
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ investigations/
.venv/
__pycache__/
/target/
out/
Cargo.lock
9 changes: 9 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,12 @@ An autonomous daemon that checks for failing GitHub Actions CI checks on open Pu

- **Workflow**: Finds failing PRs → Runs `claude` inside the Dev Container (`devcontainer exec`) → Analyzes the failure (typically using `mise run pre-commit`) → Commits the fix and replies to the PR.
- **Requirements**: Requires Docker, GitHub CLI (`gh`), `devcontainer` CLI, and `jq` installed on the host machine.

### Test-Runner (`agents/test-runner/runner.sh`)

An autonomous daemon that runs End-to-End (E2E) triggering and functional tests for the `patent-kit` skills using **parallel Claude CLI** processes.

- **Architecture**: For each test case × trial, the runner spawns an independent `claude -p` process inside the Dev Container. All trials run concurrently and results are aggregated into a summary report.
- **Workflow**: Reads test cases from `cases/<skill>/<test>.toml` → Launches one parallel `devcontainer exec claude -p` process per trial → Waits for all to complete → Generates summary in `out/<report_id>/summary.md`.
- **Usage**: `bash agents/test-runner/runner.sh [N_TRIALS] [PATTERN]` (default: 1 trial per test case, all tests matching `cases/*/*.toml`).
- **Requirements**: Requires Docker, `devcontainer` CLI, `jq`, and `yq` installed on the host machine.
227 changes: 227 additions & 0 deletions agents/test-runner/runner.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
#!/bin/bash
# agents/test-runner/runner.sh (Host side)
# Parallel Claude CLI test runner.
# Orchestrates test execution: manages processes, collects results, generates reports.
# All display/output is delegated to test-setup.sh and test-check.sh.
#
# Usage: runner.sh <n_trials> [pattern]
#   n_trials: Number of trials per test case (default: 1)
#   pattern: Glob pattern to match test files (default: "cases/*/*.toml")
#   Examples:
#     "cases/*/*.toml" - all tests
#     "cases/c*/*.toml" - skills starting with 'c'
#     "cases/concept-interview/*.toml" - all concept-interview tests
#     "cases/concept-interview/func*.toml" - tests starting with 'func'
#     "cases/concept-interview/functional-with-spec.toml" - specific test

set -o pipefail

# --- Pre-flight Checks ---
# Return 0 if the given command is on PATH, otherwise print an
# installation hint to stderr and return 1.
check_command() {
  local cmd="$1"
  command -v "$cmd" >/dev/null 2>&1 && return 0
  echo "[Error] Required command '$cmd' not found. Please install it." >&2
  return 1
}

# Abort early if any host-side dependency is missing.
for required_cmd in devcontainer jq yq; do
  check_command "$required_cmd" || exit 1
done

# Docker must be reachable before we try to bring up the dev container.
if ! docker info >/dev/null 2>&1; then
  echo "[Error] Docker is not running or accessible. Please start Docker Desktop." >&2
  exit 1
fi

# Configuration: workspace root, trial count, and test-file glob pattern.
WORKSPACE_FOLDER="${WORKSPACE_FOLDER:-$(pwd)}"
N_TRIALS="${1:-1}"
TARGET_PATTERN="${2:-cases/*/*.toml}"

echo "[Host] Ensuring dev container is up for $WORKSPACE_FOLDER..."
devcontainer up --workspace-folder "$WORKSPACE_FOLDER"

# --- Prepare report directory ---
# One timestamped directory per run under out/ (mkdir -p creates out/ too).
REPORT_ID="$(date +%Y%m%d_%H%M%S)"
REPORT_DIR="$WORKSPACE_FOLDER/out/$REPORT_ID"
mkdir -p "$REPORT_DIR"

echo "=================================================="
echo "[Host] Starting Parallel Claude CLI Test-Runner"
echo "[Host] Trials per test case: $N_TRIALS"
echo "[Host] Pattern: $TARGET_PATTERN"
echo "=================================================="

TOTAL_CASES=0
TOTAL_PASS=0
TOTAL_FAIL=0

# Track all log files for summary
declare -a ALL_LOG_FILES=()

# --- Collect test files matching pattern ---
TEST_FILES=()
for TEST_FILE in $TARGET_PATTERN; do
# Skip if no matches
[ -f "$TEST_FILE" ] || continue

# Extract skill and test names from path
# Expected format: cases/<skill>/<test>.toml
TEST_FILE_REL="${TEST_FILE#$WORKSPACE_FOLDER/}"
SKILL_NAME=$(basename "$(dirname "$TEST_FILE_REL")")
TEST_NAME=$(basename "$TEST_FILE" .toml)

TEST_FILES+=("$TEST_FILE")
TEST_SKILLS+=("$SKILL_NAME")
TEST_NAMES+=("$TEST_NAME")
done

# --- Process each test file ---
for IDX in "${!TEST_FILES[@]}"; do
  TEST_FILE="${TEST_FILES[$IDX]}"
  SKILL_NAME="${TEST_SKILLS[$IDX]}"
  TEST_NAME="${TEST_NAMES[$IDX]}"
  TEST_CASE_NAME="${SKILL_NAME}/${TEST_NAME}"
  TOTAL_CASES=$((TOTAL_CASES + 1))

  # Read test configuration (TOML parsed via yq)
  TEST_PROMPT=$(yq eval '.test_prompt' "$TEST_FILE")
  TEST_TIMEOUT=$(yq eval '.timeout // 300' "$TEST_FILE") # Default 300 seconds

  echo ""
  echo "──────────────────────────────────────────────────"
  echo "[Host] Test Case: $TEST_CASE_NAME"
  echo "──────────────────────────────────────────────────"

  # --- Phase 1: Execute N trials in parallel ---
  PIDS=()
  TRIAL_DIRS=()
  TRIAL_START_TIMES=()
  TRIAL_LOG_FILES=()

  # Create skill-specific log directory
  LOG_DIR="$REPORT_DIR/${SKILL_NAME}"
  mkdir -p "$LOG_DIR"

  # Resolve the timeout wrapper once per test case (it is loop-invariant).
  # gtimeout on macOS with GNU coreutils, timeout elsewhere; an empty prefix
  # means "run without a timeout wrapper" (the host-side watchdog below still
  # applies).
  TIMEOUT_PREFIX=()
  TIMEOUT_CMD=$(command -v gtimeout || command -v timeout || echo "")
  if [ -n "$TIMEOUT_CMD" ]; then
    TIMEOUT_PREFIX=("$TIMEOUT_CMD" "${TEST_TIMEOUT}s")
  fi

  # Command run inside the container. "$1"/"$2" are WORK_DIR and the prompt,
  # passed as positional parameters so prompt text cannot inject shell syntax.
  # jq stamps each JSONL record with a wall-clock timestamp.
  REMOTE_CMD='cd "$1" && claude -p \
    --dangerously-skip-permissions \
    --verbose \
    --output-format stream-json \
    --plugin-dir ./plugin \
    -- "$2" < /dev/null | jq -c "(. + {timestamp: now})"'

  for TRIAL in $(seq 1 "$N_TRIALS"); do
    LABEL="${TEST_CASE_NAME}_trial-${TRIAL}"
    LOG_FILE="$LOG_DIR/${TEST_NAME}-${TRIAL}.log"
    WORK_DIR="/tmp/e2e-${LABEL}"
    TRIAL_LOG_FILES+=("$LOG_FILE")
    TRIAL_DIRS+=("$WORK_DIR")
    TRIAL_START_TIMES+=("$(date +%s)")

    # Setup workspace (delegated to test-setup.sh)
    "$(dirname "$0")/tools/test-setup.sh" "$WORKSPACE_FOLDER" "$WORK_DIR" "$TEST_FILE"

    # Launch trial in background, optionally under timeout(1).
    # FIX: the previous version duplicated this whole invocation in an
    # if/else on TIMEOUT_CMD; the empty-array prefix collapses both branches.
    echo "[Host] Launching trial $TRIAL → $LOG_FILE"
    "${TIMEOUT_PREFIX[@]}" devcontainer exec \
      --workspace-folder "$WORKSPACE_FOLDER" \
      bash -c "$REMOTE_CMD" -- "$WORK_DIR" "$TEST_PROMPT" \
      >"$LOG_FILE" 2>&1 &

    PIDS+=($!)
  done

  # Wait for all trials to complete
  echo "[Host] Waiting for ${#PIDS[@]} trial(s) to complete..."
  TRIAL_DURATIONS=()
  for i in "${!PIDS[@]}"; do
    # Host-side watchdog in addition to timeout(1): poll every 5 seconds.
    ELAPSED=0
    while kill -0 "${PIDS[$i]}" 2>/dev/null; do
      if [ "$ELAPSED" -ge "$TEST_TIMEOUT" ]; then
        echo "[Host] ⚠️ Trial $((i + 1)) timeout after ${TEST_TIMEOUT}s, killing..."
        # NOTE(review): SIGKILL gives the claude process no chance to flush
        # output; consider sending TERM first if partial logs matter.
        kill -9 "${PIDS[$i]}" 2>/dev/null
        break
      fi
      sleep 5
      ELAPSED=$((ELAPSED + 5))
    done

    if wait "${PIDS[$i]}" 2>/dev/null; then
      echo "[Host] ✅ Trial $((i + 1)) finished"
    else
      echo "[Host] ⚠️ Trial $((i + 1)) exited with non-zero (may still be valid)"
    fi
    END_TIME=$(date +%s)
    DURATION=$(( END_TIME - TRIAL_START_TIMES[i] ))
    TRIAL_DURATIONS+=("$DURATION")
    echo "[Host] ⏱️ Trial $((i + 1)) took ${DURATION}s"
  done

  # --- Phase 2: Evaluate trials (delegated to test-check.sh) ---
  echo "[Host] Running evaluation..."

  CASE_PASS=true
  RESULT_FILE="$LOG_DIR/${TEST_NAME}.results"
  : > "$RESULT_FILE" # Create/clear result file

  for TRIAL_IDX in $(seq 0 $((N_TRIALS - 1))); do
    TRIAL_NUM=$((TRIAL_IDX + 1))
    WORK_DIR="${TRIAL_DIRS[$TRIAL_IDX]}"
    LOG_FILE="${TRIAL_LOG_FILES[$TRIAL_IDX]}"

    # Run checks using test-check.sh and capture output
    # Convert TEST_FILE to absolute path for test-check.sh (which runs from tools/ directory)
    TEST_FILE_ABSOLUTE=$(cd "$WORKSPACE_FOLDER" && realpath "$TEST_FILE")
    CHECK_SCRIPT="$(realpath "$(dirname "$0")/tools/test-check.sh")"
    CHECK_OUTPUT=$("$CHECK_SCRIPT" "$WORKSPACE_FOLDER" "$TEST_FILE_ABSOLUTE" "$LOG_FILE" "$WORK_DIR" "$TRIAL_NUM" 2>&1)
    CHECK_EXIT_CODE=$?

    # Display output
    echo "$CHECK_OUTPUT"

    # Extract token usage from the "📊 Tokens:" line.
    # FIX: the old `grep … | sed 's/…/\1/' || echo "0"` never fell back to 0:
    # sed exits 0 regardless, and without -n it echoes the whole line when the
    # pattern does not match — so a missing Tokens line produced an empty (or
    # garbage) value that blew up the arithmetic below. Use `sed -n …p` and
    # explicit ${var:-0} defaults instead.
    TOKENS_LINE=$(echo "$CHECK_OUTPUT" | grep -m1 "📊 Tokens:" || true)
    TRIAL_INPUT=$(echo "$TOKENS_LINE" | sed -nE 's/.*in=([0-9]+) .*/\1/p')
    TRIAL_INPUT=${TRIAL_INPUT:-0}
    TRIAL_CACHE_READ=$(echo "$TOKENS_LINE" | sed -nE 's/.*cache=([0-9]+).*/\1/p')
    TRIAL_CACHE_READ=${TRIAL_CACHE_READ:-0}
    TRIAL_TOTAL_INPUT=$((TRIAL_INPUT + TRIAL_CACHE_READ))
    TRIAL_OUTPUT=$(echo "$TOKENS_LINE" | sed -nE 's/.*out=([0-9]+)/\1/p')
    TRIAL_OUTPUT=${TRIAL_OUTPUT:-0}

    # Store trial result for summary (pipe-delimited, one line per trial)
    TRIAL_STATUS="true"
    if [ "$CHECK_EXIT_CODE" -ne 0 ]; then
      CASE_PASS=false
      TRIAL_STATUS="false"
    fi
    echo "${TRIAL_STATUS}|${TRIAL_DURATIONS[$TRIAL_IDX]}|${TRIAL_INPUT}|${TRIAL_CACHE_READ}|${TRIAL_TOTAL_INPUT}|${TRIAL_OUTPUT}" >> "$RESULT_FILE"

    # Display duration
    echo "[Host] ⏱️ Trial $TRIAL_NUM took ${TRIAL_DURATIONS[$TRIAL_IDX]}s"
  done

  # Display case result
  if [ "$CASE_PASS" = true ]; then
    echo "[Host] ✅ $TEST_CASE_NAME: PASS"
    TOTAL_PASS=$((TOTAL_PASS + 1))
  else
    echo "[Host] ❌ $TEST_CASE_NAME: FAIL"
    TOTAL_FAIL=$((TOTAL_FAIL + 1))
  fi
done

# --- Generate and display summary (delegated to test-summary.sh) ---
"$(dirname "$0")/tools/test-summary.sh" "$REPORT_DIR" "$TOTAL_CASES" "$TOTAL_PASS" "$TOTAL_FAIL" "$N_TRIALS"

# Exit with the failure count so callers can see it, clamped to 255:
# FIX: shell exit codes are 8-bit, so e.g. 256 failures would previously wrap
# around to exit status 0 and look like success.
if [ "$TOTAL_FAIL" -gt 255 ]; then
  exit 255
fi
exit "$TOTAL_FAIL"
68 changes: 68 additions & 0 deletions agents/test-runner/tools/check-mcp-success.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/bin/bash
# Check if MCP tool calls succeeded in a log file
# Usage: check-mcp-success.sh <log_file> <mcp_tool_name> [--optional]
#   --optional: If no MCP calls are made, return success (default: fail)
# Returns: 0 if all MCP calls succeeded (or none made with --optional),
#          1 if any failed, 2 on usage error

LOG_FILE="$1"
MCP_TOOL_NAME="$2"
OPTIONAL_FLAG="${3:-}"

if [[ -z "$LOG_FILE" ]] || [[ -z "$MCP_TOOL_NAME" ]]; then
  echo "Usage: $0 <log_file> <mcp_tool_name> [--optional]" >&2
  exit 2
fi

if [[ ! -f "$LOG_FILE" ]]; then
  echo "Log file not found: $LOG_FILE" >&2
  exit 2
fi

# Extract tool_use IDs for the specified MCP tool from assistant messages.
# The log is JSONL — one JSON object per line (runner.sh pipes stream-json
# through `jq -c`), with stderr merged into the same file, so pre-filter to
# lines that look like JSON objects.
# FIX 1: the previous filter started with `.[]`, which iterates the *values*
# of each object — wrong for JSONL, where jq already iterates line by line.
# FIX 2: `select(A and B and (.name? // "") | test(p))` parsed as
# `select((A and B and …) | test(p))` (jq's `|` binds looser than `and`),
# feeding a boolean into test() and producing no IDs at all. Parenthesize the
# test() term and pass the tool name via --arg instead of string splicing.
TOOL_USE_IDS=$(grep '^{' -- "$LOG_FILE" | jq -r --arg name "$MCP_TOOL_NAME" '
  select(.type? == "assistant")
  | (.message.content? // [])
  | select(type == "array")
  | .[]
  | select(type == "object" and .type? == "tool_use" and ((.name? // "") | test($name)))
  | .id
' 2>/dev/null)

# If no MCP calls were made.
# FIX 3: the old count `grep -c "^\w*$"` also matched the single empty line
# that echo produces for an empty variable, so ID_COUNT was never 0 and this
# branch could never fire — the check silently passed everything.
if [[ -z "$TOOL_USE_IDS" ]]; then
  if [[ "$OPTIONAL_FLAG" == "--optional" ]]; then
    # Optional check: return success if no calls were made
    exit 0
  else
    # Required check: return failure if no calls were made
    echo "No $MCP_TOOL_NAME tool calls found in log" >&2
    exit 1
  fi
fi

# Check if any of the corresponding tool_results have is_error: true.
# NOTE(review): this reads `is_error` (snake_case, as in API tool_result
# blocks); an earlier commit mentions `isError` — verify against actual logs.
while IFS= read -r tool_id; do
  [[ -z "$tool_id" ]] && continue
  ERROR_CHECK=$(grep '^{' -- "$LOG_FILE" | jq -r --arg id "$tool_id" '
    select(.type? == "user")
    | (.message.content? // [])
    | select(type == "array")
    | .[]
    | select(type == "object" and .type? == "tool_result" and .tool_use_id? == $id)
    | .is_error // false
  ' 2>/dev/null)

  if [[ "$ERROR_CHECK" == *true* ]]; then
    echo "MCP tool $MCP_TOOL_NAME (tool_use_id: $tool_id) returned an error" >&2
    exit 1
  fi
done <<< "$TOOL_USE_IDS"

# All MCP calls succeeded
exit 0
18 changes: 18 additions & 0 deletions agents/test-runner/tools/check-skill-invoked.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
# check-skill-invoked.sh - Check if a specific skill was invoked
# Usage: check-skill-invoked.sh <skill_name> <log_file> [<mcp_tool>] [<optional_flag>]
# Note: Called from test-check.sh as: $CHECK_CMD "$LOG_FILE" "$MCP_TOOL" "$OPTIONAL_FLAG"
#       where $CHECK_CMD = "check-skill-invoked.sh constitution-reminding"
#       So actual arguments are: $1=skill_name, $2=log_file, $3=mcp_tool, $4=optional_flag
# Exit status: 0 if the skill appears to have been invoked, 1 otherwise.

SKILL_NAME="${1:-}"
LOG_FILE="${2:-}"

if [ -z "$LOG_FILE" ] || [ -z "$SKILL_NAME" ]; then
  echo "[Error] Usage: $0 <skill_name> <log_file>" >&2
  exit 1
fi

# The log is JSONL containing "name":"Skill" tool uses whose input carries
# "skill":"patent-kit:<skill-name>". Two deliberately loose greps:
# a Skill tool use must exist AND the skill name must appear in a "skill"
# field somewhere in the log.
if ! grep -q '"Skill"' "$LOG_FILE"; then
  exit 1
fi
grep -q '"skill":".*'"$SKILL_NAME" "$LOG_FILE"
Loading