2 changes: 1 addition & 1 deletion data/tasks/code-discrepancy-tutorials-grasp/test.py
@@ -223,7 +223,7 @@
4. Note that `LeidenCluster` would follow sklearn patterns but graspologic uses functional APIs for partitioning"""


STEPS_CHECK_DISCREPANCIES = """The purpose of this task is to verify whether the agent could correctly verify a set of 5 discrepancies in the documentation that were injected versus the actual implementation/behavior of the code.
STEPS_CHECK_DISCREPANCIES = """The purpose of this task is to verify whether the agent could correctly verify a set of 4 discrepancies in the documentation that were injected versus the actual implementation/behavior of the code.
This is the report of discrepancies that were injected:

# Injected Discrepancies
6 changes: 2 additions & 4 deletions data/tasks/pdf-hr-q4/test.py
@@ -30,17 +30,15 @@
write_test_result,
)

STEPS = """\
1. Find and read the file `data_analysis_answer.txt` in the project directory.
STEPS = """1. Find and read the file `data_analysis_answer.txt` in the project directory.
2. Compare the answer against the expected median salaries for each performance rating:
- Rating 1: $60,000 (acceptable range: $59,400 - $60,600)
- Rating 2: $63,550 (acceptable range: $62,915 - $64,186)
- Rating 3: $80,700 (acceptable range: $79,893 - $81,507)
- Rating 4: $85,600 (acceptable range: $84,744 - $86,456)
- Rating 5: $89,250 (acceptable range: $88,358 - $90,143)
3. For each rating, check if the value in the answer is within the acceptable range.
4. Award 20 points for each correct rating.
"""
4. Award 20 points for each correct rating."""

RUBRIC = {
"rating_1_correct": "str - (20 points) Is Rating 1's median salary within tolerance?",
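The pdf-hr-q4 change above is whitespace-only in effect: the old `STEPS` used a trailing backslash to suppress the leading newline and left a trailing newline before the closing quotes, while the new form folds the first step onto the opening quotes and the closing quotes onto the last step. A minimal sketch of the difference (illustrative strings, not the task text):

```python
# Old layout: the backslash suppresses the leading newline; closing quotes on
# their own line leave a trailing newline.
OLD = """\
1. Do the thing.
2. Check the result.
"""

# New layout: no backslash needed, and no trailing newline.
NEW = """1. Do the thing.
2. Check the result."""

assert OLD.rstrip("\n") == NEW  # the only difference is the trailing newline
```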
4 changes: 2 additions & 2 deletions eval_recipes/benchmarking/harness.py
@@ -11,7 +11,7 @@
from eval_recipes.benchmarking.jobs.base import Job, JobContext, JobResult, JobStatus
from eval_recipes.benchmarking.jobs.runner import JobRunner
from eval_recipes.benchmarking.reporting import generate_agent_consolidated_report, generate_trial_report
from eval_recipes.benchmarking.run_trial import TrialConfig, run_trial
from eval_recipes.benchmarking.run_trial import DEFAULT_EVAL_RECIPES_VERSION, TrialConfig, run_trial
from eval_recipes.benchmarking.schemas import AgentConfig, TaskConfig, TaskInfo


@@ -217,7 +217,7 @@ def __init__(
num_trials: int = 1,
continuation_provider: Literal["openai", "azure_openai", "none"] = "none",
continuation_model: Literal["gpt-5", "gpt-5.1"] = "gpt-5",
eval_recipes_version: str = "0.0.25",
eval_recipes_version: str = DEFAULT_EVAL_RECIPES_VERSION,
report_score_threshold: float = 85.0,
) -> None:
repo_root = Path(__file__).parents[2]
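The harness.py hunks swap the hard-coded "0.0.25" default for a constant defined in run_trial.py, so a release bump touches one line. A minimal standalone sketch of the pattern (the `make_harness` helper is illustrative, not the repo's API):

```python
from dataclasses import dataclass

# One module owns the default; everything else imports it.
DEFAULT_EVAL_RECIPES_VERSION = "0.0.26"


@dataclass
class TrialConfig:
    eval_recipes_version: str = DEFAULT_EVAL_RECIPES_VERSION


def make_harness(eval_recipes_version: str = DEFAULT_EVAL_RECIPES_VERSION) -> TrialConfig:
    # Callers can still override the version explicitly when needed.
    return TrialConfig(eval_recipes_version=eval_recipes_version)
```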
78 changes: 45 additions & 33 deletions eval_recipes/benchmarking/reporting.py
@@ -126,7 +126,7 @@ async def generate_trial_report(
"agent_output.log": trial_dir / "agent_output.log",
"test_output.log": trial_dir / "test_output.log",
"trial_result.json": trial_dir / "trial_result.json",
"test.py": trial_dir / "test.py",
"test.py": task_directory / "test.py",
"instructions.txt": task_directory / "instructions.txt",
}

@@ -165,24 +165,30 @@ async def generate_trial_report(
env={"ANTHROPIC_API_KEY": os.environ.get("ANTHROPIC_API_KEY", "")},
)
messages = [] # store messages for debugging purposes only
async with ClaudeSDKClient(options=options) as client:
await client.query(task_report_prompt)
async for _message in client.receive_response():
messages.append(_message)

# Check if the report was created at the expected path
report_path = temp_dir / "FAILURE_REPORT.md"
if not report_path.exists():
logger.warning(
f"Report not found at expected path: {report_path}. Asking agent to verify the location..."
)
await client.query(
f"The report was not found at {report_path}. "
"Can you please double check if it is in the correct location and has the correct file name? "
"The file should be named 'FAILURE_REPORT.md' and placed in the current working directory."
)
log_file_path = trial_dir / "trial_report_agent.log"
with log_file_path.open("w", encoding="utf-8") as log_file:
async with ClaudeSDKClient(options=options) as client:
await client.query(task_report_prompt)
async for _message in client.receive_response():
messages.append(_message)
log_file.write(str(_message) + "\n")
log_file.flush()

# Check if the report was created at the expected path
report_path = temp_dir / "FAILURE_REPORT.md"
if not report_path.exists():
logger.warning(
f"Report not found at expected path: {report_path}. Asking agent to verify the location..."
)
await client.query(
f"The report was not found at {report_path}. "
"Can you please double check if it is in the correct location and has the correct file name? "
"The file should be named 'FAILURE_REPORT.md' and placed in the current working directory."
)
async for _message in client.receive_response():
messages.append(_message)
log_file.write(str(_message) + "\n")
log_file.flush()

# Get the report from the temp dir and move it to the trial directory
report_path = temp_dir / "FAILURE_REPORT.md"
@@ -263,24 +269,30 @@ async def generate_agent_consolidated_report(
)

messages = []
async with ClaudeSDKClient(options=options) as client:
await client.query(consolidated_user_prompt)
async for msg in client.receive_response():
messages.append(msg)

# Check if the report was created at the expected path
report_path = temp_dir / "CONSOLIDATED_REPORT.md"
if not report_path.exists():
logger.warning(
f"Report not found at expected path: {report_path}. Asking agent to verify the location..."
)
await client.query(
f"The report was not found at {report_path}. "
"Can you please double check if it is in the correct location and has the correct file name? "
"The file should be named 'CONSOLIDATED_REPORT.md' and placed in the current working directory."
)
log_file_path = runs_dir / f"consolidated_report_agent_{agent_name}.log"
with log_file_path.open("w", encoding="utf-8") as log_file:
async with ClaudeSDKClient(options=options) as client:
await client.query(consolidated_user_prompt)
async for msg in client.receive_response():
messages.append(msg)
log_file.write(str(msg) + "\n")
log_file.flush()

# Check if the report was created at the expected path
report_path = temp_dir / "CONSOLIDATED_REPORT.md"
if not report_path.exists():
logger.warning(
f"Report not found at expected path: {report_path}. Asking agent to verify the location..."
)
await client.query(
f"The report was not found at {report_path}. "
"Can you please double check if it is in the correct location and has the correct file name? "
"The file should be named 'CONSOLIDATED_REPORT.md' and placed in the current working directory."
)
async for msg in client.receive_response():
messages.append(msg)
log_file.write(str(msg) + "\n")
log_file.flush()

# Get the report from the temp dir and move it to the runs dir
report_path = temp_dir / "CONSOLIDATED_REPORT.md"
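The reporting.py hunks wrap both report-generating sessions in a log file so every streamed message is mirrored to disk as it arrives, not just kept in memory. A minimal sketch of that pattern, with a generic async iterator standing in for `client.receive_response()` (file name and messages are illustrative):

```python
import asyncio
from pathlib import Path
from typing import AsyncIterator


async def fake_messages() -> AsyncIterator[str]:
    # Stand-in for the SDK's streamed responses.
    for text in ("planning...", "writing FAILURE_REPORT.md", "done"):
        yield text


async def main() -> None:
    messages: list[str] = []
    log_path = Path("trial_report_agent.log")
    with log_path.open("w", encoding="utf-8") as log_file:
        async for message in fake_messages():
            messages.append(message)             # kept in memory for debugging
            log_file.write(str(message) + "\n")  # mirrored to disk as it streams
            log_file.flush()                     # a crash still leaves a partial log


asyncio.run(main())
```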
34 changes: 13 additions & 21 deletions eval_recipes/benchmarking/run_trial.py
@@ -16,13 +16,15 @@
from eval_recipes.benchmarking.docker_manager import DockerManager
from eval_recipes.benchmarking.schemas import AgentConfig, TaskConfig, TrialResult

DEFAULT_EVAL_RECIPES_VERSION = "0.0.26"


@dataclass
class TrialConfig:
environment: dict[str, str] = field(default_factory=dict)
continuation_provider: Literal["openai", "azure_openai", "none"] = "none"
continuation_model: Literal["gpt-5", "gpt-5.1"] = "gpt-5"
eval_recipes_version: str = "0.0.25"
eval_recipes_version: str = DEFAULT_EVAL_RECIPES_VERSION


async def run_trial(
@@ -123,8 +125,10 @@ async def run_trial(

if continuation_response:
continuation_metadata["continuation_occurred"] = True
continuation_metadata["continuation_prompt"] = continuation_response

escaped_response = _escape_bash_string(continuation_response)
# Strip leading dashes to prevent CLI option interpretation
escaped_response = _escape_bash_string(continuation_response.lstrip("- "))
continuation_template = Template(agent.command_template_continue)
continuation_command = continuation_template.render(task_instructions=escaped_response)

@@ -143,6 +147,8 @@
agent_log_path = trial_dir / "agent_output.log"
with agent_log_path.open("a", encoding="utf-8") as f:
f.write("\n\n--- CONTINUATION ---\n\n")
f.write(f"Continuation prompt:\n{continuation_response}\n\n")
f.write("--- CONTINUATION OUTPUT ---\n\n")
f.write(continuation_logs)

agent_duration += continuation_duration
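A small illustration of why the continuation prompt has its leading dashes stripped before being rendered into the agent command (the command name and the use of `shlex.quote` here are assumptions, not the repo's actual `_escape_bash_string` or template):

```python
import shlex


def render_continuation(prompt: str) -> str:
    # Even when the prompt is shell-quoted, an argument that starts with "-"
    # can be parsed by the receiving CLI as an option, so strip leading "- ".
    return f"example-agent --prompt {shlex.quote(prompt.lstrip('- '))}"


print(render_continuation("- Finish the remaining steps in the task"))
# example-agent --prompt 'Finish the remaining steps in the task'
```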
@@ -217,22 +223,6 @@ def _run_tests(
test_id = str(uuid.uuid4())
logger.info(f"Running tests (trial {trial_number}) with ID: {test_id}")

logger.info("Initializing /project as a uv project")
_exec_result, _init_output = docker_manager.exec_command(
container=container,
command=["uv", "init", "--no-readme", "--no-pin-python", "--name", "test_project"],
log_filename="uv_init_output.log",
workdir="/project",
)

git_url = f"git+https://github.com/microsoft/eval-recipes@v{eval_recipes_version}"
docker_manager.exec_command(
container=container,
command=["uv", "add", git_url],
log_filename="uv_add_eval_recipes_output.log",
workdir="/project",
)

if task.test_time_data_dir and task.test_time_data_dir.exists():
logger.info(f"Copying test-time data directory from {task.test_time_data_dir} to container")
test_time_data_files = _collect_directory_files(task.test_time_data_dir)
@@ -249,9 +239,11 @@
}
docker_manager.copy_files_to_container(container=container, files=files, dest_path="/project")

# Parse test_command string into list for execution
test_command_parts = task.test_command.split()
logger.info(f"Running test: {task.test_command}")
# Build test command with eval-recipes as a dependency via --with
# This installs eval-recipes into uv's isolated cache, not into /project
git_url = f"git+https://github.com/microsoft/eval-recipes@v{eval_recipes_version}"
test_command_parts = ["uv", "run", "--with", git_url, "--no-project", "/project/test.py"]
logger.info(f"Running test: {' '.join(test_command_parts)}")
test_start_time = time.perf_counter()
_exec_result, full_output = docker_manager.exec_command(
container=container,
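The run_trial.py hunks drop the per-trial `uv init` / `uv add` setup and instead pass eval-recipes to `uv run` via `--with`, with `--no-project` keeping uv from touching `/project` itself. A hedged sketch of exercising that invocation outside the Docker harness (the subprocess call is illustrative; the URL and version come from the diff):

```python
import subprocess

EVAL_RECIPES_VERSION = "0.0.26"
git_url = f"git+https://github.com/microsoft/eval-recipes@v{EVAL_RECIPES_VERSION}"

# --with pulls eval-recipes into uv's isolated, cached environment for this run
# only; --no-project stops uv from resolving /project as a uv project, so no
# pyproject.toml or lockfile is created in the task directory.
command = ["uv", "run", "--with", git_url, "--no-project", "/project/test.py"]
result = subprocess.run(command, capture_output=True, text=True, check=False)
print(result.stdout)
```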
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "eval_recipes"
version = "0.0.25"
version = "0.0.26"
description = "Eval Recipes"
authors = [{ name = "Semantic Workbench Team" }]
readme = "README.md"
2 changes: 1 addition & 1 deletion uv.lock

(Generated lockfile; diff not rendered.)