From 7fd717fc4808a222d7b63d8a7cc0cc15051dd468 Mon Sep 17 00:00:00 2001
From: DavidKoleczek <45405824+DavidKoleczek@users.noreply.github.com>
Date: Sun, 4 Jan 2026 11:32:23 -0500
Subject: [PATCH] various bug fixes

---
 .../code-discrepancy-tutorials-grasp/test.py |  2 +-
 data/tasks/pdf-hr-q4/test.py                 |  6 +-
 eval_recipes/benchmarking/harness.py         |  4 +-
 eval_recipes/benchmarking/reporting.py       | 78 +++++++++++--------
 eval_recipes/benchmarking/run_trial.py       | 34 ++++----
 pyproject.toml                               |  2 +-
 uv.lock                                      |  2 +-
 7 files changed, 65 insertions(+), 63 deletions(-)

diff --git a/data/tasks/code-discrepancy-tutorials-grasp/test.py b/data/tasks/code-discrepancy-tutorials-grasp/test.py
index 62ca303..7562a85 100644
--- a/data/tasks/code-discrepancy-tutorials-grasp/test.py
+++ b/data/tasks/code-discrepancy-tutorials-grasp/test.py
@@ -223,7 +223,7 @@
 4. Note that `LeidenCluster` would follow sklearn patterns but graspologic uses functional APIs for partitioning"""

-STEPS_CHECK_DISCREPANCIES = """The purpose of this task is to verify whether the agent could correctly verify a set of 5 discrepancies in the documentation that were injected versus the actual implementation/behavior of the code.
+STEPS_CHECK_DISCREPANCIES = """The purpose of this task is to verify whether the agent could correctly verify a set of 4 discrepancies in the documentation that were injected versus the actual implementation/behavior of the code.

 This the report of discrepancies that were injected:

 # Injected Discrepancies
diff --git a/data/tasks/pdf-hr-q4/test.py b/data/tasks/pdf-hr-q4/test.py
index a4ad941..d03fea2 100644
--- a/data/tasks/pdf-hr-q4/test.py
+++ b/data/tasks/pdf-hr-q4/test.py
@@ -30,8 +30,7 @@
     write_test_result,
 )

-STEPS = """\
-1. Find and read the file `data_analysis_answer.txt` in the project directory.
+STEPS = """1. Find and read the file `data_analysis_answer.txt` in the project directory.
 2. Compare the answer against the expected median salaries for each performance rating:
 - Rating 1: $60,000 (acceptable range: $59,400 - $60,600)
 - Rating 2: $63,550 (acceptable range: $62,915 - $64,186)
@@ -39,8 +38,7 @@
 - Rating 4: $85,600 (acceptable range: $84,744 - $86,456)
 - Rating 5: $89,250 (acceptable range: $88,358 - $90,143)
 3. For each rating, check if the value in the answer is within the acceptable range.
-4. Award 20 points for each correct rating.
-"""
+4. Award 20 points for each correct rating."""

 RUBRIC = {
     "rating_1_correct": "str - (20 points) Is Rating 1's median salary within tolerance?",
diff --git a/eval_recipes/benchmarking/harness.py b/eval_recipes/benchmarking/harness.py
index 1aa2e9b..9459551 100644
--- a/eval_recipes/benchmarking/harness.py
+++ b/eval_recipes/benchmarking/harness.py
@@ -11,7 +11,7 @@
 from eval_recipes.benchmarking.jobs.base import Job, JobContext, JobResult, JobStatus
 from eval_recipes.benchmarking.jobs.runner import JobRunner
 from eval_recipes.benchmarking.reporting import generate_agent_consolidated_report, generate_trial_report
-from eval_recipes.benchmarking.run_trial import TrialConfig, run_trial
+from eval_recipes.benchmarking.run_trial import DEFAULT_EVAL_RECIPES_VERSION, TrialConfig, run_trial
 from eval_recipes.benchmarking.schemas import AgentConfig, TaskConfig, TaskInfo


@@ -217,7 +217,7 @@ def __init__(
         num_trials: int = 1,
         continuation_provider: Literal["openai", "azure_openai", "none"] = "none",
         continuation_model: Literal["gpt-5", "gpt-5.1"] = "gpt-5",
-        eval_recipes_version: str = "0.0.25",
+        eval_recipes_version: str = DEFAULT_EVAL_RECIPES_VERSION,
         report_score_threshold: float = 85.0,
     ) -> None:
         repo_root = Path(__file__).parents[2]
diff --git a/eval_recipes/benchmarking/reporting.py b/eval_recipes/benchmarking/reporting.py
index 9cbd67c..21ff4ac 100644
--- a/eval_recipes/benchmarking/reporting.py
+++ b/eval_recipes/benchmarking/reporting.py
@@ -126,7 +126,7 @@ async def generate_trial_report(
         "agent_output.log": trial_dir / "agent_output.log",
         "test_output.log": trial_dir / "test_output.log",
         "trial_result.json": trial_dir / "trial_result.json",
-        "test.py": trial_dir / "test.py",
+        "test.py": task_directory / "test.py",
         "instructions.txt": task_directory / "instructions.txt",
     }

@@ -165,24 +165,30 @@ async def generate_trial_report(
         env={"ANTHROPIC_API_KEY": os.environ.get("ANTHROPIC_API_KEY", "")},
     )
     messages = []  # store messages for debugging purposes only
-    async with ClaudeSDKClient(options=options) as client:
-        await client.query(task_report_prompt)
-        async for _message in client.receive_response():
-            messages.append(_message)
-
-        # Check if the report was created at the expected path
-        report_path = temp_dir / "FAILURE_REPORT.md"
-        if not report_path.exists():
-            logger.warning(
-                f"Report not found at expected path: {report_path}. Asking agent to verify the location..."
-            )
-            await client.query(
-                f"The report was not found at {report_path}. "
-                "Can you please double check if it is in the correct location and has the correct file name? "
-                "The file should be named 'FAILURE_REPORT.md' and placed in the current working directory."
-            )
+    log_file_path = trial_dir / "trial_report_agent.log"
+    with log_file_path.open("w", encoding="utf-8") as log_file:
+        async with ClaudeSDKClient(options=options) as client:
+            await client.query(task_report_prompt)
             async for _message in client.receive_response():
                 messages.append(_message)
+                log_file.write(str(_message) + "\n")
+                log_file.flush()
+
+            # Check if the report was created at the expected path
+            report_path = temp_dir / "FAILURE_REPORT.md"
+            if not report_path.exists():
+                logger.warning(
+                    f"Report not found at expected path: {report_path}. Asking agent to verify the location..."
+                )
+                await client.query(
+                    f"The report was not found at {report_path}. "
+                    "Can you please double check if it is in the correct location and has the correct file name? "
+                    "The file should be named 'FAILURE_REPORT.md' and placed in the current working directory."
+                )
+                async for _message in client.receive_response():
+                    messages.append(_message)
+                    log_file.write(str(_message) + "\n")
+                    log_file.flush()

     # Get the report from the temp dir and move it to the trial directory
     report_path = temp_dir / "FAILURE_REPORT.md"
@@ -263,24 +269,30 @@ async def generate_agent_consolidated_report(
     )

     messages = []
-    async with ClaudeSDKClient(options=options) as client:
-        await client.query(consolidated_user_prompt)
-        async for msg in client.receive_response():
-            messages.append(msg)
-
-        # Check if the report was created at the expected path
-        report_path = temp_dir / "CONSOLIDATED_REPORT.md"
-        if not report_path.exists():
-            logger.warning(
-                f"Report not found at expected path: {report_path}. Asking agent to verify the location..."
-            )
-            await client.query(
-                f"The report was not found at {report_path}. "
-                "Can you please double check if it is in the correct location and has the correct file name? "
-                "The file should be named 'CONSOLIDATED_REPORT.md' and placed in the current working directory."
-            )
+    log_file_path = runs_dir / f"consolidated_report_agent_{agent_name}.log"
+    with log_file_path.open("w", encoding="utf-8") as log_file:
+        async with ClaudeSDKClient(options=options) as client:
+            await client.query(consolidated_user_prompt)
             async for msg in client.receive_response():
                 messages.append(msg)
+                log_file.write(str(msg) + "\n")
+                log_file.flush()
+
+            # Check if the report was created at the expected path
+            report_path = temp_dir / "CONSOLIDATED_REPORT.md"
+            if not report_path.exists():
+                logger.warning(
+                    f"Report not found at expected path: {report_path}. Asking agent to verify the location..."
+                )
+                await client.query(
+                    f"The report was not found at {report_path}. "
+                    "Can you please double check if it is in the correct location and has the correct file name? "
+                    "The file should be named 'CONSOLIDATED_REPORT.md' and placed in the current working directory."
+                )
+                async for msg in client.receive_response():
+                    messages.append(msg)
+                    log_file.write(str(msg) + "\n")
+                    log_file.flush()

     # Get the report from the temp dir and move it to the runs dir
     report_path = temp_dir / "CONSOLIDATED_REPORT.md"
diff --git a/eval_recipes/benchmarking/run_trial.py b/eval_recipes/benchmarking/run_trial.py
index 16b3f21..7d8cb54 100644
--- a/eval_recipes/benchmarking/run_trial.py
+++ b/eval_recipes/benchmarking/run_trial.py
@@ -16,13 +16,15 @@
 from eval_recipes.benchmarking.docker_manager import DockerManager
 from eval_recipes.benchmarking.schemas import AgentConfig, TaskConfig, TrialResult

+DEFAULT_EVAL_RECIPES_VERSION = "0.0.26"
+

 @dataclass
 class TrialConfig:
     environment: dict[str, str] = field(default_factory=dict)
     continuation_provider: Literal["openai", "azure_openai", "none"] = "none"
     continuation_model: Literal["gpt-5", "gpt-5.1"] = "gpt-5"
-    eval_recipes_version: str = "0.0.25"
+    eval_recipes_version: str = DEFAULT_EVAL_RECIPES_VERSION


 async def run_trial(
@@ -123,8 +125,10 @@ async def run_trial(

         if continuation_response:
             continuation_metadata["continuation_occurred"] = True
+            continuation_metadata["continuation_prompt"] = continuation_response

-            escaped_response = _escape_bash_string(continuation_response)
+            # Strip leading dashes to prevent CLI option interpretation
+            escaped_response = _escape_bash_string(continuation_response.lstrip("- "))
             continuation_template = Template(agent.command_template_continue)
             continuation_command = continuation_template.render(task_instructions=escaped_response)

@@ -143,6 +147,8 @@ async def run_trial(
             agent_log_path = trial_dir / "agent_output.log"
             with agent_log_path.open("a", encoding="utf-8") as f:
                 f.write("\n\n--- CONTINUATION ---\n\n")
+                f.write(f"Continuation prompt:\n{continuation_response}\n\n")
+                f.write("--- CONTINUATION OUTPUT ---\n\n")
                 f.write(continuation_logs)

             agent_duration += continuation_duration
@@ -217,22 +223,6 @@ def _run_tests(
     test_id = str(uuid.uuid4())
     logger.info(f"Running tests (trial {trial_number}) with ID: {test_id}")

-    logger.info("Initializing /project as a uv project")
-    _exec_result, _init_output = docker_manager.exec_command(
-        container=container,
-        command=["uv", "init", "--no-readme", "--no-pin-python", "--name", "test_project"],
-        log_filename="uv_init_output.log",
-        workdir="/project",
-    )
-
-    git_url = f"git+https://github.com/microsoft/eval-recipes@v{eval_recipes_version}"
-    docker_manager.exec_command(
-        container=container,
-        command=["uv", "add", git_url],
-        log_filename="uv_add_eval_recipes_output.log",
-        workdir="/project",
-    )
-
     if task.test_time_data_dir and task.test_time_data_dir.exists():
         logger.info(f"Copying test-time data directory from {task.test_time_data_dir} to container")
         test_time_data_files = _collect_directory_files(task.test_time_data_dir)
@@ -249,9 +239,11 @@
     }
     docker_manager.copy_files_to_container(container=container, files=files, dest_path="/project")

-    # Parse test_command string into list for execution
-    test_command_parts = task.test_command.split()
-    logger.info(f"Running test: {task.test_command}")
+    # Build test command with eval-recipes as a dependency via --with
+    # This installs eval-recipes into uv's isolated cache, not into /project
+    git_url = f"git+https://github.com/microsoft/eval-recipes@v{eval_recipes_version}"
+    test_command_parts = ["uv", "run", "--with", git_url, "--no-project", "/project/test.py"]
+    logger.info(f"Running test: {' '.join(test_command_parts)}")
     test_start_time = time.perf_counter()
     _exec_result, full_output = docker_manager.exec_command(
         container=container,
diff --git a/pyproject.toml b/pyproject.toml
index e97e53e..eabfa3f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "eval_recipes"
-version = "0.0.25"
+version = "0.0.26"
 description = "Eval Recipes"
 authors = [{ name = "Semantic Workbench Team" }]
 readme = "README.md"
diff --git a/uv.lock b/uv.lock
index 7b909c5..ae05730 100644
--- a/uv.lock
+++ b/uv.lock
@@ -729,7 +729,7 @@ wheels = [

 [[package]]
 name = "eval-recipes"
-version = "0.0.25"
+version = "0.0.26"
 source = { editable = "." }
 dependencies = [
     { name = "azure-core", extra = ["aio"] },
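
Note on the run_trial.py change: the patch drops the in-container `uv init` / `uv add` setup and instead launches the test script with `uv run --with <eval-recipes git URL> --no-project`, so eval-recipes is resolved into uv's isolated environment rather than installed into /project. The snippet below is only a rough local approximation of that invocation, assuming uv is on PATH and a test.py exists at /project/test.py; the subprocess wrapper is an illustration for this note, while the harness itself runs the same argv through docker_manager.exec_command inside the container.

    # Sketch only: mirrors the argv that _run_tests now builds; running it
    # directly with subprocess (instead of inside Docker) is an assumption.
    import subprocess

    eval_recipes_version = "0.0.26"
    git_url = f"git+https://github.com/microsoft/eval-recipes@v{eval_recipes_version}"

    # --with adds eval-recipes to an ephemeral environment for this run only,
    # and --no-project keeps uv from resolving any pyproject.toml under /project.
    test_command_parts = ["uv", "run", "--with", git_url, "--no-project", "/project/test.py"]
    result = subprocess.run(test_command_parts, capture_output=True, text=True, check=False)
    print(result.stdout)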