From 62f6645a144899aeafbe0266d279c8d5bf7c3c91 Mon Sep 17 00:00:00 2001 From: Nithin Date: Sat, 31 Jan 2026 01:18:07 -0500 Subject: [PATCH 1/4] Refactor LLMRequest conversions into Protocol-based converters - Add RequestConverter[RequestType] Protocol for bidirectional conversion - Implement ChatCompletionConverter, ResponsesConverter, and MessagesConverter - Update LLMRequest methods to delegate to converters for better modularity - All existing tests pass (55/55), fully backward compatible - No changes required to calling code (chat_completions_compatible.py, etc.) Addresses issue #65: Improves abstraction and extensibility of LLMRequest conversion logic while maintaining 100% backward compatibility. --- src/ares/llms/request.py | 638 ++++++++++++++++++++++++++------------- 1 file changed, 432 insertions(+), 206 deletions(-) diff --git a/src/ares/llms/request.py b/src/ares/llms/request.py index 0da06d7..6e881be 100644 --- a/src/ares/llms/request.py +++ b/src/ares/llms/request.py @@ -2,7 +2,7 @@ import dataclasses import logging -from typing import Any, Literal, NotRequired, Required, TypedDict, cast +from typing import Any, Literal, NotRequired, Protocol, Required, TypedDict, cast import anthropic.types import openai.types.chat @@ -450,6 +450,52 @@ def _tool_choice_from_anthropic( return None +class RequestConverter[RequestType](Protocol): + """Converts between ARES LLMRequest and external API formats. + + This protocol defines the interface for bidirectional conversion between ARES's internal + LLMRequest format and external API request formats (OpenAI Chat Completions, OpenAI Responses, + Anthropic Messages, etc.). + + Type Parameters: + RequestType: The external API's request parameters type (e.g., dict[str, Any] for kwargs) + + Note: + Implementations should be frozen dataclasses for thread-safety in async contexts. + The model parameter is NOT included in conversions - it should be managed by the LLMClient. + """ + + def to_external(self, request: "LLMRequest", *, strict: bool = True) -> RequestType: + """Convert ARES LLMRequest to external API format. + + Args: + request: ARES internal request format + strict: If True, raise ValueError on information loss. If False, log warnings. + + Returns: + Request parameters in external API format (without model parameter) + + Raises: + ValueError: If strict=True and information would be lost in conversion + """ + ... + + def from_external(self, kwargs: RequestType, *, strict: bool = True) -> "LLMRequest": + """Convert external API format to ARES LLMRequest. + + Args: + kwargs: External API request parameters + strict: If True, raise ValueError for unhandled parameters. If False, log warnings. + + Returns: + LLMRequest instance + + Raises: + ValueError: If strict=True and there are unhandled parameters + """ + ... + + @dataclasses.dataclass(frozen=True, kw_only=True) class LLMRequest: """Unified request format for OpenAI Chat Completions, OpenAI Responses, and Claude Messages APIs. @@ -497,6 +543,8 @@ class LLMRequest: def to_chat_completion_kwargs(self, *, strict: bool = True) -> dict[str, Any]: """Convert to OpenAI Chat Completions API format. + This is a convenience wrapper around ChatCompletionConverter. + Args: strict: If True, raise ValueError on information loss. If False, log warnings. 
@@ -512,110 +560,13 @@ def to_chat_completion_kwargs(self, *, strict: bool = True) -> dict[str, Any]: - service_tier="standard_only" is not supported - stop_sequences truncated to 4 if more provided """ - # Check for information loss - lost_info = [] - if self.top_k is not None: - lost_info.append(f"top_k={self.top_k} (Claude-specific, not supported)") - if self.service_tier == "standard_only": - lost_info.append("service_tier='standard_only' (not supported by Chat API)") - if self.stop_sequences and len(self.stop_sequences) > 4: - lost_info.append( - f"stop_sequences truncated from {len(self.stop_sequences)} to 4 " - f"(Chat API limit: {self.stop_sequences[4:]} will be dropped)" - ) - - if lost_info: - msg = f"Converting to Chat Completions will lose information: {'; '.join(lost_info)}" - if strict: - raise ValueError(msg) - _LOGGER.warning(msg) - - # Convert messages, flattening ToolCallMessage into AssistantMessage.tool_calls - chat_messages: list[dict[str, Any]] = [] - pending_tool_calls: list[dict[str, Any]] = [] - - for msg in self.messages: - msg_dict = dict(msg) - - # ToolCallMessage → collect for previous assistant message - if "call_id" in msg_dict and "name" in msg_dict and "arguments" in msg_dict: - # This is a ToolCallMessage - pending_tool_calls.append( - { - "id": msg_dict["call_id"], - "type": "function", - "function": { - "name": msg_dict["name"], - "arguments": msg_dict["arguments"], - }, - } - ) - else: - # Flush any pending tool calls to the last assistant message - if pending_tool_calls and chat_messages: - last_msg = chat_messages[-1] - if last_msg.get("role") == "assistant": - last_msg["tool_calls"] = pending_tool_calls - pending_tool_calls = [] - else: - if strict: - role = last_msg.get("role") - raise ValueError( - f"ToolCallMessage found but previous message is not assistant (role={role})" - ) - _LOGGER.warning( - "ToolCallMessage found but previous message is not assistant, discarding tool calls" - ) - pending_tool_calls = [] - - # Add the current message - chat_messages.append(msg_dict) - - # Flush any remaining tool calls - if pending_tool_calls and chat_messages: - last_msg = chat_messages[-1] - if last_msg.get("role") == "assistant": - last_msg["tool_calls"] = pending_tool_calls - elif strict: - raise ValueError("ToolCallMessage at end but last message is not assistant") - - kwargs: dict[str, Any] = { - "messages": chat_messages, - } - - # Add system prompt as first message if present - if self.system_prompt: - kwargs["messages"] = [ - {"role": "system", "content": self.system_prompt}, - *kwargs["messages"], - ] - - # Add optional parameters (filter None values) - if self.max_output_tokens is not None: - kwargs["max_completion_tokens"] = self.max_output_tokens - if self.temperature is not None: - kwargs["temperature"] = self.temperature - if self.top_p is not None: - kwargs["top_p"] = self.top_p - if self.stream: - kwargs["stream"] = True - if self.tools: - kwargs["tools"] = [_tool_to_chat_completions(tool) for tool in self.tools] - if self.tool_choice is not None: - kwargs["tool_choice"] = _tool_choice_to_openai(self.tool_choice) - if self.metadata: - kwargs["metadata"] = self.metadata - if self.service_tier and self.service_tier != "standard_only": - kwargs["service_tier"] = self.service_tier - if self.stop_sequences: - # OpenAI Chat supports up to 4 stop sequences - kwargs["stop"] = self.stop_sequences[:4] - - return kwargs + return ChatCompletionConverter().to_external(self, strict=strict) def to_responses_kwargs(self, *, strict: bool = True) -> dict[str, 
Any]: """Convert to OpenAI Responses API format. + This is a convenience wrapper around ResponsesConverter. + Args: strict: If True, raise ValueError on information loss. If False, log warnings. @@ -633,50 +584,13 @@ def to_responses_kwargs(self, *, strict: bool = True) -> dict[str, Any]: - top_k is not supported (Claude-specific) - service_tier="standard_only" is not supported """ - # Check for information loss - lost_info = [] - if self.stop_sequences: - lost_info.append(f"stop_sequences={self.stop_sequences} (not supported by Responses API)") - if self.top_k is not None: - lost_info.append(f"top_k={self.top_k} (Claude-specific, not supported)") - if self.service_tier == "standard_only": - lost_info.append("service_tier='standard_only' (not supported by Responses API)") - - if lost_info: - msg = f"Converting to Responses will lose information: {'; '.join(lost_info)}" - if strict: - raise ValueError(msg) - _LOGGER.warning(msg) - - kwargs: dict[str, Any] = { - "input": self._messages_to_responses_input(), - } - - if self.system_prompt: - kwargs["instructions"] = self.system_prompt - - if self.max_output_tokens is not None: - kwargs["max_output_tokens"] = self.max_output_tokens - if self.temperature is not None: - kwargs["temperature"] = self.temperature - if self.top_p is not None: - kwargs["top_p"] = self.top_p - if self.stream: - kwargs["stream"] = True - if self.tools: - kwargs["tools"] = [_tool_to_responses(tool) for tool in self.tools] - if self.tool_choice is not None: - kwargs["tool_choice"] = _tool_choice_to_responses(self.tool_choice) - if self.metadata: - kwargs["metadata"] = self.metadata - if self.service_tier and self.service_tier != "standard_only": - kwargs["service_tier"] = self.service_tier - - return kwargs + return ResponsesConverter().to_external(self, strict=strict) def to_messages_kwargs(self, *, strict: bool = True) -> dict[str, Any]: """Convert to Claude Messages API format. + This is a convenience wrapper around MessagesConverter. + Args: strict: If True, raise ValueError on information loss. If False, log warnings. 
@@ -694,60 +608,7 @@ def to_messages_kwargs(self, *, strict: bool = True) -> dict[str, Any]: - service_tier options are limited to "auto" and "standard_only" - tool schemas may need conversion (not implemented yet) """ - # Check for information loss - lost_info = [] - if self.service_tier not in (None, "auto", "standard_only"): - lost_info.append(f"service_tier='{self.service_tier}' (Claude only supports 'auto' and 'standard_only')") - - # Check for filtered messages - filtered_messages = [] - for msg in self.messages: - msg_dict = dict(msg) - role = msg_dict["role"] - if role in ("system", "developer"): - content = str(msg_dict.get("content", ""))[:50] - filtered_messages.append(f"{role} message: {content}...") - - if filtered_messages: - lost_info.append(f"Messages filtered out (use system_prompt instead): {'; '.join(filtered_messages)}") - - if lost_info: - msg = f"Converting to Claude Messages will lose information: {'; '.join(lost_info)}" - if strict: - raise ValueError(msg) - _LOGGER.warning(msg) - - kwargs: dict[str, Any] = { - "messages": self._messages_to_claude_format(strict=strict), - "max_tokens": self.max_output_tokens or 1024, # max_tokens is required by Claude - } - - if self.system_prompt: - kwargs["system"] = self.system_prompt - - if self.temperature is not None: - # Convert from OpenAI range (0-2) to Claude range (0-1) - kwargs["temperature"] = min(self.temperature / 2.0, 1.0) - if self.top_p is not None: - kwargs["top_p"] = self.top_p - if self.top_k is not None: - kwargs["top_k"] = self.top_k - if self.stream: - kwargs["stream"] = True - if self.tools: - # Convert tools to Anthropic format (adds explicit type: "custom") - kwargs["tools"] = [_tool_to_anthropic(tool) for tool in self.tools] - if self.tool_choice is not None: - kwargs["tool_choice"] = _tool_choice_to_anthropic(self.tool_choice) - if self.metadata: - # Claude uses metadata.user_id specifically - kwargs["metadata"] = self.metadata - if self.service_tier in ("auto", "standard_only"): - kwargs["service_tier"] = self.service_tier - if self.stop_sequences: - kwargs["stop_sequences"] = self.stop_sequences - - return kwargs + return MessagesConverter().to_external(self, strict=strict) def _messages_to_responses_input(self) -> list[dict[str, Any]]: """Convert messages from internal format to Responses input items. @@ -875,6 +736,213 @@ def from_chat_completion( ) -> "LLMRequest": """Create LLMRequest from OpenAI Chat Completions API kwargs. + This is a convenience wrapper around ChatCompletionConverter. + + Args: + kwargs: OpenAI Chat Completions API parameters + strict: If True, raise ValueError for unhandled parameters. If False, log warnings. + + Returns: + LLMRequest instance + + Raises: + ValueError: If strict=True and there are unhandled parameters + + Note: + Model parameter is ignored - it should be managed by the LLMClient + """ + return ChatCompletionConverter().from_external(kwargs, strict=strict) + + @classmethod + def from_responses( + cls, + kwargs: openai.types.responses.response_create_params.ResponseCreateParamsBase, + *, + strict: bool = True, + ) -> "LLMRequest": + """Create LLMRequest from OpenAI Responses API kwargs. + + This is a convenience wrapper around ResponsesConverter. + + Args: + kwargs: OpenAI Responses API parameters + strict: If True, raise ValueError for unhandled parameters. If False, log warnings. 
+ + Returns: + LLMRequest instance + + Raises: + ValueError: If strict=True and there are unhandled parameters + + Note: + Model parameter is ignored - it should be managed by the LLMClient + """ + return ResponsesConverter().from_external(kwargs, strict=strict) + + @classmethod + def from_messages( + cls, + kwargs: anthropic.types.MessageCreateParams, + *, + strict: bool = True, + ) -> "LLMRequest": + """Create LLMRequest from Claude Messages API kwargs. + + This is a convenience wrapper around MessagesConverter. + + Args: + kwargs: Claude Messages API parameters + strict: If True, raise ValueError for unhandled parameters. If False, log warnings. + + Returns: + LLMRequest instance + + Raises: + ValueError: If strict=True and there are unhandled parameters + """ + return MessagesConverter().from_external(kwargs, strict=strict) + + +@dataclasses.dataclass(frozen=True) +class ChatCompletionConverter: + """Converts between LLMRequest and OpenAI Chat Completions format. + + This converter handles bidirectional conversion between ARES's internal LLMRequest + format and the OpenAI Chat Completions API format. + + Conversion Notes: + - top_k is not supported (Claude-specific) + - service_tier="standard_only" is not supported + - stop_sequences truncated to 4 (OpenAI limit) + - system_prompt is converted to/from system message in messages list + - ToolCallMessage flattened into AssistantMessage.tool_calls + """ + + def to_external(self, request: LLMRequest, *, strict: bool = True) -> dict[str, Any]: + """Convert ARES LLMRequest to OpenAI Chat Completions format. + + Args: + request: ARES internal request format + strict: If True, raise ValueError on information loss. If False, log warnings. + + Returns: + Dictionary of kwargs for openai.ChatCompletion.create() (without model) + + Raises: + ValueError: If strict=True and information would be lost in conversion + + Note: + Model parameter is NOT included - it should be added by the LLMClient + """ + # Check for information loss + lost_info = [] + if request.top_k is not None: + lost_info.append(f"top_k={request.top_k} (Claude-specific, not supported)") + if request.service_tier == "standard_only": + lost_info.append("service_tier='standard_only' (not supported by Chat API)") + if request.stop_sequences and len(request.stop_sequences) > 4: + lost_info.append( + f"stop_sequences truncated from {len(request.stop_sequences)} to 4 " + f"(Chat API limit: {request.stop_sequences[4:]} will be dropped)" + ) + + if lost_info: + msg = f"Converting to Chat Completions will lose information: {'; '.join(lost_info)}" + if strict: + raise ValueError(msg) + _LOGGER.warning(msg) + + # Convert messages, flattening ToolCallMessage into AssistantMessage.tool_calls + chat_messages: list[dict[str, Any]] = [] + pending_tool_calls: list[dict[str, Any]] = [] + + for msg in request.messages: + msg_dict = dict(msg) + + # ToolCallMessage → collect for previous assistant message + if "call_id" in msg_dict and "name" in msg_dict and "arguments" in msg_dict: + # This is a ToolCallMessage + pending_tool_calls.append( + { + "id": msg_dict["call_id"], + "type": "function", + "function": { + "name": msg_dict["name"], + "arguments": msg_dict["arguments"], + }, + } + ) + else: + # Flush any pending tool calls to the last assistant message + if pending_tool_calls and chat_messages: + last_msg = chat_messages[-1] + if last_msg.get("role") == "assistant": + last_msg["tool_calls"] = pending_tool_calls + pending_tool_calls = [] + else: + if strict: + role = last_msg.get("role") + raise 
ValueError( + f"ToolCallMessage found but previous message is not assistant (role={role})" + ) + _LOGGER.warning( + "ToolCallMessage found but previous message is not assistant, discarding tool calls" + ) + pending_tool_calls = [] + + # Add the current message + chat_messages.append(msg_dict) + + # Flush any remaining tool calls + if pending_tool_calls and chat_messages: + last_msg = chat_messages[-1] + if last_msg.get("role") == "assistant": + last_msg["tool_calls"] = pending_tool_calls + elif strict: + raise ValueError("ToolCallMessage at end but last message is not assistant") + + kwargs: dict[str, Any] = { + "messages": chat_messages, + } + + # Add system prompt as first message if present + if request.system_prompt: + kwargs["messages"] = [ + {"role": "system", "content": request.system_prompt}, + *kwargs["messages"], + ] + + # Add optional parameters (filter None values) + if request.max_output_tokens is not None: + kwargs["max_completion_tokens"] = request.max_output_tokens + if request.temperature is not None: + kwargs["temperature"] = request.temperature + if request.top_p is not None: + kwargs["top_p"] = request.top_p + if request.stream: + kwargs["stream"] = True + if request.tools: + kwargs["tools"] = [_tool_to_chat_completions(tool) for tool in request.tools] + if request.tool_choice is not None: + kwargs["tool_choice"] = _tool_choice_to_openai(request.tool_choice) + if request.metadata: + kwargs["metadata"] = request.metadata + if request.service_tier and request.service_tier != "standard_only": + kwargs["service_tier"] = request.service_tier + if request.stop_sequences: + # OpenAI Chat supports up to 4 stop sequences + kwargs["stop"] = request.stop_sequences[:4] + + return kwargs + + def from_external( + self, + kwargs: openai.types.chat.completion_create_params.CompletionCreateParams, + *, + strict: bool = True, + ) -> LLMRequest: + """Create LLMRequest from OpenAI Chat Completions API kwargs. + Args: kwargs: OpenAI Chat Completions API parameters strict: If True, raise ValueError for unhandled parameters. If False, log warnings. @@ -1003,7 +1071,7 @@ def from_chat_completion( if system_prompt: final_system_prompt = _extract_string_content(system_prompt, strict=strict, context="System prompt") - return cls( + return LLMRequest( messages=filtered_messages, max_output_tokens=kwargs.get("max_completion_tokens") or kwargs.get("max_tokens"), temperature=kwargs.get("temperature"), @@ -1017,13 +1085,85 @@ def from_chat_completion( system_prompt=final_system_prompt, ) - @classmethod - def from_responses( - cls, + +@dataclasses.dataclass(frozen=True) +class ResponsesConverter: + """Converts between LLMRequest and OpenAI Responses format. + + This converter handles bidirectional conversion between ARES's internal LLMRequest + format and the OpenAI Responses API format. + + Conversion Notes: + - stop_sequences are not supported in Responses API + - top_k is not supported (Claude-specific) + - service_tier="standard_only" is not supported + - system_prompt mapped to/from instructions parameter + - messages converted to/from input items + """ + + def to_external(self, request: LLMRequest, *, strict: bool = True) -> dict[str, Any]: + """Convert ARES LLMRequest to OpenAI Responses format. + + Args: + request: ARES internal request format + strict: If True, raise ValueError on information loss. If False, log warnings. 
+ + Returns: + Dictionary of kwargs for openai.Responses.create() (without model) + + Raises: + ValueError: If strict=True and information would be lost in conversion + + Note: + Model parameter is NOT included - it should be added by the LLMClient + """ + # Check for information loss + lost_info = [] + if request.stop_sequences: + lost_info.append(f"stop_sequences={request.stop_sequences} (not supported by Responses API)") + if request.top_k is not None: + lost_info.append(f"top_k={request.top_k} (Claude-specific, not supported)") + if request.service_tier == "standard_only": + lost_info.append("service_tier='standard_only' (not supported by Responses API)") + + if lost_info: + msg = f"Converting to Responses will lose information: {'; '.join(lost_info)}" + if strict: + raise ValueError(msg) + _LOGGER.warning(msg) + + kwargs: dict[str, Any] = { + "input": request._messages_to_responses_input(), + } + + if request.system_prompt: + kwargs["instructions"] = request.system_prompt + + if request.max_output_tokens is not None: + kwargs["max_output_tokens"] = request.max_output_tokens + if request.temperature is not None: + kwargs["temperature"] = request.temperature + if request.top_p is not None: + kwargs["top_p"] = request.top_p + if request.stream: + kwargs["stream"] = True + if request.tools: + kwargs["tools"] = [_tool_to_responses(tool) for tool in request.tools] + if request.tool_choice is not None: + kwargs["tool_choice"] = _tool_choice_to_responses(request.tool_choice) + if request.metadata: + kwargs["metadata"] = request.metadata + if request.service_tier and request.service_tier != "standard_only": + kwargs["service_tier"] = request.service_tier + + return kwargs + + def from_external( + self, kwargs: openai.types.responses.response_create_params.ResponseCreateParamsBase, *, strict: bool = True, - ) -> "LLMRequest": + ) -> LLMRequest: """Create LLMRequest from OpenAI Responses API kwargs. Args: @@ -1166,7 +1306,7 @@ def from_responses( if temp_tools: converted_tools = temp_tools - return cls( + return LLMRequest( messages=filtered_messages, max_output_tokens=kwargs.get("max_output_tokens"), temperature=kwargs.get("temperature"), @@ -1179,13 +1319,99 @@ def from_responses( system_prompt=kwargs.get("instructions"), ) - @classmethod - def from_messages( - cls, + +@dataclasses.dataclass(frozen=True) +class MessagesConverter: + """Converts between LLMRequest and Anthropic Messages format. + + This converter handles bidirectional conversion between ARES's internal LLMRequest + format and the Anthropic Messages API format. + + Conversion Notes: + - temperature converted between OpenAI range (0-2) and Claude range (0-1) + - messages must alternate user/assistant (enforced by Claude API) + - system_prompt mapped to/from system parameter + - service_tier options limited to "auto" and "standard_only" + - top_k is Claude-specific (supported) + """ + + def to_external(self, request: LLMRequest, *, strict: bool = True) -> dict[str, Any]: + """Convert ARES LLMRequest to Claude Messages format. + + Args: + request: ARES internal request format + strict: If True, raise ValueError on information loss. If False, log warnings. 
+ + Returns: + Dictionary of kwargs for anthropic.messages.create() (without model) + + Raises: + ValueError: If strict=True and information would be lost in conversion + + Note: + Model parameter is NOT included - it should be added by the LLMClient + """ + # Check for information loss + lost_info = [] + if request.service_tier not in (None, "auto", "standard_only"): + lost_info.append(f"service_tier='{request.service_tier}' (Claude only supports 'auto' and 'standard_only')") + + # Check for filtered messages + filtered_messages = [] + for msg in request.messages: + msg_dict = dict(msg) + role = msg_dict["role"] + if role in ("system", "developer"): + content = str(msg_dict.get("content", ""))[:50] + filtered_messages.append(f"{role} message: {content}...") + + if filtered_messages: + lost_info.append(f"Messages filtered out (use system_prompt instead): {'; '.join(filtered_messages)}") + + if lost_info: + msg = f"Converting to Claude Messages will lose information: {'; '.join(lost_info)}" + if strict: + raise ValueError(msg) + _LOGGER.warning(msg) + + kwargs: dict[str, Any] = { + "messages": request._messages_to_claude_format(strict=strict), + "max_tokens": request.max_output_tokens or 1024, # max_tokens is required by Claude + } + + if request.system_prompt: + kwargs["system"] = request.system_prompt + + if request.temperature is not None: + # Convert from OpenAI range (0-2) to Claude range (0-1) + kwargs["temperature"] = min(request.temperature / 2.0, 1.0) + if request.top_p is not None: + kwargs["top_p"] = request.top_p + if request.top_k is not None: + kwargs["top_k"] = request.top_k + if request.stream: + kwargs["stream"] = True + if request.tools: + # Convert tools to Anthropic format (adds explicit type: "custom") + kwargs["tools"] = [_tool_to_anthropic(tool) for tool in request.tools] + if request.tool_choice is not None: + kwargs["tool_choice"] = _tool_choice_to_anthropic(request.tool_choice) + if request.metadata: + # Claude uses metadata.user_id specifically + kwargs["metadata"] = request.metadata + if request.service_tier in ("auto", "standard_only"): + kwargs["service_tier"] = request.service_tier + if request.stop_sequences: + kwargs["stop_sequences"] = request.stop_sequences + + return kwargs + + def from_external( + self, kwargs: anthropic.types.MessageCreateParams, *, strict: bool = True, - ) -> "LLMRequest": + ) -> LLMRequest: """Create LLMRequest from Claude Messages API kwargs. 
Args: @@ -1267,7 +1493,7 @@ def from_messages( raise _LOGGER.warning("Skipping invalid tool: %s", e) - return cls( + return LLMRequest( messages=filtered_messages, max_output_tokens=kwargs["max_tokens"], temperature=temperature, From 2d293f5f7bef4dfca07ccdae61a1fa597782102d Mon Sep 17 00:00:00 2001 From: Nithin Date: Sat, 31 Jan 2026 18:10:08 -0500 Subject: [PATCH 2/4] Add stdout/stderr separation to ExecResult - Add stdout and stderr fields to ExecResult dataclass - Keep output property for backward compatibility (returns stdout + stderr) - Update DaytonaContainer and DockerContainer implementations - Update MockContainer and all test mocks - All tests pass (21/21), fully backward compatible Resolves container execution output TODO in containers.py:11 --- .../terminus2/terminus2_agent_test.py | 20 ++++++++-------- src/ares/containers/containers.py | 24 +++++++++++++++++-- src/ares/containers/daytona.py | 4 +++- src/ares/containers/docker.py | 4 +++- src/ares/testing/mock_container.py | 2 +- src/ares/testing/mock_container_test.py | 7 +++--- 6 files changed, 43 insertions(+), 18 deletions(-) diff --git a/src/ares/code_agents/terminus2/terminus2_agent_test.py b/src/ares/code_agents/terminus2/terminus2_agent_test.py index 693923b..95f5a70 100644 --- a/src/ares/code_agents/terminus2/terminus2_agent_test.py +++ b/src/ares/code_agents/terminus2/terminus2_agent_test.py @@ -33,7 +33,7 @@ def handle_command(self, command: str) -> containers.ExecResult: session_name = match.group(1) self.sessions[session_name] = "active" self.panes[session_name] = "" - return containers.ExecResult(output="", exit_code=0) + return containers.ExecResult(stdout="", stderr="", exit_code=0) elif "tmux send-keys" in command and "-l" in command: match = re.search(r"-t\s+(\S+)", command) @@ -44,7 +44,7 @@ def handle_command(self, command: str) -> containers.ExecResult: if text_match and session_name in self.panes: text = text_match.group(1) or text_match.group(2) or "" self.panes[session_name] += text - return containers.ExecResult(output="", exit_code=0) + return containers.ExecResult(stdout="", stderr="", exit_code=0) elif "tmux send-keys" in command and "Enter" in command: match = re.search(r"-t\s+(\S+)", command) @@ -55,7 +55,7 @@ def handle_command(self, command: str) -> containers.ExecResult: self.panes[session_name] += "\n" if typed_command.strip(): self.panes[session_name] += f"[executed: {typed_command}]\n" - return containers.ExecResult(output="", exit_code=0) + return containers.ExecResult(stdout="", stderr="", exit_code=0) elif "tmux capture-pane" in command: match = re.search(r"-t\s+(\S+)", command) @@ -63,7 +63,7 @@ def handle_command(self, command: str) -> containers.ExecResult: if match: session_name = match.group(1) output = self.panes.get(session_name, "") - return containers.ExecResult(output=output, exit_code=0) + return containers.ExecResult(stdout=output, stderr="", exit_code=0) elif "tmux kill-session" in command: match = re.search(r"-t\s+(\S+)", command) @@ -71,24 +71,24 @@ def handle_command(self, command: str) -> containers.ExecResult: session_name = match.group(1) self.sessions.pop(session_name, None) self.panes.pop(session_name, None) - return containers.ExecResult(output="", exit_code=0) + return containers.ExecResult(stdout="", stderr="", exit_code=0) elif "tmux has-session" in command: match = re.search(r"-t\s+(\S+)", command) if match: session_name = match.group(1) exit_code = 0 if session_name in self.sessions else 1 - return containers.ExecResult(output="", exit_code=exit_code) - return 
containers.ExecResult(output="", exit_code=1) + return containers.ExecResult(stdout="", stderr="", exit_code=exit_code) + return containers.ExecResult(stdout="", stderr="", exit_code=1) elif "which tmux" in command: - return containers.ExecResult(output="/usr/bin/tmux", exit_code=0) + return containers.ExecResult(stdout="/usr/bin/tmux", stderr="", exit_code=0) elif "tmux set-option" in command: - return containers.ExecResult(output="", exit_code=0) + return containers.ExecResult(stdout="", stderr="", exit_code=0) # Default success for other commands - return containers.ExecResult(output="", exit_code=0) + return containers.ExecResult(stdout="", stderr="", exit_code=0) class TestTerminus2AgentBasics: diff --git a/src/ares/containers/containers.py b/src/ares/containers/containers.py index d4da940..fa56a2d 100644 --- a/src/ares/containers/containers.py +++ b/src/ares/containers/containers.py @@ -8,10 +8,30 @@ @dataclasses.dataclass(frozen=True) class ExecResult: - # TODO: Maybe stdout/stderr? - output: str + """Result of executing a command in a container. + + Attributes: + stdout: Standard output from the command. + stderr: Standard error output from the command. + exit_code: Exit code of the command (0 typically means success). + output: Combined stdout + stderr for backward compatibility. + This is a computed property - prefer using stdout/stderr directly. + """ + + stdout: str + stderr: str exit_code: int + @property + def output(self) -> str: + """Combined stdout and stderr for backward compatibility. + + Returns stdout + stderr concatenated. For new code, prefer accessing + stdout and stderr separately for better error handling. + """ + # Combine with stderr second so errors appear at the end + return self.stdout + self.stderr + @dataclasses.dataclass(frozen=True) class Resources: diff --git a/src/ares/containers/daytona.py b/src/ares/containers/daytona.py index 497ef91..cb4df04 100644 --- a/src/ares/containers/daytona.py +++ b/src/ares/containers/daytona.py @@ -153,7 +153,9 @@ async def exec_run( if float(int_exit_code) != exit_code: raise ValueError(f"Exit code is not an integer: {exit_code}") - return containers.ExecResult(output=result.result, exit_code=int_exit_code) + # Daytona provides combined stdout+stderr in result field + # Put it in stdout, leave stderr empty for now + return containers.ExecResult(stdout=result.result, stderr="", exit_code=int_exit_code) def stop_and_remove(self) -> None: """Stop and remove the container.""" diff --git a/src/ares/containers/docker.py b/src/ares/containers/docker.py index 6d1cae5..5eaaa79 100644 --- a/src/ares/containers/docker.py +++ b/src/ares/containers/docker.py @@ -118,7 +118,9 @@ async def exec_run( timeout=timeout_s, ) result_str = result.output.decode("utf-8", errors="replace") - return containers.ExecResult(output=result_str, exit_code=result.exit_code) + # Docker provides combined stdout+stderr in output field + # Put it in stdout, leave stderr empty for now + return containers.ExecResult(stdout=result_str, stderr="", exit_code=result.exit_code) def stop_and_remove(self) -> None: """Stop and remove the container.""" diff --git a/src/ares/testing/mock_container.py b/src/ares/testing/mock_container.py index cd2998e..f0b1b08 100644 --- a/src/ares/testing/mock_container.py +++ b/src/ares/testing/mock_container.py @@ -73,7 +73,7 @@ async def exec_run( return self.exec_responses[command] # Default to successful empty response - return containers.ExecResult(output="", exit_code=0) + return containers.ExecResult(stdout="", stderr="", 
exit_code=0) async def upload_files(self, local_paths: list[pathlib.Path], remote_paths: list[str]) -> None: """Record uploaded files.""" diff --git a/src/ares/testing/mock_container_test.py b/src/ares/testing/mock_container_test.py index 7514b46..f5b4da6 100644 --- a/src/ares/testing/mock_container_test.py +++ b/src/ares/testing/mock_container_test.py @@ -55,7 +55,8 @@ async def test_mock_container_exec_run_configured_response(): """Test that exec_run uses configured responses.""" container = mock_container.MockContainer() container.exec_responses["ls -la"] = containers.ExecResult( - output="file1.txt\nfile2.txt", + stdout="file1.txt\nfile2.txt", + stderr="", exit_code=0, ) @@ -72,8 +73,8 @@ async def test_mock_container_exec_run_custom_handler(): def handler(command: str) -> containers.ExecResult: if "error" in command: - return containers.ExecResult(output="Error!", exit_code=1) - return containers.ExecResult(output=f"Executed: {command}", exit_code=0) + return containers.ExecResult(stdout="", stderr="Error!", exit_code=1) + return containers.ExecResult(stdout=f"Executed: {command}", stderr="", exit_code=0) container.exec_handler = handler From 323c6f62de680e97d78f67df5a320ddf57fd707c Mon Sep 17 00:00:00 2001 From: Nithin Date: Sat, 31 Jan 2026 19:27:40 -0500 Subject: [PATCH 3/4] Convert timeout template to use Jinja2 for consistency - Update _render_timeout_template to use Jinja2.Template instead of str.format() - Convert _TIMEOUT_TEMPLATE to use Jinja2 syntax ({{ variable }} instead of {variable}) - Add docstring to _render_timeout_template function - Improves consistency with other template rendering functions - Tested and verified correct rendering Resolves TODO in mini_swe_agent.py:109 --- src/ares/code_agents/mini_swe_agent.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/ares/code_agents/mini_swe_agent.py b/src/ares/code_agents/mini_swe_agent.py index da2238f..9db64fa 100644 --- a/src/ares/code_agents/mini_swe_agent.py +++ b/src/ares/code_agents/mini_swe_agent.py @@ -38,10 +38,10 @@ # Copied from minisweagent's default config. _TIMEOUT_TEMPLATE = """ -The last command {action} timed out and has been killed. +The last command {{ action }} timed out and has been killed. The output of the command was: -{output} +{{ output }} Please try another command and make sure to avoid those requiring interactive input. """.strip() @@ -106,8 +106,19 @@ def _render_format_error_template(format_error_template: str, actions: list[str] def _render_timeout_template(action: str, output: str) -> str: - # TODO: Use jinja2, and allow updating of configuration. - return _TIMEOUT_TEMPLATE.format(action=action, output=output) + """Render the timeout error message using Jinja2. 
+ + Args: + action: The action/command that timed out + output: Any partial output from the command (may be empty) + + Returns: + Rendered timeout error message + """ + return jinja2.Template(_TIMEOUT_TEMPLATE, undefined=jinja2.StrictUndefined).render( + action=action, + output=output, + ) @dataclasses.dataclass(kw_only=True) From 665ac364c5a34a07f02a137d5ef382ef02b65531 Mon Sep 17 00:00:00 2001 From: Nithin Date: Sat, 31 Jan 2026 19:35:30 -0500 Subject: [PATCH 4/4] Integrate stat tracker into Terminus2 agent - Add timing for initial setup (t2/setup) - tracks tmux session initialization - Add timing for LLM requests (t2/llm_request) - tracks expensive API calls - Remove TODO comment now that tracker is actively used - All tests pass (8/8), no functional changes Resolves TODO in terminus2_agent.py:146 --- src/ares/code_agents/terminus2/terminus2_agent.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/ares/code_agents/terminus2/terminus2_agent.py b/src/ares/code_agents/terminus2/terminus2_agent.py index 5bd76f4..57d3740 100644 --- a/src/ares/code_agents/terminus2/terminus2_agent.py +++ b/src/ares/code_agents/terminus2/terminus2_agent.py @@ -143,7 +143,6 @@ class Terminus2Agent(code_agent_base.CodeAgent): container: containers.Container llm_client: llm_clients.LLMClient - # TODO: Actually use the stat tracker in the agent. tracker: stat_tracker.StatTracker = dataclasses.field(default_factory=stat_tracker.NullStatTracker) parser_format: Literal["json", "xml"] = "json" max_turns: int = 1_000_000 # Match terminal-bench reference (effectively unlimited) @@ -489,7 +488,8 @@ async def run(self, task: str) -> None: self._original_instruction = task # Store for summarization # Initialize tmux session to capture initial terminal state - await self._ensure_tmux_session() + with self.tracker.timeit("t2/setup"): + await self._ensure_tmux_session() # Capture initial terminal state using incremental output # First call returns "Current Terminal Screen:\n{visible}" automatically @@ -680,9 +680,10 @@ async def _query_llm(self) -> response.LLMResponse: ) try: - response = await self.llm_client( - request.LLMRequest(messages=self._messages, system_prompt=self._system_prompt) - ) + with self.tracker.timeit("t2/llm_request"): + response = await self.llm_client( + request.LLMRequest(messages=self._messages, system_prompt=self._system_prompt) + ) _LOGGER.debug("[%d] Received LLM response", id(self)) return response
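
---

Usage sketch (illustrative only, not part of the patches): the converter Protocol from PATCH 1/4 and the stdout/stderr split from PATCH 2/4 can be exercised as below. This assumes the module paths shown in the diffs (ares.llms.request, ares.containers.containers); the message content and field values are hypothetical, and field names mirror the LLMRequest dataclass and ExecResult shown above.

    from ares.containers import containers
    from ares.llms import request

    # Build an internal request; field names follow the LLMRequest dataclass in the diff.
    req = request.LLMRequest(
        messages=[{"role": "user", "content": "List the files in /tmp"}],
        system_prompt="You are a helpful shell assistant.",
        max_output_tokens=256,
        temperature=0.2,
    )

    # The LLMRequest method delegates to the converter, so both paths yield the same kwargs.
    kwargs_via_method = req.to_chat_completion_kwargs(strict=True)
    kwargs_via_converter = request.ChatCompletionConverter().to_external(req, strict=True)
    assert kwargs_via_method == kwargs_via_converter

    # Round-trip back to the internal format; the model parameter stays with the LLMClient.
    restored = request.ChatCompletionConverter().from_external(kwargs_via_converter, strict=True)

    # ExecResult now separates streams; the output property keeps the combined view.
    result = containers.ExecResult(stdout="file1.txt\n", stderr="warning: slow disk\n", exit_code=0)
    assert result.output == result.stdout + result.stderr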