diff --git a/src/ares/code_agents/mini_swe_agent.py b/src/ares/code_agents/mini_swe_agent.py
index da2238f..9db64fa 100644
--- a/src/ares/code_agents/mini_swe_agent.py
+++ b/src/ares/code_agents/mini_swe_agent.py
@@ -38,10 +38,10 @@
# Copied from minisweagent's default config.
_TIMEOUT_TEMPLATE = """
-The last command {action} timed out and has been killed.
+The last command {{ action }} timed out and has been killed.
The output of the command was:
Please try another command and make sure to avoid those requiring interactive input.
""".strip()
@@ -106,8 +106,19 @@ def _render_format_error_template(format_error_template: str, actions: list[str]
def _render_timeout_template(action: str, output: str) -> str:
- # TODO: Use jinja2, and allow updating of configuration.
- return _TIMEOUT_TEMPLATE.format(action=action, output=output)
+ """Render the timeout error message using Jinja2.
+
+ Args:
+ action: The action/command that timed out
+ output: Any partial output from the command (may be empty)
+
+ Returns:
+ Rendered timeout error message
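+
+    Example:
+        A minimal sketch of a rendered message (the action and output values are illustrative):
+
+            message = _render_timeout_template(action="sleep 600", output="")
+            # message begins with "The last command sleep 600 timed out and has been killed."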
+ """
+ return jinja2.Template(_TIMEOUT_TEMPLATE, undefined=jinja2.StrictUndefined).render(
+ action=action,
+ output=output,
+ )
@dataclasses.dataclass(kw_only=True)
diff --git a/src/ares/code_agents/terminus2/terminus2_agent.py b/src/ares/code_agents/terminus2/terminus2_agent.py
index 5bd76f4..57d3740 100644
--- a/src/ares/code_agents/terminus2/terminus2_agent.py
+++ b/src/ares/code_agents/terminus2/terminus2_agent.py
@@ -143,7 +143,6 @@ class Terminus2Agent(code_agent_base.CodeAgent):
container: containers.Container
llm_client: llm_clients.LLMClient
- # TODO: Actually use the stat tracker in the agent.
tracker: stat_tracker.StatTracker = dataclasses.field(default_factory=stat_tracker.NullStatTracker)
parser_format: Literal["json", "xml"] = "json"
max_turns: int = 1_000_000 # Match terminal-bench reference (effectively unlimited)
@@ -489,7 +488,8 @@ async def run(self, task: str) -> None:
self._original_instruction = task # Store for summarization
# Initialize tmux session to capture initial terminal state
- await self._ensure_tmux_session()
+ with self.tracker.timeit("t2/setup"):
+ await self._ensure_tmux_session()
# Capture initial terminal state using incremental output
# First call returns "Current Terminal Screen:\n{visible}" automatically
@@ -680,9 +680,10 @@ async def _query_llm(self) -> response.LLMResponse:
)
try:
- response = await self.llm_client(
- request.LLMRequest(messages=self._messages, system_prompt=self._system_prompt)
- )
+ with self.tracker.timeit("t2/llm_request"):
+ response = await self.llm_client(
+ request.LLMRequest(messages=self._messages, system_prompt=self._system_prompt)
+ )
_LOGGER.debug("[%d] Received LLM response", id(self))
return response
diff --git a/src/ares/code_agents/terminus2/terminus2_agent_test.py b/src/ares/code_agents/terminus2/terminus2_agent_test.py
index 693923b..95f5a70 100644
--- a/src/ares/code_agents/terminus2/terminus2_agent_test.py
+++ b/src/ares/code_agents/terminus2/terminus2_agent_test.py
@@ -33,7 +33,7 @@ def handle_command(self, command: str) -> containers.ExecResult:
session_name = match.group(1)
self.sessions[session_name] = "active"
self.panes[session_name] = ""
- return containers.ExecResult(output="", exit_code=0)
+ return containers.ExecResult(stdout="", stderr="", exit_code=0)
elif "tmux send-keys" in command and "-l" in command:
match = re.search(r"-t\s+(\S+)", command)
@@ -44,7 +44,7 @@ def handle_command(self, command: str) -> containers.ExecResult:
if text_match and session_name in self.panes:
text = text_match.group(1) or text_match.group(2) or ""
self.panes[session_name] += text
- return containers.ExecResult(output="", exit_code=0)
+ return containers.ExecResult(stdout="", stderr="", exit_code=0)
elif "tmux send-keys" in command and "Enter" in command:
match = re.search(r"-t\s+(\S+)", command)
@@ -55,7 +55,7 @@ def handle_command(self, command: str) -> containers.ExecResult:
self.panes[session_name] += "\n"
if typed_command.strip():
self.panes[session_name] += f"[executed: {typed_command}]\n"
- return containers.ExecResult(output="", exit_code=0)
+ return containers.ExecResult(stdout="", stderr="", exit_code=0)
elif "tmux capture-pane" in command:
match = re.search(r"-t\s+(\S+)", command)
@@ -63,7 +63,7 @@ def handle_command(self, command: str) -> containers.ExecResult:
if match:
session_name = match.group(1)
output = self.panes.get(session_name, "")
- return containers.ExecResult(output=output, exit_code=0)
+ return containers.ExecResult(stdout=output, stderr="", exit_code=0)
elif "tmux kill-session" in command:
match = re.search(r"-t\s+(\S+)", command)
@@ -71,24 +71,24 @@ def handle_command(self, command: str) -> containers.ExecResult:
session_name = match.group(1)
self.sessions.pop(session_name, None)
self.panes.pop(session_name, None)
- return containers.ExecResult(output="", exit_code=0)
+ return containers.ExecResult(stdout="", stderr="", exit_code=0)
elif "tmux has-session" in command:
match = re.search(r"-t\s+(\S+)", command)
if match:
session_name = match.group(1)
exit_code = 0 if session_name in self.sessions else 1
- return containers.ExecResult(output="", exit_code=exit_code)
- return containers.ExecResult(output="", exit_code=1)
+ return containers.ExecResult(stdout="", stderr="", exit_code=exit_code)
+ return containers.ExecResult(stdout="", stderr="", exit_code=1)
elif "which tmux" in command:
- return containers.ExecResult(output="/usr/bin/tmux", exit_code=0)
+ return containers.ExecResult(stdout="/usr/bin/tmux", stderr="", exit_code=0)
elif "tmux set-option" in command:
- return containers.ExecResult(output="", exit_code=0)
+ return containers.ExecResult(stdout="", stderr="", exit_code=0)
# Default success for other commands
- return containers.ExecResult(output="", exit_code=0)
+ return containers.ExecResult(stdout="", stderr="", exit_code=0)
class TestTerminus2AgentBasics:
diff --git a/src/ares/containers/containers.py b/src/ares/containers/containers.py
index d4da940..fa56a2d 100644
--- a/src/ares/containers/containers.py
+++ b/src/ares/containers/containers.py
@@ -8,10 +8,30 @@
@dataclasses.dataclass(frozen=True)
class ExecResult:
- # TODO: Maybe stdout/stderr?
- output: str
+ """Result of executing a command in a container.
+
+ Attributes:
+ stdout: Standard output from the command.
+ stderr: Standard error output from the command.
+ exit_code: Exit code of the command (0 typically means success).
+        output: Combined stdout + stderr, kept for backward compatibility.
+            This is a computed property; prefer using stdout and stderr directly.
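+
+    Example:
+        A minimal illustrative sketch of constructing a result and reading the
+        combined output property:
+
+            result = ExecResult(stdout="ok", stderr="warn", exit_code=0)
+            assert result.output == "okwarn"  # stdout first, then stderr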
+ """
+
+ stdout: str
+ stderr: str
exit_code: int
+ @property
+ def output(self) -> str:
+ """Combined stdout and stderr for backward compatibility.
+
+ Returns stdout + stderr concatenated. For new code, prefer accessing
+ stdout and stderr separately for better error handling.
+ """
+ # Combine with stderr second so errors appear at the end
+ return self.stdout + self.stderr
+
@dataclasses.dataclass(frozen=True)
class Resources:
diff --git a/src/ares/containers/daytona.py b/src/ares/containers/daytona.py
index 497ef91..cb4df04 100644
--- a/src/ares/containers/daytona.py
+++ b/src/ares/containers/daytona.py
@@ -153,7 +153,9 @@ async def exec_run(
if float(int_exit_code) != exit_code:
raise ValueError(f"Exit code is not an integer: {exit_code}")
- return containers.ExecResult(output=result.result, exit_code=int_exit_code)
+ # Daytona provides combined stdout+stderr in result field
+ # Put it in stdout, leave stderr empty for now
+ return containers.ExecResult(stdout=result.result, stderr="", exit_code=int_exit_code)
def stop_and_remove(self) -> None:
"""Stop and remove the container."""
diff --git a/src/ares/containers/docker.py b/src/ares/containers/docker.py
index 6d1cae5..5eaaa79 100644
--- a/src/ares/containers/docker.py
+++ b/src/ares/containers/docker.py
@@ -118,7 +118,9 @@ async def exec_run(
timeout=timeout_s,
)
result_str = result.output.decode("utf-8", errors="replace")
- return containers.ExecResult(output=result_str, exit_code=result.exit_code)
+ # Docker provides combined stdout+stderr in output field
+ # Put it in stdout, leave stderr empty for now
+ return containers.ExecResult(stdout=result_str, stderr="", exit_code=result.exit_code)
def stop_and_remove(self) -> None:
"""Stop and remove the container."""
diff --git a/src/ares/llms/request.py b/src/ares/llms/request.py
index 0da06d7..6e881be 100644
--- a/src/ares/llms/request.py
+++ b/src/ares/llms/request.py
@@ -2,7 +2,7 @@
import dataclasses
import logging
-from typing import Any, Literal, NotRequired, Required, TypedDict, cast
+from typing import Any, Literal, NotRequired, Protocol, Required, TypedDict, cast
import anthropic.types
import openai.types.chat
@@ -450,6 +450,52 @@ def _tool_choice_from_anthropic(
return None
+class RequestConverter[RequestType](Protocol):
+ """Converts between ARES LLMRequest and external API formats.
+
+ This protocol defines the interface for bidirectional conversion between ARES's internal
+ LLMRequest format and external API request formats (OpenAI Chat Completions, OpenAI Responses,
+ Anthropic Messages, etc.).
+
+ Type Parameters:
+ RequestType: The external API's request parameters type (e.g., dict[str, Any] for kwargs)
+
+ Note:
+        Implementations should be stateless (e.g., frozen dataclasses) so they can be shared safely across async tasks.
+ The model parameter is NOT included in conversions - it should be managed by the LLMClient.
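+
+    Example:
+        A minimal sketch, assuming an already-built LLMRequest named request and using the
+        ChatCompletionConverter defined below:
+
+            converter = ChatCompletionConverter()  # satisfies RequestConverter[dict[str, Any]]
+            kwargs = converter.to_external(request, strict=False)
+            round_tripped = converter.from_external(kwargs, strict=False)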
+ """
+
+ def to_external(self, request: "LLMRequest", *, strict: bool = True) -> RequestType:
+ """Convert ARES LLMRequest to external API format.
+
+ Args:
+ request: ARES internal request format
+ strict: If True, raise ValueError on information loss. If False, log warnings.
+
+ Returns:
+ Request parameters in external API format (without model parameter)
+
+ Raises:
+ ValueError: If strict=True and information would be lost in conversion
+ """
+ ...
+
+ def from_external(self, kwargs: RequestType, *, strict: bool = True) -> "LLMRequest":
+ """Convert external API format to ARES LLMRequest.
+
+ Args:
+ kwargs: External API request parameters
+ strict: If True, raise ValueError for unhandled parameters. If False, log warnings.
+
+ Returns:
+ LLMRequest instance
+
+ Raises:
+ ValueError: If strict=True and there are unhandled parameters
+ """
+ ...
+
+
@dataclasses.dataclass(frozen=True, kw_only=True)
class LLMRequest:
"""Unified request format for OpenAI Chat Completions, OpenAI Responses, and Claude Messages APIs.
@@ -497,6 +543,8 @@ class LLMRequest:
def to_chat_completion_kwargs(self, *, strict: bool = True) -> dict[str, Any]:
"""Convert to OpenAI Chat Completions API format.
+ This is a convenience wrapper around ChatCompletionConverter.
+
Args:
strict: If True, raise ValueError on information loss. If False, log warnings.
@@ -512,110 +560,13 @@ def to_chat_completion_kwargs(self, *, strict: bool = True) -> dict[str, Any]:
- service_tier="standard_only" is not supported
- stop_sequences truncated to 4 if more provided
"""
- # Check for information loss
- lost_info = []
- if self.top_k is not None:
- lost_info.append(f"top_k={self.top_k} (Claude-specific, not supported)")
- if self.service_tier == "standard_only":
- lost_info.append("service_tier='standard_only' (not supported by Chat API)")
- if self.stop_sequences and len(self.stop_sequences) > 4:
- lost_info.append(
- f"stop_sequences truncated from {len(self.stop_sequences)} to 4 "
- f"(Chat API limit: {self.stop_sequences[4:]} will be dropped)"
- )
-
- if lost_info:
- msg = f"Converting to Chat Completions will lose information: {'; '.join(lost_info)}"
- if strict:
- raise ValueError(msg)
- _LOGGER.warning(msg)
-
- # Convert messages, flattening ToolCallMessage into AssistantMessage.tool_calls
- chat_messages: list[dict[str, Any]] = []
- pending_tool_calls: list[dict[str, Any]] = []
-
- for msg in self.messages:
- msg_dict = dict(msg)
-
- # ToolCallMessage → collect for previous assistant message
- if "call_id" in msg_dict and "name" in msg_dict and "arguments" in msg_dict:
- # This is a ToolCallMessage
- pending_tool_calls.append(
- {
- "id": msg_dict["call_id"],
- "type": "function",
- "function": {
- "name": msg_dict["name"],
- "arguments": msg_dict["arguments"],
- },
- }
- )
- else:
- # Flush any pending tool calls to the last assistant message
- if pending_tool_calls and chat_messages:
- last_msg = chat_messages[-1]
- if last_msg.get("role") == "assistant":
- last_msg["tool_calls"] = pending_tool_calls
- pending_tool_calls = []
- else:
- if strict:
- role = last_msg.get("role")
- raise ValueError(
- f"ToolCallMessage found but previous message is not assistant (role={role})"
- )
- _LOGGER.warning(
- "ToolCallMessage found but previous message is not assistant, discarding tool calls"
- )
- pending_tool_calls = []
-
- # Add the current message
- chat_messages.append(msg_dict)
-
- # Flush any remaining tool calls
- if pending_tool_calls and chat_messages:
- last_msg = chat_messages[-1]
- if last_msg.get("role") == "assistant":
- last_msg["tool_calls"] = pending_tool_calls
- elif strict:
- raise ValueError("ToolCallMessage at end but last message is not assistant")
-
- kwargs: dict[str, Any] = {
- "messages": chat_messages,
- }
-
- # Add system prompt as first message if present
- if self.system_prompt:
- kwargs["messages"] = [
- {"role": "system", "content": self.system_prompt},
- *kwargs["messages"],
- ]
-
- # Add optional parameters (filter None values)
- if self.max_output_tokens is not None:
- kwargs["max_completion_tokens"] = self.max_output_tokens
- if self.temperature is not None:
- kwargs["temperature"] = self.temperature
- if self.top_p is not None:
- kwargs["top_p"] = self.top_p
- if self.stream:
- kwargs["stream"] = True
- if self.tools:
- kwargs["tools"] = [_tool_to_chat_completions(tool) for tool in self.tools]
- if self.tool_choice is not None:
- kwargs["tool_choice"] = _tool_choice_to_openai(self.tool_choice)
- if self.metadata:
- kwargs["metadata"] = self.metadata
- if self.service_tier and self.service_tier != "standard_only":
- kwargs["service_tier"] = self.service_tier
- if self.stop_sequences:
- # OpenAI Chat supports up to 4 stop sequences
- kwargs["stop"] = self.stop_sequences[:4]
-
- return kwargs
+ return ChatCompletionConverter().to_external(self, strict=strict)
def to_responses_kwargs(self, *, strict: bool = True) -> dict[str, Any]:
"""Convert to OpenAI Responses API format.
+ This is a convenience wrapper around ResponsesConverter.
+
Args:
strict: If True, raise ValueError on information loss. If False, log warnings.
@@ -633,50 +584,13 @@ def to_responses_kwargs(self, *, strict: bool = True) -> dict[str, Any]:
- top_k is not supported (Claude-specific)
- service_tier="standard_only" is not supported
"""
- # Check for information loss
- lost_info = []
- if self.stop_sequences:
- lost_info.append(f"stop_sequences={self.stop_sequences} (not supported by Responses API)")
- if self.top_k is not None:
- lost_info.append(f"top_k={self.top_k} (Claude-specific, not supported)")
- if self.service_tier == "standard_only":
- lost_info.append("service_tier='standard_only' (not supported by Responses API)")
-
- if lost_info:
- msg = f"Converting to Responses will lose information: {'; '.join(lost_info)}"
- if strict:
- raise ValueError(msg)
- _LOGGER.warning(msg)
-
- kwargs: dict[str, Any] = {
- "input": self._messages_to_responses_input(),
- }
-
- if self.system_prompt:
- kwargs["instructions"] = self.system_prompt
-
- if self.max_output_tokens is not None:
- kwargs["max_output_tokens"] = self.max_output_tokens
- if self.temperature is not None:
- kwargs["temperature"] = self.temperature
- if self.top_p is not None:
- kwargs["top_p"] = self.top_p
- if self.stream:
- kwargs["stream"] = True
- if self.tools:
- kwargs["tools"] = [_tool_to_responses(tool) for tool in self.tools]
- if self.tool_choice is not None:
- kwargs["tool_choice"] = _tool_choice_to_responses(self.tool_choice)
- if self.metadata:
- kwargs["metadata"] = self.metadata
- if self.service_tier and self.service_tier != "standard_only":
- kwargs["service_tier"] = self.service_tier
-
- return kwargs
+ return ResponsesConverter().to_external(self, strict=strict)
def to_messages_kwargs(self, *, strict: bool = True) -> dict[str, Any]:
"""Convert to Claude Messages API format.
+ This is a convenience wrapper around MessagesConverter.
+
Args:
strict: If True, raise ValueError on information loss. If False, log warnings.
@@ -694,60 +608,7 @@ def to_messages_kwargs(self, *, strict: bool = True) -> dict[str, Any]:
- service_tier options are limited to "auto" and "standard_only"
- tool schemas may need conversion (not implemented yet)
"""
- # Check for information loss
- lost_info = []
- if self.service_tier not in (None, "auto", "standard_only"):
- lost_info.append(f"service_tier='{self.service_tier}' (Claude only supports 'auto' and 'standard_only')")
-
- # Check for filtered messages
- filtered_messages = []
- for msg in self.messages:
- msg_dict = dict(msg)
- role = msg_dict["role"]
- if role in ("system", "developer"):
- content = str(msg_dict.get("content", ""))[:50]
- filtered_messages.append(f"{role} message: {content}...")
-
- if filtered_messages:
- lost_info.append(f"Messages filtered out (use system_prompt instead): {'; '.join(filtered_messages)}")
-
- if lost_info:
- msg = f"Converting to Claude Messages will lose information: {'; '.join(lost_info)}"
- if strict:
- raise ValueError(msg)
- _LOGGER.warning(msg)
-
- kwargs: dict[str, Any] = {
- "messages": self._messages_to_claude_format(strict=strict),
- "max_tokens": self.max_output_tokens or 1024, # max_tokens is required by Claude
- }
-
- if self.system_prompt:
- kwargs["system"] = self.system_prompt
-
- if self.temperature is not None:
- # Convert from OpenAI range (0-2) to Claude range (0-1)
- kwargs["temperature"] = min(self.temperature / 2.0, 1.0)
- if self.top_p is not None:
- kwargs["top_p"] = self.top_p
- if self.top_k is not None:
- kwargs["top_k"] = self.top_k
- if self.stream:
- kwargs["stream"] = True
- if self.tools:
- # Convert tools to Anthropic format (adds explicit type: "custom")
- kwargs["tools"] = [_tool_to_anthropic(tool) for tool in self.tools]
- if self.tool_choice is not None:
- kwargs["tool_choice"] = _tool_choice_to_anthropic(self.tool_choice)
- if self.metadata:
- # Claude uses metadata.user_id specifically
- kwargs["metadata"] = self.metadata
- if self.service_tier in ("auto", "standard_only"):
- kwargs["service_tier"] = self.service_tier
- if self.stop_sequences:
- kwargs["stop_sequences"] = self.stop_sequences
-
- return kwargs
+ return MessagesConverter().to_external(self, strict=strict)
def _messages_to_responses_input(self) -> list[dict[str, Any]]:
"""Convert messages from internal format to Responses input items.
@@ -875,6 +736,213 @@ def from_chat_completion(
) -> "LLMRequest":
"""Create LLMRequest from OpenAI Chat Completions API kwargs.
+ This is a convenience wrapper around ChatCompletionConverter.
+
+ Args:
+ kwargs: OpenAI Chat Completions API parameters
+ strict: If True, raise ValueError for unhandled parameters. If False, log warnings.
+
+ Returns:
+ LLMRequest instance
+
+ Raises:
+ ValueError: If strict=True and there are unhandled parameters
+
+ Note:
+ Model parameter is ignored - it should be managed by the LLMClient
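+
+    Example:
+        A minimal sketch using only parameters the converter handles (any "model" key
+        would be ignored):
+
+            request = LLMRequest.from_chat_completion(
+                {"messages": [{"role": "user", "content": "hi"}], "temperature": 0.2}
+            )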
+ """
+ return ChatCompletionConverter().from_external(kwargs, strict=strict)
+
+ @classmethod
+ def from_responses(
+ cls,
+ kwargs: openai.types.responses.response_create_params.ResponseCreateParamsBase,
+ *,
+ strict: bool = True,
+ ) -> "LLMRequest":
+ """Create LLMRequest from OpenAI Responses API kwargs.
+
+ This is a convenience wrapper around ResponsesConverter.
+
+ Args:
+ kwargs: OpenAI Responses API parameters
+ strict: If True, raise ValueError for unhandled parameters. If False, log warnings.
+
+ Returns:
+ LLMRequest instance
+
+ Raises:
+ ValueError: If strict=True and there are unhandled parameters
+
+ Note:
+ Model parameter is ignored - it should be managed by the LLMClient
+ """
+ return ResponsesConverter().from_external(kwargs, strict=strict)
+
+ @classmethod
+ def from_messages(
+ cls,
+ kwargs: anthropic.types.MessageCreateParams,
+ *,
+ strict: bool = True,
+ ) -> "LLMRequest":
+ """Create LLMRequest from Claude Messages API kwargs.
+
+ This is a convenience wrapper around MessagesConverter.
+
+ Args:
+ kwargs: Claude Messages API parameters
+ strict: If True, raise ValueError for unhandled parameters. If False, log warnings.
+
+ Returns:
+ LLMRequest instance
+
+ Raises:
+ ValueError: If strict=True and there are unhandled parameters
+ """
+ return MessagesConverter().from_external(kwargs, strict=strict)
+
+
+@dataclasses.dataclass(frozen=True)
+class ChatCompletionConverter:
+ """Converts between LLMRequest and OpenAI Chat Completions format.
+
+ This converter handles bidirectional conversion between ARES's internal LLMRequest
+ format and the OpenAI Chat Completions API format.
+
+ Conversion Notes:
+ - top_k is not supported (Claude-specific)
+ - service_tier="standard_only" is not supported
+ - stop_sequences truncated to 4 (OpenAI limit)
+ - system_prompt is converted to/from system message in messages list
+ - ToolCallMessage flattened into AssistantMessage.tool_calls
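+
+    Example:
+        A minimal sketch, assuming an already-built LLMRequest named request:
+
+            kwargs = ChatCompletionConverter().to_external(request, strict=True)
+            # kwargs["messages"] holds the chat messages; the caller supplies "model" separately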
+ """
+
+ def to_external(self, request: LLMRequest, *, strict: bool = True) -> dict[str, Any]:
+ """Convert ARES LLMRequest to OpenAI Chat Completions format.
+
+ Args:
+ request: ARES internal request format
+ strict: If True, raise ValueError on information loss. If False, log warnings.
+
+ Returns:
+            Dictionary of kwargs for the OpenAI client.chat.completions.create() call (without model)
+
+ Raises:
+ ValueError: If strict=True and information would be lost in conversion
+
+ Note:
+ Model parameter is NOT included - it should be added by the LLMClient
+ """
+ # Check for information loss
+ lost_info = []
+ if request.top_k is not None:
+ lost_info.append(f"top_k={request.top_k} (Claude-specific, not supported)")
+ if request.service_tier == "standard_only":
+ lost_info.append("service_tier='standard_only' (not supported by Chat API)")
+ if request.stop_sequences and len(request.stop_sequences) > 4:
+ lost_info.append(
+ f"stop_sequences truncated from {len(request.stop_sequences)} to 4 "
+ f"(Chat API limit: {request.stop_sequences[4:]} will be dropped)"
+ )
+
+ if lost_info:
+ msg = f"Converting to Chat Completions will lose information: {'; '.join(lost_info)}"
+ if strict:
+ raise ValueError(msg)
+ _LOGGER.warning(msg)
+
+ # Convert messages, flattening ToolCallMessage into AssistantMessage.tool_calls
+ chat_messages: list[dict[str, Any]] = []
+ pending_tool_calls: list[dict[str, Any]] = []
+
+ for msg in request.messages:
+ msg_dict = dict(msg)
+
+ # ToolCallMessage → collect for previous assistant message
+ if "call_id" in msg_dict and "name" in msg_dict and "arguments" in msg_dict:
+ # This is a ToolCallMessage
+ pending_tool_calls.append(
+ {
+ "id": msg_dict["call_id"],
+ "type": "function",
+ "function": {
+ "name": msg_dict["name"],
+ "arguments": msg_dict["arguments"],
+ },
+ }
+ )
+ else:
+ # Flush any pending tool calls to the last assistant message
+ if pending_tool_calls and chat_messages:
+ last_msg = chat_messages[-1]
+ if last_msg.get("role") == "assistant":
+ last_msg["tool_calls"] = pending_tool_calls
+ pending_tool_calls = []
+ else:
+ if strict:
+ role = last_msg.get("role")
+ raise ValueError(
+ f"ToolCallMessage found but previous message is not assistant (role={role})"
+ )
+ _LOGGER.warning(
+ "ToolCallMessage found but previous message is not assistant, discarding tool calls"
+ )
+ pending_tool_calls = []
+
+ # Add the current message
+ chat_messages.append(msg_dict)
+
+ # Flush any remaining tool calls
+ if pending_tool_calls and chat_messages:
+ last_msg = chat_messages[-1]
+ if last_msg.get("role") == "assistant":
+ last_msg["tool_calls"] = pending_tool_calls
+ elif strict:
+ raise ValueError("ToolCallMessage at end but last message is not assistant")
+
+ kwargs: dict[str, Any] = {
+ "messages": chat_messages,
+ }
+
+ # Add system prompt as first message if present
+ if request.system_prompt:
+ kwargs["messages"] = [
+ {"role": "system", "content": request.system_prompt},
+ *kwargs["messages"],
+ ]
+
+ # Add optional parameters (filter None values)
+ if request.max_output_tokens is not None:
+ kwargs["max_completion_tokens"] = request.max_output_tokens
+ if request.temperature is not None:
+ kwargs["temperature"] = request.temperature
+ if request.top_p is not None:
+ kwargs["top_p"] = request.top_p
+ if request.stream:
+ kwargs["stream"] = True
+ if request.tools:
+ kwargs["tools"] = [_tool_to_chat_completions(tool) for tool in request.tools]
+ if request.tool_choice is not None:
+ kwargs["tool_choice"] = _tool_choice_to_openai(request.tool_choice)
+ if request.metadata:
+ kwargs["metadata"] = request.metadata
+ if request.service_tier and request.service_tier != "standard_only":
+ kwargs["service_tier"] = request.service_tier
+ if request.stop_sequences:
+ # OpenAI Chat supports up to 4 stop sequences
+ kwargs["stop"] = request.stop_sequences[:4]
+
+ return kwargs
+
+ def from_external(
+ self,
+ kwargs: openai.types.chat.completion_create_params.CompletionCreateParams,
+ *,
+ strict: bool = True,
+ ) -> LLMRequest:
+ """Create LLMRequest from OpenAI Chat Completions API kwargs.
+
Args:
kwargs: OpenAI Chat Completions API parameters
strict: If True, raise ValueError for unhandled parameters. If False, log warnings.
@@ -1003,7 +1071,7 @@ def from_chat_completion(
if system_prompt:
final_system_prompt = _extract_string_content(system_prompt, strict=strict, context="System prompt")
- return cls(
+ return LLMRequest(
messages=filtered_messages,
max_output_tokens=kwargs.get("max_completion_tokens") or kwargs.get("max_tokens"),
temperature=kwargs.get("temperature"),
@@ -1017,13 +1085,85 @@ def from_chat_completion(
system_prompt=final_system_prompt,
)
- @classmethod
- def from_responses(
- cls,
+
+@dataclasses.dataclass(frozen=True)
+class ResponsesConverter:
+ """Converts between LLMRequest and OpenAI Responses format.
+
+ This converter handles bidirectional conversion between ARES's internal LLMRequest
+ format and the OpenAI Responses API format.
+
+ Conversion Notes:
+ - stop_sequences are not supported in Responses API
+ - top_k is not supported (Claude-specific)
+ - service_tier="standard_only" is not supported
+ - system_prompt mapped to/from instructions parameter
+ - messages converted to/from input items
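+
+    Example:
+        A minimal sketch, assuming an already-built LLMRequest named request:
+
+            kwargs = ResponsesConverter().to_external(request, strict=False)
+            # kwargs["input"] holds the converted messages; a system_prompt, if set,
+            # appears as kwargs["instructions"]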
+ """
+
+ def to_external(self, request: LLMRequest, *, strict: bool = True) -> dict[str, Any]:
+ """Convert ARES LLMRequest to OpenAI Responses format.
+
+ Args:
+ request: ARES internal request format
+ strict: If True, raise ValueError on information loss. If False, log warnings.
+
+ Returns:
+            Dictionary of kwargs for the OpenAI client.responses.create() call (without model)
+
+ Raises:
+ ValueError: If strict=True and information would be lost in conversion
+
+ Note:
+ Model parameter is NOT included - it should be added by the LLMClient
+ """
+ # Check for information loss
+ lost_info = []
+ if request.stop_sequences:
+ lost_info.append(f"stop_sequences={request.stop_sequences} (not supported by Responses API)")
+ if request.top_k is not None:
+ lost_info.append(f"top_k={request.top_k} (Claude-specific, not supported)")
+ if request.service_tier == "standard_only":
+ lost_info.append("service_tier='standard_only' (not supported by Responses API)")
+
+ if lost_info:
+ msg = f"Converting to Responses will lose information: {'; '.join(lost_info)}"
+ if strict:
+ raise ValueError(msg)
+ _LOGGER.warning(msg)
+
+ kwargs: dict[str, Any] = {
+ "input": request._messages_to_responses_input(),
+ }
+
+ if request.system_prompt:
+ kwargs["instructions"] = request.system_prompt
+
+ if request.max_output_tokens is not None:
+ kwargs["max_output_tokens"] = request.max_output_tokens
+ if request.temperature is not None:
+ kwargs["temperature"] = request.temperature
+ if request.top_p is not None:
+ kwargs["top_p"] = request.top_p
+ if request.stream:
+ kwargs["stream"] = True
+ if request.tools:
+ kwargs["tools"] = [_tool_to_responses(tool) for tool in request.tools]
+ if request.tool_choice is not None:
+ kwargs["tool_choice"] = _tool_choice_to_responses(request.tool_choice)
+ if request.metadata:
+ kwargs["metadata"] = request.metadata
+ if request.service_tier and request.service_tier != "standard_only":
+ kwargs["service_tier"] = request.service_tier
+
+ return kwargs
+
+ def from_external(
+ self,
kwargs: openai.types.responses.response_create_params.ResponseCreateParamsBase,
*,
strict: bool = True,
- ) -> "LLMRequest":
+ ) -> LLMRequest:
"""Create LLMRequest from OpenAI Responses API kwargs.
Args:
@@ -1166,7 +1306,7 @@ def from_responses(
if temp_tools:
converted_tools = temp_tools
- return cls(
+ return LLMRequest(
messages=filtered_messages,
max_output_tokens=kwargs.get("max_output_tokens"),
temperature=kwargs.get("temperature"),
@@ -1179,13 +1319,99 @@ def from_responses(
system_prompt=kwargs.get("instructions"),
)
- @classmethod
- def from_messages(
- cls,
+
+@dataclasses.dataclass(frozen=True)
+class MessagesConverter:
+ """Converts between LLMRequest and Anthropic Messages format.
+
+ This converter handles bidirectional conversion between ARES's internal LLMRequest
+ format and the Anthropic Messages API format.
+
+ Conversion Notes:
+ - temperature converted between OpenAI range (0-2) and Claude range (0-1)
+ - messages must alternate user/assistant (enforced by Claude API)
+ - system_prompt mapped to/from system parameter
+ - service_tier options limited to "auto" and "standard_only"
+    - top_k (Claude-specific) is supported and passed through
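+
+    Example:
+        A minimal sketch, assuming an already-built LLMRequest named request:
+
+            kwargs = MessagesConverter().to_external(request, strict=False)
+            # kwargs["max_tokens"] is always present (defaults to 1024 when unset);
+            # temperature, if set, is scaled from the 0-2 range into Claude's 0-1 range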
+ """
+
+ def to_external(self, request: LLMRequest, *, strict: bool = True) -> dict[str, Any]:
+ """Convert ARES LLMRequest to Claude Messages format.
+
+ Args:
+ request: ARES internal request format
+ strict: If True, raise ValueError on information loss. If False, log warnings.
+
+ Returns:
+            Dictionary of kwargs for the Anthropic client.messages.create() call (without model)
+
+ Raises:
+ ValueError: If strict=True and information would be lost in conversion
+
+ Note:
+ Model parameter is NOT included - it should be added by the LLMClient
+ """
+ # Check for information loss
+ lost_info = []
+ if request.service_tier not in (None, "auto", "standard_only"):
+ lost_info.append(f"service_tier='{request.service_tier}' (Claude only supports 'auto' and 'standard_only')")
+
+ # Check for filtered messages
+ filtered_messages = []
+ for msg in request.messages:
+ msg_dict = dict(msg)
+ role = msg_dict["role"]
+ if role in ("system", "developer"):
+ content = str(msg_dict.get("content", ""))[:50]
+ filtered_messages.append(f"{role} message: {content}...")
+
+ if filtered_messages:
+ lost_info.append(f"Messages filtered out (use system_prompt instead): {'; '.join(filtered_messages)}")
+
+ if lost_info:
+ msg = f"Converting to Claude Messages will lose information: {'; '.join(lost_info)}"
+ if strict:
+ raise ValueError(msg)
+ _LOGGER.warning(msg)
+
+ kwargs: dict[str, Any] = {
+ "messages": request._messages_to_claude_format(strict=strict),
+ "max_tokens": request.max_output_tokens or 1024, # max_tokens is required by Claude
+ }
+
+ if request.system_prompt:
+ kwargs["system"] = request.system_prompt
+
+ if request.temperature is not None:
+ # Convert from OpenAI range (0-2) to Claude range (0-1)
+ kwargs["temperature"] = min(request.temperature / 2.0, 1.0)
+ if request.top_p is not None:
+ kwargs["top_p"] = request.top_p
+ if request.top_k is not None:
+ kwargs["top_k"] = request.top_k
+ if request.stream:
+ kwargs["stream"] = True
+ if request.tools:
+ # Convert tools to Anthropic format (adds explicit type: "custom")
+ kwargs["tools"] = [_tool_to_anthropic(tool) for tool in request.tools]
+ if request.tool_choice is not None:
+ kwargs["tool_choice"] = _tool_choice_to_anthropic(request.tool_choice)
+ if request.metadata:
+ # Claude uses metadata.user_id specifically
+ kwargs["metadata"] = request.metadata
+ if request.service_tier in ("auto", "standard_only"):
+ kwargs["service_tier"] = request.service_tier
+ if request.stop_sequences:
+ kwargs["stop_sequences"] = request.stop_sequences
+
+ return kwargs
+
+ def from_external(
+ self,
kwargs: anthropic.types.MessageCreateParams,
*,
strict: bool = True,
- ) -> "LLMRequest":
+ ) -> LLMRequest:
"""Create LLMRequest from Claude Messages API kwargs.
Args:
@@ -1267,7 +1493,7 @@ def from_messages(
raise
_LOGGER.warning("Skipping invalid tool: %s", e)
- return cls(
+ return LLMRequest(
messages=filtered_messages,
max_output_tokens=kwargs["max_tokens"],
temperature=temperature,
diff --git a/src/ares/testing/mock_container.py b/src/ares/testing/mock_container.py
index cd2998e..f0b1b08 100644
--- a/src/ares/testing/mock_container.py
+++ b/src/ares/testing/mock_container.py
@@ -73,7 +73,7 @@ async def exec_run(
return self.exec_responses[command]
# Default to successful empty response
- return containers.ExecResult(output="", exit_code=0)
+ return containers.ExecResult(stdout="", stderr="", exit_code=0)
async def upload_files(self, local_paths: list[pathlib.Path], remote_paths: list[str]) -> None:
"""Record uploaded files."""
diff --git a/src/ares/testing/mock_container_test.py b/src/ares/testing/mock_container_test.py
index 7514b46..f5b4da6 100644
--- a/src/ares/testing/mock_container_test.py
+++ b/src/ares/testing/mock_container_test.py
@@ -55,7 +55,8 @@ async def test_mock_container_exec_run_configured_response():
"""Test that exec_run uses configured responses."""
container = mock_container.MockContainer()
container.exec_responses["ls -la"] = containers.ExecResult(
- output="file1.txt\nfile2.txt",
+ stdout="file1.txt\nfile2.txt",
+ stderr="",
exit_code=0,
)
@@ -72,8 +73,8 @@ async def test_mock_container_exec_run_custom_handler():
def handler(command: str) -> containers.ExecResult:
if "error" in command:
- return containers.ExecResult(output="Error!", exit_code=1)
- return containers.ExecResult(output=f"Executed: {command}", exit_code=0)
+ return containers.ExecResult(stdout="", stderr="Error!", exit_code=1)
+ return containers.ExecResult(stdout=f"Executed: {command}", stderr="", exit_code=0)
container.exec_handler = handler