diff --git a/src/ares/code_agents/mini_swe_agent.py b/src/ares/code_agents/mini_swe_agent.py
index da2238f..9db64fa 100644
--- a/src/ares/code_agents/mini_swe_agent.py
+++ b/src/ares/code_agents/mini_swe_agent.py
@@ -38,10 +38,10 @@
# Copied from minisweagent's default config.
_TIMEOUT_TEMPLATE = """
-The last command {action} timed out and has been killed.
+The last command {{ action }} timed out and has been killed.
The output of the command was:
Please try another command and make sure to avoid those requiring interactive input.
""".strip()
@@ -106,8 +106,19 @@ def _render_format_error_template(format_error_template: str, actions: list[str]
def _render_timeout_template(action: str, output: str) -> str:
- # TODO: Use jinja2, and allow updating of configuration.
- return _TIMEOUT_TEMPLATE.format(action=action, output=output)
+ """Render the timeout error message using Jinja2.
+
+ Args:
+ action: The action/command that timed out
+ output: Any partial output from the command (may be empty)
+
+ Returns:
+ Rendered timeout error message
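+
+    Example:
+        A minimal sketch of a rendered message (the action and output values are illustrative):
+
+            message = _render_timeout_template(action="sleep 600", output="")
+            # message begins with "The last command sleep 600 timed out and has been killed."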
+ """
+ return jinja2.Template(_TIMEOUT_TEMPLATE, undefined=jinja2.StrictUndefined).render(
+ action=action,
+ output=output,
+ )
@dataclasses.dataclass(kw_only=True)
diff --git a/src/ares/code_agents/terminus2/terminus2_agent.py b/src/ares/code_agents/terminus2/terminus2_agent.py
index 5bd76f4..57d3740 100644
--- a/src/ares/code_agents/terminus2/terminus2_agent.py
+++ b/src/ares/code_agents/terminus2/terminus2_agent.py
@@ -143,7 +143,6 @@ class Terminus2Agent(code_agent_base.CodeAgent):
container: containers.Container
llm_client: llm_clients.LLMClient
- # TODO: Actually use the stat tracker in the agent.
tracker: stat_tracker.StatTracker = dataclasses.field(default_factory=stat_tracker.NullStatTracker)
parser_format: Literal["json", "xml"] = "json"
max_turns: int = 1_000_000 # Match terminal-bench reference (effectively unlimited)
@@ -489,7 +488,8 @@ async def run(self, task: str) -> None:
self._original_instruction = task # Store for summarization
# Initialize tmux session to capture initial terminal state
- await self._ensure_tmux_session()
+ with self.tracker.timeit("t2/setup"):
+ await self._ensure_tmux_session()
# Capture initial terminal state using incremental output
# First call returns "Current Terminal Screen:\n{visible}" automatically
@@ -680,9 +680,10 @@ async def _query_llm(self) -> response.LLMResponse:
)
try:
- response = await self.llm_client(
- request.LLMRequest(messages=self._messages, system_prompt=self._system_prompt)
- )
+ with self.tracker.timeit("t2/llm_request"):
+ response = await self.llm_client(
+ request.LLMRequest(messages=self._messages, system_prompt=self._system_prompt)
+ )
_LOGGER.debug("[%d] Received LLM response", id(self))
return response
diff --git a/src/ares/code_agents/terminus2/terminus2_agent_test.py b/src/ares/code_agents/terminus2/terminus2_agent_test.py
index 693923b..95f5a70 100644
--- a/src/ares/code_agents/terminus2/terminus2_agent_test.py
+++ b/src/ares/code_agents/terminus2/terminus2_agent_test.py
@@ -33,7 +33,7 @@ def handle_command(self, command: str) -> containers.ExecResult:
session_name = match.group(1)
self.sessions[session_name] = "active"
self.panes[session_name] = ""
- return containers.ExecResult(output="", exit_code=0)
+ return containers.ExecResult(stdout="", stderr="", exit_code=0)
elif "tmux send-keys" in command and "-l" in command:
match = re.search(r"-t\s+(\S+)", command)
@@ -44,7 +44,7 @@ def handle_command(self, command: str) -> containers.ExecResult:
if text_match and session_name in self.panes:
text = text_match.group(1) or text_match.group(2) or ""
self.panes[session_name] += text
- return containers.ExecResult(output="", exit_code=0)
+ return containers.ExecResult(stdout="", stderr="", exit_code=0)
elif "tmux send-keys" in command and "Enter" in command:
match = re.search(r"-t\s+(\S+)", command)
@@ -55,7 +55,7 @@ def handle_command(self, command: str) -> containers.ExecResult:
self.panes[session_name] += "\n"
if typed_command.strip():
self.panes[session_name] += f"[executed: {typed_command}]\n"
- return containers.ExecResult(output="", exit_code=0)
+ return containers.ExecResult(stdout="", stderr="", exit_code=0)
elif "tmux capture-pane" in command:
match = re.search(r"-t\s+(\S+)", command)
@@ -63,7 +63,7 @@ def handle_command(self, command: str) -> containers.ExecResult:
if match:
session_name = match.group(1)
output = self.panes.get(session_name, "")
- return containers.ExecResult(output=output, exit_code=0)
+ return containers.ExecResult(stdout=output, stderr="", exit_code=0)
elif "tmux kill-session" in command:
match = re.search(r"-t\s+(\S+)", command)
@@ -71,24 +71,24 @@ def handle_command(self, command: str) -> containers.ExecResult:
session_name = match.group(1)
self.sessions.pop(session_name, None)
self.panes.pop(session_name, None)
- return containers.ExecResult(output="", exit_code=0)
+ return containers.ExecResult(stdout="", stderr="", exit_code=0)
elif "tmux has-session" in command:
match = re.search(r"-t\s+(\S+)", command)
if match:
session_name = match.group(1)
exit_code = 0 if session_name in self.sessions else 1
- return containers.ExecResult(output="", exit_code=exit_code)
- return containers.ExecResult(output="", exit_code=1)
+ return containers.ExecResult(stdout="", stderr="", exit_code=exit_code)
+ return containers.ExecResult(stdout="", stderr="", exit_code=1)
elif "which tmux" in command:
- return containers.ExecResult(output="/usr/bin/tmux", exit_code=0)
+ return containers.ExecResult(stdout="/usr/bin/tmux", stderr="", exit_code=0)
elif "tmux set-option" in command:
- return containers.ExecResult(output="", exit_code=0)
+ return containers.ExecResult(stdout="", stderr="", exit_code=0)
# Default success for other commands
- return containers.ExecResult(output="", exit_code=0)
+ return containers.ExecResult(stdout="", stderr="", exit_code=0)
class TestTerminus2AgentBasics:
diff --git a/src/ares/containers/containers.py b/src/ares/containers/containers.py
index d4da940..fa56a2d 100644
--- a/src/ares/containers/containers.py
+++ b/src/ares/containers/containers.py
@@ -8,10 +8,30 @@
@dataclasses.dataclass(frozen=True)
class ExecResult:
- # TODO: Maybe stdout/stderr?
- output: str
+ """Result of executing a command in a container.
+
+ Attributes:
+ stdout: Standard output from the command.
+ stderr: Standard error output from the command.
+ exit_code: Exit code of the command (0 typically means success).
+        output: Combined stdout + stderr, kept for backward compatibility.
+            This is a computed property; prefer using stdout and stderr directly.
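+
+    Example:
+        A minimal illustrative sketch of constructing a result and reading the
+        combined output property:
+
+            result = ExecResult(stdout="ok", stderr="warn", exit_code=0)
+            assert result.output == "okwarn"  # stdout first, then stderr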
+ """
+
+ stdout: str
+ stderr: str
exit_code: int
+ @property
+ def output(self) -> str:
+ """Combined stdout and stderr for backward compatibility.
+
+ Returns stdout + stderr concatenated. For new code, prefer accessing
+ stdout and stderr separately for better error handling.
+ """
+ # Combine with stderr second so errors appear at the end
+ return self.stdout + self.stderr
+
@dataclasses.dataclass(frozen=True)
class Resources:
diff --git a/src/ares/containers/daytona.py b/src/ares/containers/daytona.py
index 497ef91..cb4df04 100644
--- a/src/ares/containers/daytona.py
+++ b/src/ares/containers/daytona.py
@@ -153,7 +153,9 @@ async def exec_run(
if float(int_exit_code) != exit_code:
raise ValueError(f"Exit code is not an integer: {exit_code}")
- return containers.ExecResult(output=result.result, exit_code=int_exit_code)
+ # Daytona provides combined stdout+stderr in result field
+ # Put it in stdout, leave stderr empty for now
+ return containers.ExecResult(stdout=result.result, stderr="", exit_code=int_exit_code)
def stop_and_remove(self) -> None:
"""Stop and remove the container."""
diff --git a/src/ares/containers/docker.py b/src/ares/containers/docker.py
index 6d1cae5..5eaaa79 100644
--- a/src/ares/containers/docker.py
+++ b/src/ares/containers/docker.py
@@ -118,7 +118,9 @@ async def exec_run(
timeout=timeout_s,
)
result_str = result.output.decode("utf-8", errors="replace")
- return containers.ExecResult(output=result_str, exit_code=result.exit_code)
+ # Docker provides combined stdout+stderr in output field
+ # Put it in stdout, leave stderr empty for now
+ return containers.ExecResult(stdout=result_str, stderr="", exit_code=result.exit_code)
def stop_and_remove(self) -> None:
"""Stop and remove the container."""
diff --git a/src/ares/llms/request.py b/src/ares/llms/request.py
index 0da06d7..6e881be 100644
--- a/src/ares/llms/request.py
+++ b/src/ares/llms/request.py
@@ -2,7 +2,7 @@
import dataclasses
import logging
-from typing import Any, Literal, NotRequired, Required, TypedDict, cast
+from typing import Any, Literal, NotRequired, Protocol, Required, TypedDict, cast
import anthropic.types
import openai.types.chat
@@ -450,6 +450,52 @@ def _tool_choice_from_anthropic(
return None
+class RequestConverter[RequestType](Protocol):
+ """Converts between ARES LLMRequest and external API formats.
+
+ This protocol defines the interface for bidirectional conversion between ARES's internal
+ LLMRequest format and external API request formats (OpenAI Chat Completions, OpenAI Responses,
+ Anthropic Messages, etc.).
+
+ Type Parameters:
+ RequestType: The external API's request parameters type (e.g., dict[str, Any] for kwargs)
+
+ Note:
+        Implementations should be stateless (e.g., frozen dataclasses) so they can be shared safely across async tasks.
+ The model parameter is NOT included in conversions - it should be managed by the LLMClient.
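+
+    Example:
+        A minimal sketch, assuming an already-built LLMRequest named request and using the
+        ChatCompletionConverter defined below:
+
+            converter = ChatCompletionConverter()  # satisfies RequestConverter[dict[str, Any]]
+            kwargs = converter.to_external(request, strict=False)
+            round_tripped = converter.from_external(kwargs, strict=False)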
+ """
+
+ def to_external(self, request: "LLMRequest", *, strict: bool = True) -> RequestType:
+ """Convert ARES LLMRequest to external API format.
+
+ Args:
+ request: ARES internal request format
+ strict: If True, raise ValueError on information loss. If False, log warnings.
+
+ Returns:
+ Request parameters in external API format (without model parameter)
+
+ Raises:
+ ValueError: If strict=True and information would be lost in conversion
+ """
+ ...
+
+ def from_external(self, kwargs: RequestType, *, strict: bool = True) -> "LLMRequest":
+ """Convert external API format to ARES LLMRequest.
+
+ Args:
+ kwargs: External API request parameters
+ strict: If True, raise ValueError for unhandled parameters. If False, log warnings.
+
+ Returns:
+ LLMRequest instance
+
+ Raises:
+ ValueError: If strict=True and there are unhandled parameters
+ """
+ ...
+
+
@dataclasses.dataclass(frozen=True, kw_only=True)
class LLMRequest:
"""Unified request format for OpenAI Chat Completions, OpenAI Responses, and Claude Messages APIs.
@@ -497,6 +543,8 @@ class LLMRequest:
def to_chat_completion_kwargs(self, *, strict: bool = True) -> dict[str, Any]:
"""Convert to OpenAI Chat Completions API format.
+ This is a convenience wrapper around ChatCompletionConverter.
+
Args:
strict: If True, raise ValueError on information loss. If False, log warnings.
@@ -512,110 +560,13 @@ def to_chat_completion_kwargs(self, *, strict: bool = True) -> dict[str, Any]:
- service_tier="standard_only" is not supported
- stop_sequences truncated to 4 if more provided
"""
- # Check for information loss
- lost_info = []
- if self.top_k is not None:
- lost_info.append(f"top_k={self.top_k} (Claude-specific, not supported)")
- if self.service_tier == "standard_only":
- lost_info.append("service_tier='standard_only' (not supported by Chat API)")
- if self.stop_sequences and len(self.stop_sequences) > 4:
- lost_info.append(
- f"stop_sequences truncated from {len(self.stop_sequences)} to 4 "
- f"(Chat API limit: {self.stop_sequences[4:]} will be dropped)"
- )
-
- if lost_info:
- msg = f"Converting to Chat Completions will lose information: {'; '.join(lost_info)}"
- if strict:
- raise ValueError(msg)
- _LOGGER.warning(msg)
-
- # Convert messages, flattening ToolCallMessage into AssistantMessage.tool_calls
- chat_messages: list[dict[str, Any]] = []
- pending_tool_calls: list[dict[str, Any]] = []
-
- for msg in self.messages:
- msg_dict = dict(msg)
-
- # ToolCallMessage → collect for previous assistant message
- if "call_id" in msg_dict and "name" in msg_dict and "arguments" in msg_dict:
- # This is a ToolCallMessage
- pending_tool_calls.append(
- {
- "id": msg_dict["call_id"],
- "type": "function",
- "function": {
- "name": msg_dict["name"],
- "arguments": msg_dict["arguments"],
- },
- }
- )
- else:
- # Flush any pending tool calls to the last assistant message
- if pending_tool_calls and chat_messages:
- last_msg = chat_messages[-1]
- if last_msg.get("role") == "assistant":
- last_msg["tool_calls"] = pending_tool_calls
- pending_tool_calls = []
- else:
- if strict:
- role = last_msg.get("role")
- raise ValueError(
- f"ToolCallMessage found but previous message is not assistant (role={role})"
- )
- _LOGGER.warning(
- "ToolCallMessage found but previous message is not assistant, discarding tool calls"
- )
- pending_tool_calls = []
-
- # Add the current message
- chat_messages.append(msg_dict)
-
- # Flush any remaining tool calls
- if pending_tool_calls and chat_messages:
- last_msg = chat_messages[-1]
- if last_msg.get("role") == "assistant":
- last_msg["tool_calls"] = pending_tool_calls
- elif strict:
- raise ValueError("ToolCallMessage at end but last message is not assistant")
-
- kwargs: dict[str, Any] = {
- "messages": chat_messages,
- }
-
- # Add system prompt as first message if present
- if self.system_prompt:
- kwargs["messages"] = [
- {"role": "system", "content": self.system_prompt},
- *kwargs["messages"],
- ]
-
- # Add optional parameters (filter None values)
- if self.max_output_tokens is not None:
- kwargs["max_completion_tokens"] = self.max_output_tokens
- if self.temperature is not None:
- kwargs["temperature"] = self.temperature
- if self.top_p is not None:
- kwargs["top_p"] = self.top_p
- if self.stream:
- kwargs["stream"] = True
- if self.tools:
- kwargs["tools"] = [_tool_to_chat_completions(tool) for tool in self.tools]
- if self.tool_choice is not None:
- kwargs["tool_choice"] = _tool_choice_to_openai(self.tool_choice)
- if self.metadata:
- kwargs["metadata"] = self.metadata
- if self.service_tier and self.service_tier != "standard_only":
- kwargs["service_tier"] = self.service_tier
- if self.stop_sequences:
- # OpenAI Chat supports up to 4 stop sequences
- kwargs["stop"] = self.stop_sequences[:4]
-
- return kwargs
+ return ChatCompletionConverter().to_external(self, strict=strict)
def to_responses_kwargs(self, *, strict: bool = True) -> dict[str, Any]:
"""Convert to OpenAI Responses API format.
+ This is a convenience wrapper around ResponsesConverter.
+
Args:
strict: If True, raise ValueError on information loss. If False, log warnings.
@@ -633,50 +584,13 @@ def to_responses_kwargs(self, *, strict: bool = True) -> dict[str, Any]:
- top_k is not supported (Claude-specific)
- service_tier="standard_only" is not supported
"""
- # Check for information loss
- lost_info = []
- if self.stop_sequences:
- lost_info.append(f"stop_sequences={self.stop_sequences} (not supported by Responses API)")
- if self.top_k is not None:
- lost_info.append(f"top_k={self.top_k} (Claude-specific, not supported)")
- if self.service_tier == "standard_only":
- lost_info.append("service_tier='standard_only' (not supported by Responses API)")
-
- if lost_info:
- msg = f"Converting to Responses will lose information: {'; '.join(lost_info)}"
- if strict:
- raise ValueError(msg)
- _LOGGER.warning(msg)
-
- kwargs: dict[str, Any] = {
- "input": self._messages_to_responses_input(),
- }
-
- if self.system_prompt:
- kwargs["instructions"] = self.system_prompt
-
- if self.max_output_tokens is not None:
- kwargs["max_output_tokens"] = self.max_output_tokens
- if self.temperature is not None:
- kwargs["temperature"] = self.temperature
- if self.top_p is not None:
- kwargs["top_p"] = self.top_p
- if self.stream:
- kwargs["stream"] = True
- if self.tools:
- kwargs["tools"] = [_tool_to_responses(tool) for tool in self.tools]
- if self.tool_choice is not None:
- kwargs["tool_choice"] = _tool_choice_to_responses(self.tool_choice)
- if self.metadata:
- kwargs["metadata"] = self.metadata
- if self.service_tier and self.service_tier != "standard_only":
- kwargs["service_tier"] = self.service_tier
-
- return kwargs
+ return ResponsesConverter().to_external(self, strict=strict)
def to_messages_kwargs(self, *, strict: bool = True) -> dict[str, Any]:
"""Convert to Claude Messages API format.
+ This is a convenience wrapper around MessagesConverter.
+
Args:
strict: If True, raise ValueError on information loss. If False, log warnings.
@@ -694,60 +608,7 @@ def to_messages_kwargs(self, *, strict: bool = True) -> dict[str, Any]:
- service_tier options are limited to "auto" and "standard_only"
- tool schemas may need conversion (not implemented yet)
"""
- # Check for information loss
- lost_info = []
- if self.service_tier not in (None, "auto", "standard_only"):
- lost_info.append(f"service_tier='{self.service_tier}' (Claude only supports 'auto' and 'standard_only')")
-
- # Check for filtered messages
- filtered_messages = []
- for msg in self.messages:
- msg_dict = dict(msg)
- role = msg_dict["role"]
- if role in ("system", "developer"):
- content = str(msg_dict.get("content", ""))[:50]
- filtered_messages.append(f"{role} message: {content}...")
-
- if filtered_messages:
- lost_info.append(f"Messages filtered out (use system_prompt instead): {'; '.join(filtered_messages)}")
-
- if lost_info:
- msg = f"Converting to Claude Messages will lose information: {'; '.join(lost_info)}"
- if strict:
- raise ValueError(msg)
- _LOGGER.warning(msg)
-
- kwargs: dict[str, Any] = {
- "messages": self._messages_to_claude_format(strict=strict),
- "max_tokens": self.max_output_tokens or 1024, # max_tokens is required by Claude
- }
-
- if self.system_prompt:
- kwargs["system"] = self.system_prompt
-
- if self.temperature is not None:
- # Convert from OpenAI range (0-2) to Claude range (0-1)
- kwargs["temperature"] = min(self.temperature / 2.0, 1.0)
- if self.top_p is not None:
- kwargs["top_p"] = self.top_p
- if self.top_k is not None:
- kwargs["top_k"] = self.top_k
- if self.stream:
- kwargs["stream"] = True
- if self.tools:
- # Convert tools to Anthropic format (adds explicit type: "custom")
- kwargs["tools"] = [_tool_to_anthropic(tool) for tool in self.tools]
- if self.tool_choice is not None:
- kwargs["tool_choice"] = _tool_choice_to_anthropic(self.tool_choice)
- if self.metadata:
- # Claude uses metadata.user_id specifically
- kwargs["metadata"] = self.metadata
- if self.service_tier in ("auto", "standard_only"):
- kwargs["service_tier"] = self.service_tier
- if self.stop_sequences:
- kwargs["stop_sequences"] = self.stop_sequences
-
- return kwargs
+ return MessagesConverter().to_external(self, strict=strict)
def _messages_to_responses_input(self) -> list[dict[str, Any]]:
"""Convert messages from internal format to Responses input items.
@@ -875,6 +736,213 @@ def from_chat_completion(
) -> "LLMRequest":
"""Create LLMRequest from OpenAI Chat Completions API kwargs.
+ This is a convenience wrapper around ChatCompletionConverter.
+
+ Args:
+ kwargs: OpenAI Chat Completions API parameters
+ strict: If True, raise ValueError for unhandled parameters. If False, log warnings.
+
+ Returns:
+ LLMRequest instance
+
+ Raises:
+ ValueError: If strict=True and there are unhandled parameters
+
+ Note:
+ Model parameter is ignored - it should be managed by the LLMClient
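+
+    Example:
+        A minimal sketch using only parameters the converter handles (any "model" key
+        would be ignored):
+
+            request = LLMRequest.from_chat_completion(
+                {"messages": [{"role": "user", "content": "hi"}], "temperature": 0.2}
+            )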
+ """
+ return ChatCompletionConverter().from_external(kwargs, strict=strict)
+
+ @classmethod
+ def from_responses(
+ cls,
+ kwargs: openai.types.responses.response_create_params.ResponseCreateParamsBase,
+ *,
+ strict: bool = True,
+ ) -> "LLMRequest":
+ """Create LLMRequest from OpenAI Responses API kwargs.
+
+ This is a convenience wrapper around ResponsesConverter.
+
+ Args:
+ kwargs: OpenAI Responses API parameters
+ strict: If True, raise ValueError for unhandled parameters. If False, log warnings.
+
+ Returns:
+ LLMRequest instance
+
+ Raises:
+ ValueError: If strict=True and there are unhandled parameters
+
+ Note:
+ Model parameter is ignored - it should be managed by the LLMClient
+ """
+ return ResponsesConverter().from_external(kwargs, strict=strict)
+
+ @classmethod
+ def from_messages(
+ cls,
+ kwargs: anthropic.types.MessageCreateParams,
+ *,
+ strict: bool = True,
+ ) -> "LLMRequest":
+ """Create LLMRequest from Claude Messages API kwargs.
+
+ This is a convenience wrapper around MessagesConverter.
+
+ Args:
+ kwargs: Claude Messages API parameters
+ strict: If True, raise ValueError for unhandled parameters. If False, log warnings.
+
+ Returns:
+ LLMRequest instance
+
+ Raises:
+ ValueError: If strict=True and there are unhandled parameters
+ """
+ return MessagesConverter().from_external(kwargs, strict=strict)
+
+
+@dataclasses.dataclass(frozen=True)
+class ChatCompletionConverter:
+ """Converts between LLMRequest and OpenAI Chat Completions format.
+
+ This converter handles bidirectional conversion between ARES's internal LLMRequest
+ format and the OpenAI Chat Completions API format.
+
+ Conversion Notes:
+ - top_k is not supported (Claude-specific)
+ - service_tier="standard_only" is not supported
+ - stop_sequences truncated to 4 (OpenAI limit)
+ - system_prompt is converted to/from system message in messages list
+ - ToolCallMessage flattened into AssistantMessage.tool_calls
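+
+    Example:
+        A minimal sketch, assuming an already-built LLMRequest named request:
+
+            kwargs = ChatCompletionConverter().to_external(request, strict=True)
+            # kwargs["messages"] holds the chat messages; the caller supplies "model" separately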
+ """
+
+ def to_external(self, request: LLMRequest, *, strict: bool = True) -> dict[str, Any]:
+ """Convert ARES LLMRequest to OpenAI Chat Completions format.
+
+ Args:
+ request: ARES internal request format
+ strict: If True, raise ValueError on information loss. If False, log warnings.
+
+ Returns:
+            Dictionary of kwargs for the OpenAI client.chat.completions.create() call (without model)
+
+ Raises:
+ ValueError: If strict=True and information would be lost in conversion
+
+ Note:
+ Model parameter is NOT included - it should be added by the LLMClient
+ """
+ # Check for information loss
+ lost_info = []
+ if request.top_k is not None:
+ lost_info.append(f"top_k={request.top_k} (Claude-specific, not supported)")
+ if request.service_tier == "standard_only":
+ lost_info.append("service_tier='standard_only' (not supported by Chat API)")
+ if request.stop_sequences and len(request.stop_sequences) > 4:
+ lost_info.append(
+ f"stop_sequences truncated from {len(request.stop_sequences)} to 4 "
+ f"(Chat API limit: {request.stop_sequences[4:]} will be dropped)"
+ )
+
+ if lost_info:
+ msg = f"Converting to Chat Completions will lose information: {'; '.join(lost_info)}"
+ if strict:
+ raise ValueError(msg)
+ _LOGGER.warning(msg)
+
+ # Convert messages, flattening ToolCallMessage into AssistantMessage.tool_calls
+ chat_messages: list[dict[str, Any]] = []
+ pending_tool_calls: list[dict[str, Any]] = []
+
+ for msg in request.messages:
+ msg_dict = dict(msg)
+
+ # ToolCallMessage → collect for previous assistant message
+ if "call_id" in msg_dict and "name" in msg_dict and "arguments" in msg_dict:
+ # This is a ToolCallMessage
+ pending_tool_calls.append(
+ {
+ "id": msg_dict["call_id"],
+ "type": "function",
+ "function": {
+ "name": msg_dict["name"],
+ "arguments": msg_dict["arguments"],
+ },
+ }
+ )
+ else:
+ # Flush any pending tool calls to the last assistant message
+ if pending_tool_calls and chat_messages:
+ last_msg = chat_messages[-1]
+ if last_msg.get("role") == "assistant":
+ last_msg["tool_calls"] = pending_tool_calls
+ pending_tool_calls = []
+ else:
+ if strict:
+ role = last_msg.get("role")
+ raise ValueError(
+ f"ToolCallMessage found but previous message is not assistant (role={role})"
+ )
+ _LOGGER.warning(
+ "ToolCallMessage found but previous message is not assistant, discarding tool calls"
+ )
+ pending_tool_calls = []
+
+ # Add the current message
+ chat_messages.append(msg_dict)
+
+ # Flush any remaining tool calls
+ if pending_tool_calls and chat_messages:
+ last_msg = chat_messages[-1]
+ if last_msg.get("role") == "assistant":
+ last_msg["tool_calls"] = pending_tool_calls
+ elif strict:
+ raise ValueError("ToolCallMessage at end but last message is not assistant")
+
+ kwargs: dict[str, Any] = {
+ "messages": chat_messages,
+ }
+
+ # Add system prompt as first message if present
+ if request.system_prompt:
+ kwargs["messages"] = [
+ {"role": "system", "content": request.system_prompt},
+ *kwargs["messages"],
+ ]
+
+ # Add optional parameters (filter None values)
+ if request.max_output_tokens is not None:
+ kwargs["max_completion_tokens"] = request.max_output_tokens
+ if request.temperature is not None:
+ kwargs["temperature"] = request.temperature
+ if request.top_p is not None:
+ kwargs["top_p"] = request.top_p
+ if request.stream:
+ kwargs["stream"] = True
+ if request.tools:
+ kwargs["tools"] = [_tool_to_chat_completions(tool) for tool in request.tools]
+ if request.tool_choice is not None:
+ kwargs["tool_choice"] = _tool_choice_to_openai(request.tool_choice)
+ if request.metadata:
+ kwargs["metadata"] = request.metadata
+ if request.service_tier and request.service_tier != "standard_only":
+ kwargs["service_tier"] = request.service_tier
+ if request.stop_sequences:
+ # OpenAI Chat supports up to 4 stop sequences
+ kwargs["stop"] = request.stop_sequences[:4]
+
+ return kwargs
+
+ def from_external(
+ self,
+ kwargs: openai.types.chat.completion_create_params.CompletionCreateParams,
+ *,
+ strict: bool = True,
+ ) -> LLMRequest:
+ """Create LLMRequest from OpenAI Chat Completions API kwargs.
+
Args:
kwargs: OpenAI Chat Completions API parameters
strict: If True, raise ValueError for unhandled parameters. If False, log warnings.
@@ -1003,7 +1071,7 @@ def from_chat_completion(
if system_prompt:
final_system_prompt = _extract_string_content(system_prompt, strict=strict, context="System prompt")
- return cls(
+ return LLMRequest(
messages=filtered_messages,
max_output_tokens=kwargs.get("max_completion_tokens") or kwargs.get("max_tokens"),
temperature=kwargs.get("temperature"),
@@ -1017,13 +1085,85 @@ def from_chat_completion(
system_prompt=final_system_prompt,
)
- @classmethod
- def from_responses(
- cls,
+
+@dataclasses.dataclass(frozen=True)
+class ResponsesConverter:
+ """Converts between LLMRequest and OpenAI Responses format.
+
+ This converter handles bidirectional conversion between ARES's internal LLMRequest
+ format and the OpenAI Responses API format.
+
+ Conversion Notes:
+ - stop_sequences are not supported in Responses API
+ - top_k is not supported (Claude-specific)
+ - service_tier="standard_only" is not supported
+ - system_prompt mapped to/from instructions parameter
+ - messages converted to/from input items
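+
+    Example:
+        A minimal sketch, assuming an already-built LLMRequest named request:
+
+            kwargs = ResponsesConverter().to_external(request, strict=False)
+            # kwargs["input"] holds the converted messages; a system_prompt, if set,
+            # appears as kwargs["instructions"]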
+ """
+
+ def to_external(self, request: LLMRequest, *, strict: bool = True) -> dict[str, Any]:
+ """Convert ARES LLMRequest to OpenAI Responses format.
+
+ Args:
+ request: ARES internal request format
+ strict: If True, raise ValueError on information loss. If False, log warnings.
+
+ Returns:
+            Dictionary of kwargs for the OpenAI client.responses.create() call (without model)
+
+ Raises:
+ ValueError: If strict=True and information would be lost in conversion
+
+ Note:
+ Model parameter is NOT included - it should be added by the LLMClient
+ """
+ # Check for information loss
+ lost_info = []
+ if request.stop_sequences:
+ lost_info.append(f"stop_sequences={request.stop_sequences} (not supported by Responses API)")
+ if request.top_k is not None:
+ lost_info.append(f"top_k={request.top_k} (Claude-specific, not supported)")
+ if request.service_tier == "standard_only":
+ lost_info.append("service_tier='standard_only' (not supported by Responses API)")
+
+ if lost_info:
+ msg = f"Converting to Responses will lose information: {'; '.join(lost_info)}"
+ if strict:
+ raise ValueError(msg)
+ _LOGGER.warning(msg)
+
+ kwargs: dict[str, Any] = {
+ "input": request._messages_to_responses_input(),
+ }
+
+ if request.system_prompt:
+ kwargs["instructions"] = request.system_prompt
+
+ if request.max_output_tokens is not None:
+ kwargs["max_output_tokens"] = request.max_output_tokens
+ if request.temperature is not None:
+ kwargs["temperature"] = request.temperature
+ if request.top_p is not None:
+ kwargs["top_p"] = request.top_p
+ if request.stream:
+ kwargs["stream"] = True
+ if request.tools:
+ kwargs["tools"] = [_tool_to_responses(tool) for tool in request.tools]
+ if request.tool_choice is not None:
+ kwargs["tool_choice"] = _tool_choice_to_responses(request.tool_choice)
+ if request.metadata:
+ kwargs["metadata"] = request.metadata
+ if request.service_tier and request.service_tier != "standard_only":
+ kwargs["service_tier"] = request.service_tier
+
+ return kwargs
+
+ def from_external(
+ self,
kwargs: openai.types.responses.response_create_params.ResponseCreateParamsBase,
*,
strict: bool = True,
- ) -> "LLMRequest":
+ ) -> LLMRequest:
"""Create LLMRequest from OpenAI Responses API kwargs.
Args:
@@ -1166,7 +1306,7 @@ def from_responses(
if temp_tools:
converted_tools = temp_tools
- return cls(
+ return LLMRequest(
messages=filtered_messages,
max_output_tokens=kwargs.get("max_output_tokens"),
temperature=kwargs.get("temperature"),
@@ -1179,13 +1319,99 @@ def from_responses(
system_prompt=kwargs.get("instructions"),
)
- @classmethod
- def from_messages(
- cls,
+
+@dataclasses.dataclass(frozen=True)
+class MessagesConverter:
+ """Converts between LLMRequest and Anthropic Messages format.
+
+ This converter handles bidirectional conversion between ARES's internal LLMRequest
+ format and the Anthropic Messages API format.
+
+ Conversion Notes:
+ - temperature converted between OpenAI range (0-2) and Claude range (0-1)
+ - messages must alternate user/assistant (enforced by Claude API)
+ - system_prompt mapped to/from system parameter
+ - service_tier options limited to "auto" and "standard_only"
+    - top_k (Claude-specific) is supported and passed through
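+
+    Example:
+        A minimal sketch, assuming an already-built LLMRequest named request:
+
+            kwargs = MessagesConverter().to_external(request, strict=False)
+            # kwargs["max_tokens"] is always present (defaults to 1024 when unset);
+            # temperature, if set, is scaled from the 0-2 range into Claude's 0-1 range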
+ """
+
+ def to_external(self, request: LLMRequest, *, strict: bool = True) -> dict[str, Any]:
+ """Convert ARES LLMRequest to Claude Messages format.
+
+ Args:
+ request: ARES internal request format
+ strict: If True, raise ValueError on information loss. If False, log warnings.
+
+ Returns:
+            Dictionary of kwargs for the Anthropic client.messages.create() call (without model)
+
+ Raises:
+ ValueError: If strict=True and information would be lost in conversion
+
+ Note:
+ Model parameter is NOT included - it should be added by the LLMClient
+ """
+ # Check for information loss
+ lost_info = []
+ if request.service_tier not in (None, "auto", "standard_only"):
+ lost_info.append(f"service_tier='{request.service_tier}' (Claude only supports 'auto' and 'standard_only')")
+
+ # Check for filtered messages
+ filtered_messages = []
+ for msg in request.messages:
+ msg_dict = dict(msg)
+ role = msg_dict["role"]
+ if role in ("system", "developer"):
+ content = str(msg_dict.get("content", ""))[:50]
+ filtered_messages.append(f"{role} message: {content}...")
+
+ if filtered_messages:
+ lost_info.append(f"Messages filtered out (use system_prompt instead): {'; '.join(filtered_messages)}")
+
+ if lost_info:
+ msg = f"Converting to Claude Messages will lose information: {'; '.join(lost_info)}"
+ if strict:
+ raise ValueError(msg)
+ _LOGGER.warning(msg)
+
+ kwargs: dict[str, Any] = {
+ "messages": request._messages_to_claude_format(strict=strict),
+ "max_tokens": request.max_output_tokens or 1024, # max_tokens is required by Claude
+ }
+
+ if request.system_prompt:
+ kwargs["system"] = request.system_prompt
+
+ if request.temperature is not None:
+ # Convert from OpenAI range (0-2) to Claude range (0-1)
+ kwargs["temperature"] = min(request.temperature / 2.0, 1.0)
+ if request.top_p is not None:
+ kwargs["top_p"] = request.top_p
+ if request.top_k is not None:
+ kwargs["top_k"] = request.top_k
+ if request.stream:
+ kwargs["stream"] = True
+ if request.tools:
+ # Convert tools to Anthropic format (adds explicit type: "custom")
+ kwargs["tools"] = [_tool_to_anthropic(tool) for tool in request.tools]
+ if request.tool_choice is not None:
+ kwargs["tool_choice"] = _tool_choice_to_anthropic(request.tool_choice)
+ if request.metadata:
+ # Claude uses metadata.user_id specifically
+ kwargs["metadata"] = request.metadata
+ if request.service_tier in ("auto", "standard_only"):
+ kwargs["service_tier"] = request.service_tier
+ if request.stop_sequences:
+ kwargs["stop_sequences"] = request.stop_sequences
+
+ return kwargs
+
+ def from_external(
+ self,
kwargs: anthropic.types.MessageCreateParams,
*,
strict: bool = True,
- ) -> "LLMRequest":
+ ) -> LLMRequest:
"""Create LLMRequest from Claude Messages API kwargs.
Args:
@@ -1267,7 +1493,7 @@ def from_messages(
raise
_LOGGER.warning("Skipping invalid tool: %s", e)
- return cls(
+ return LLMRequest(
messages=filtered_messages,
max_output_tokens=kwargs["max_tokens"],
temperature=temperature,
diff --git a/src/ares/testing/mock_container.py b/src/ares/testing/mock_container.py
index cd2998e..f0b1b08 100644
--- a/src/ares/testing/mock_container.py
+++ b/src/ares/testing/mock_container.py
@@ -73,7 +73,7 @@ async def exec_run(
return self.exec_responses[command]
# Default to successful empty response
- return containers.ExecResult(output="", exit_code=0)
+ return containers.ExecResult(stdout="", stderr="", exit_code=0)
async def upload_files(self, local_paths: list[pathlib.Path], remote_paths: list[str]) -> None:
"""Record uploaded files."""
diff --git a/src/ares/testing/mock_container_test.py b/src/ares/testing/mock_container_test.py
index 7514b46..f5b4da6 100644
--- a/src/ares/testing/mock_container_test.py
+++ b/src/ares/testing/mock_container_test.py
@@ -55,7 +55,8 @@ async def test_mock_container_exec_run_configured_response():
"""Test that exec_run uses configured responses."""
container = mock_container.MockContainer()
container.exec_responses["ls -la"] = containers.ExecResult(
- output="file1.txt\nfile2.txt",
+ stdout="file1.txt\nfile2.txt",
+ stderr="",
exit_code=0,
)
@@ -72,8 +73,8 @@ async def test_mock_container_exec_run_custom_handler():
def handler(command: str) -> containers.ExecResult:
if "error" in command:
- return containers.ExecResult(output="Error!", exit_code=1)
- return containers.ExecResult(output=f"Executed: {command}", exit_code=0)
+ return containers.ExecResult(stdout="", stderr="Error!", exit_code=1)
+ return containers.ExecResult(stdout=f"Executed: {command}", stderr="", exit_code=0)
container.exec_handler = handler