Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,16 @@ Avoid over-specifying implementation details; focus on the "what" and "why", not

Review existing RFCs before implementation to understand design decisions and constraints.

## Code Review Standards (RFC Conformance)

When reviewing code against an RFC:

1. **State the RFC's core invariant in one sentence** before reading code
2. **Trace data flow**: for key variables, verify input set == output set
3. **Check all paths**: normal, fallback, error — do they all satisfy the invariant?

If you can't answer "yes" with line numbers, dig deeper.

## Async Runtime Rules

- **New runtime code must be async-first**: avoid introducing new blocking I/O in `agent/`, `llm/`, `memory/`, and `tools/`.
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,9 @@ See the full configuration template in `.env.example`. Key options:
| `MAX_ITERATIONS` | Maximum agent iterations | `100` |
| `MEMORY_COMPRESSION_THRESHOLD` | Compress when exceeded | `25000` |
| `MEMORY_SHORT_TERM_SIZE` | Recent messages to keep | `100` |
| `COMPACT_USER_MESSAGE_MAX_TOKENS` | User message budget during compaction | `20000` |
| `TOOL_OUTPUT_TRUNCATION_POLICY` | Truncate tool outputs (`none|bytes|tokens`) | `tokens` |
| `CONTEXT_OVERFLOW_MAX_RETRIES` | Retries on context overflow | `3` |
| `RETRY_MAX_ATTEMPTS` | Retry attempts for rate limits | `3` |
| `LOG_LEVEL` | Logging level | `DEBUG` |

Expand Down
71 changes: 51 additions & 20 deletions agent/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, List, Optional

from config import Config
from llm import LLMMessage, LLMResponse, StopReason, ToolResult
from llm.retry import is_context_length_error
from memory import MemoryManager
from tools.base import BaseTool
from tools.todo import TodoTool
Expand Down Expand Up @@ -52,10 +54,6 @@ def __init__(
# Initialize memory manager (uses Config directly)
self.memory = MemoryManager(llm)

# Set up todo context provider for memory compression
# This injects current todo state into summaries instead of preserving all todo messages
self.memory.set_todo_context_provider(self._get_todo_context)

@abstractmethod
def run(self, task: str) -> str:
"""Execute the agent on a task and return final answer."""
Expand Down Expand Up @@ -84,6 +82,42 @@ async def _call_llm(
messages=messages, tools=tools, max_tokens=4096, **kwargs
)

async def _call_with_overflow_recovery(
    self,
    tools: Optional[List] = None,
    spinner_message: str = "Thinking...",
    **kwargs,
) -> LLMResponse:
    """Call the LLM, shrinking memory and retrying on context overflow.

    The context is rebuilt from memory before every attempt. When a call
    fails with a context-length error, the oldest stored message is
    dropped (keeping tool call/result pairs intact) and the call is
    retried, up to ``Config.CONTEXT_OVERFLOW_MAX_RETRIES`` extra attempts.

    Args:
        tools: Optional tool definitions to expose to the model.
        spinner_message: Status text shown while the call is in flight.
        **kwargs: Extra arguments forwarded to ``_call_llm``.

    Returns:
        The first successful LLM response.

    Raises:
        The last context-length error once the retry budget is exhausted
        or memory cannot shrink further; any non-overflow error is
        re-raised immediately.
    """
    retry_budget = max(0, Config.CONTEXT_OVERFLOW_MAX_RETRIES)
    overflow_error: Optional[BaseException] = None

    attempt = 0
    while attempt <= retry_budget:
        try:
            return await self._call_llm(
                messages=self.memory.get_context_for_llm(),
                tools=tools,
                spinner_message=spinner_message,
                **kwargs,
            )
        except Exception as e:  # noqa: BLE001
            if not is_context_length_error(e):
                raise
            overflow_error = e
            # Shrink memory; give up when nothing can be removed safely.
            if self.memory.remove_oldest_with_pair_integrity() is None:
                break
            logger.warning(
                "Context length exceeded; removed oldest message and retrying (%s/%s)",
                attempt + 1,
                retry_budget + 1,
            )
        attempt += 1

    if overflow_error:
        raise overflow_error
    raise RuntimeError("Context overflow recovery failed without an error.")

def _extract_text(self, response: LLMResponse) -> str:
"""Extract text from LLM response.

Expand All @@ -95,17 +129,6 @@ def _extract_text(self, response: LLMResponse) -> str:
"""
return self.llm.extract_text(response)

def _get_todo_context(self) -> Optional[str]:
"""Get current todo list state for memory compression.

Returns formatted todo list if items exist, None otherwise.
This is used by MemoryManager to inject todo state into summaries.
"""
items = self.todo_list.get_current()
if not items:
return None
return self.todo_list.format_list()

async def _react_loop(
self,
messages: List[LLMMessage],
Expand Down Expand Up @@ -135,11 +158,19 @@ async def _react_loop(
context = self.memory.get_context_for_llm() if use_memory else messages

# Call LLM with tools
response = await self._call_llm(
messages=context,
tools=tools,
spinner_message="Analyzing request...",
)
if use_memory:
response = await self._call_with_overflow_recovery(
tools=tools,
spinner_message="Analyzing request...",
)
else:
normalized = self.memory.ensure_call_outputs_present(context)
normalized = self.memory.remove_orphan_outputs(normalized)
response = await self._call_llm(
messages=normalized,
tools=tools,
spinner_message="Analyzing request...",
)

# Save assistant response using response.to_message() for proper format
assistant_msg = response.to_message()
Expand Down
33 changes: 33 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,39 @@ class Config:
MEMORY_SHORT_TERM_MIN_SIZE = int(os.getenv("MEMORY_SHORT_TERM_MIN_SIZE", "6"))
MEMORY_COMPRESSION_RATIO = float(os.getenv("MEMORY_COMPRESSION_RATIO", "0.3"))
MEMORY_PRESERVE_SYSTEM_PROMPTS = True
# --- Write-time tool output truncation ---
# Truncation policy applied to tool outputs before they are stored:
# "none", "bytes", or "tokens" (normalized to lowercase).
TOOL_OUTPUT_TRUNCATION_POLICY = os.getenv("TOOL_OUTPUT_TRUNCATION_POLICY", "tokens").lower()
# Per-output token budget under the "tokens" policy.
TOOL_OUTPUT_MAX_TOKENS = int(os.getenv("TOOL_OUTPUT_MAX_TOKENS", "5000"))
# Rough characters-per-token ratio used to convert between budgets.
APPROX_CHARS_PER_TOKEN = int(os.getenv("APPROX_CHARS_PER_TOKEN", "4"))
# Per-output byte budget under the "bytes" policy; defaults to the token
# budget converted via APPROX_CHARS_PER_TOKEN.
TOOL_OUTPUT_MAX_BYTES = int(
    os.getenv("TOOL_OUTPUT_MAX_BYTES", str(TOOL_OUTPUT_MAX_TOKENS * APPROX_CHARS_PER_TOKEN))
)
# Multiplier applied to size estimates — presumably headroom for
# serialization overhead (e.g. JSON escaping); TODO confirm against usage.
TOOL_OUTPUT_SERIALIZATION_BUFFER = float(os.getenv("TOOL_OUTPUT_SERIALIZATION_BUFFER", "1.2"))

# --- Compaction and context-overflow recovery ---
# Token budget for recent user messages preserved during compaction.
COMPACT_USER_MESSAGE_MAX_TOKENS = int(os.getenv("COMPACT_USER_MESSAGE_MAX_TOKENS", "20000"))
# Extra retry attempts after a context-length-exceeded error (0 disables).
CONTEXT_OVERFLOW_MAX_RETRIES = int(os.getenv("CONTEXT_OVERFLOW_MAX_RETRIES", "3"))
# Comma-separated tool names whose results are kept during compaction;
# blank entries are stripped out.
PROTECTED_TOOLS = [
    name.strip()
    for name in os.getenv("PROTECTED_TOOLS", "manage_todo_list").split(",")
    if name.strip()
]
# Instructions given to the LLM when summarizing history into a
# compaction checkpoint.
COMPACT_SUMMARIZATION_PROMPT = os.getenv(
    "COMPACT_SUMMARIZATION_PROMPT",
    """You are performing a CONTEXT CHECKPOINT COMPACTION.
Create a handoff summary for another LLM that will resume the task.

Include:
- Current progress and key decisions made
- Important context, constraints, or user preferences
- What remains to be done (clear next steps)
- Any critical data needed to continue

Be concise and focused on helping the next LLM seamlessly continue.""",
)
# Text prepended to the generated summary when it is injected back into
# the conversation.
COMPACT_SUMMARY_PREFIX = os.getenv(
    "COMPACT_SUMMARY_PREFIX",
    """Another language model started this task and produced
a summary. Use this to build on existing work and avoid duplication:
""",
)

# Logging Configuration
# Note: Logging is now controlled via --verbose flag
Expand Down
11 changes: 11 additions & 0 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,17 @@ MEMORY_ENABLED=true
MEMORY_COMPRESSION_THRESHOLD=25000
MEMORY_SHORT_TERM_SIZE=100
MEMORY_COMPRESSION_RATIO=0.3
MEMORY_SHORT_TERM_MIN_SIZE=6
COMPACT_USER_MESSAGE_MAX_TOKENS=20000
CONTEXT_OVERFLOW_MAX_RETRIES=3
TOOL_OUTPUT_TRUNCATION_POLICY=tokens
TOOL_OUTPUT_MAX_TOKENS=5000
TOOL_OUTPUT_MAX_BYTES=20000
TOOL_OUTPUT_SERIALIZATION_BUFFER=1.2
APPROX_CHARS_PER_TOKEN=4
PROTECTED_TOOLS=manage_todo_list
COMPACT_SUMMARIZATION_PROMPT="You are performing a CONTEXT CHECKPOINT COMPACTION..."
COMPACT_SUMMARY_PREFIX="Another language model started this task and produced..."
```

## Retry Configuration
Expand Down
15 changes: 15 additions & 0 deletions docs/memory-management.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,21 @@ The memory system addresses this by:
- **Configurable**: Multiple strategies and settings
- **Multi-Provider**: Works with Anthropic, OpenAI, Gemini

## Context Compaction Enhancements

The memory system includes additional safeguards for large outputs and long-running sessions:

- **Write-time tool output truncation**: Large tool outputs are truncated before being stored to protect context.
- Controls: `TOOL_OUTPUT_TRUNCATION_POLICY`, `TOOL_OUTPUT_MAX_TOKENS`, `TOOL_OUTPUT_MAX_BYTES`,
`TOOL_OUTPUT_SERIALIZATION_BUFFER`, `APPROX_CHARS_PER_TOKEN`
- **Context overflow recovery**: On `context_length_exceeded` errors, the agent removes the oldest message
(one per retry attempt, maintaining tool call/result pair integrity) and retries.
- Control: `CONTEXT_OVERFLOW_MAX_RETRIES`
- **User message preservation**: Compaction keeps recent user messages up to a configurable token budget.
- Control: `COMPACT_USER_MESSAGE_MAX_TOKENS`
- **Protected tools**: Tool results like `manage_todo_list` are preserved during compaction.
- Control: `PROTECTED_TOOLS`

## Quick Start

### 1. Enable Memory Management
Expand Down
22 changes: 21 additions & 1 deletion interactive.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def __init__(self, agent):
"theme",
"verbose",
"compact",
"compact-output",
"exit",
"quit",
],
Expand Down Expand Up @@ -101,7 +102,10 @@ def _show_help(self) -> None:
f" [{colors.primary}]/verbose[/{colors.primary}] - Toggle verbose thinking display"
)
terminal_ui.console.print(
f" [{colors.primary}]/compact[/{colors.primary}] - Toggle compact output mode"
f" [{colors.primary}]/compact[/{colors.primary}] - Compact memory now"
)
terminal_ui.console.print(
f" [{colors.primary}]/compact-output[/{colors.primary}] - Toggle compact output mode"
)
terminal_ui.console.print(
f" [{colors.primary}]/exit[/{colors.primary}] - Exit interactive mode"
Expand Down Expand Up @@ -264,6 +268,19 @@ def _toggle_compact(self) -> None:
status = "enabled" if self.compact_mode else "disabled"
terminal_ui.print_info(f"Compact mode {status}")

async def _compact_memory(self) -> None:
    """Run an immediate memory compaction and report the token savings."""
    terminal_ui.print_info("Compacting memory...")
    result = await self.agent.memory.compress()
    if result:
        summary = (
            f"Compaction complete: {result.original_tokens} → {result.compressed_tokens} tokens "
            f"({result.savings_percentage:.1f}% saved)"
        )
        terminal_ui.print_success(summary)
    else:
        terminal_ui.print_info("No messages to compact.")

def _update_status_bar(self) -> None:
"""Update status bar with current stats."""
stats = self.agent.memory.get_stats()
Expand Down Expand Up @@ -326,6 +343,9 @@ async def _handle_command(self, user_input: str) -> bool:
self._toggle_verbose()

elif command == "/compact":
await self._compact_memory()

elif command == "/compact-output":
self._toggle_compact()

else:
Expand Down
24 changes: 24 additions & 0 deletions llm/retry.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,30 @@ def is_rate_limit_error(error: BaseException) -> bool:
return any(indicator in error_str for indicator in rate_limit_indicators)


def is_context_length_error(error: BaseException) -> bool:
    """Check if an error is a context length overflow error.

    Matches on the exception type name first, then falls back to
    case-insensitive substring matching on the error message. Callers use
    this to decide whether dropping old messages and retrying is useful,
    so false positives are destructive (history gets discarded).

    Args:
        error: The exception raised by an LLM provider call.

    Returns:
        True if the error indicates the prompt exceeded the model's
        context window, False otherwise.
    """
    # Provider SDKs often expose dedicated exception classes; check the
    # type name before stringifying the error.
    error_type = type(error).__name__
    if "ContextLengthExceeded" in error_type or "TokenLimit" in error_type:
        return True

    error_str = str(error).lower()
    indicators = [
        "context_length_exceeded",
        "context length",
        "maximum context",
        "max context",
        "prompt is too long",
        "input is too long",
        "input token count",
        "too many tokens",
        "token limit",
        "maximum tokens",
        # NOTE: "max_tokens" is deliberately NOT matched. It is the name of
        # the *output* token request parameter (this codebase passes
        # max_tokens=4096), so validation errors such as "max_tokens must
        # be <= 8192" would be misclassified as context overflow and cause
        # the caller to discard conversation history for an unrelated error.
    ]
    return any(indicator in error_str for indicator in indicators)


def is_retryable_error(error: BaseException) -> bool:
"""Check if an error is retryable."""
if isinstance(error, asyncio.CancelledError):
Expand Down
Loading