Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,16 @@ Avoid over-specifying implementation details; focus on the "what" and "why", not

Review existing RFCs before implementation to understand design decisions and constraints.

## Code Review Standards (RFC Conformance)

When reviewing code against an RFC:

1. **State the RFC's core invariant in one sentence** before reading code
2. **Trace data flow**: for key variables, verify input set == output set
3. **Check all paths**: normal, fallback, error — do they all satisfy the invariant?

If you can't answer "yes" with line numbers, dig deeper.

## Async Runtime Rules

- **New runtime code must be async-first**: avoid introducing new blocking I/O in `agent/`, `llm/`, `memory/`, and `tools/`.
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,9 @@ See the full configuration template in `.env.example`. Key options:
| `MAX_ITERATIONS` | Maximum agent iterations | `100` |
| `MEMORY_COMPRESSION_THRESHOLD` | Compress when exceeded | `25000` |
| `MEMORY_SHORT_TERM_SIZE` | Recent messages to keep | `100` |
| `COMPACT_USER_MESSAGE_MAX_TOKENS` | User message budget during compaction | `20000` |
| `TOOL_OUTPUT_TRUNCATION_POLICY` | Truncate tool outputs (`none|bytes|tokens`) | `tokens` |
| `CONTEXT_OVERFLOW_MAX_RETRIES` | Retries on context overflow | `3` |
| `RETRY_MAX_ATTEMPTS` | Retry attempts for rate limits | `3` |
| `LOG_LEVEL` | Logging level | `DEBUG` |

Expand Down
71 changes: 51 additions & 20 deletions agent/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, List, Optional

from config import Config
from llm import LLMMessage, LLMResponse, StopReason, ToolResult
from llm.retry import is_context_length_error
from memory import MemoryManager
from tools.base import BaseTool
from tools.todo import TodoTool
Expand Down Expand Up @@ -52,10 +54,6 @@ def __init__(
# Initialize memory manager (uses Config directly)
self.memory = MemoryManager(llm)

# Set up todo context provider for memory compression
# This injects current todo state into summaries instead of preserving all todo messages
self.memory.set_todo_context_provider(self._get_todo_context)

@abstractmethod
def run(self, task: str) -> str:
"""Execute the agent on a task and return final answer."""
Expand Down Expand Up @@ -84,6 +82,42 @@ async def _call_llm(
messages=messages, tools=tools, max_tokens=4096, **kwargs
)

async def _call_with_overflow_recovery(
    self,
    tools: Optional[List] = None,
    spinner_message: str = "Thinking...",
    **kwargs,
) -> LLMResponse:
    """Call the LLM, shrinking memory and retrying on context overflow.

    The context is rebuilt from memory before every attempt. When a call
    fails with a context-length error, the oldest stored message is
    dropped (keeping tool call/result pairs intact) and the call is
    retried, up to ``Config.CONTEXT_OVERFLOW_MAX_RETRIES`` extra attempts.

    Args:
        tools: Optional tool definitions to expose to the model.
        spinner_message: Status text shown while the call is in flight.
        **kwargs: Extra arguments forwarded to ``_call_llm``.

    Returns:
        The first successful LLM response.

    Raises:
        The last context-length error once the retry budget is exhausted
        or memory cannot shrink further; any non-overflow error is
        re-raised immediately.
    """
    retry_budget = max(0, Config.CONTEXT_OVERFLOW_MAX_RETRIES)
    overflow_error: Optional[BaseException] = None

    attempt = 0
    while attempt <= retry_budget:
        try:
            return await self._call_llm(
                messages=self.memory.get_context_for_llm(),
                tools=tools,
                spinner_message=spinner_message,
                **kwargs,
            )
        except Exception as e:  # noqa: BLE001
            if not is_context_length_error(e):
                raise
            overflow_error = e
            # Shrink memory; give up when nothing can be removed safely.
            if self.memory.remove_oldest_with_pair_integrity() is None:
                break
            logger.warning(
                "Context length exceeded; removed oldest message and retrying (%s/%s)",
                attempt + 1,
                retry_budget + 1,
            )
        attempt += 1

    if overflow_error:
        raise overflow_error
    raise RuntimeError("Context overflow recovery failed without an error.")

def _extract_text(self, response: LLMResponse) -> str:
"""Extract text from LLM response.

Expand All @@ -95,17 +129,6 @@ def _extract_text(self, response: LLMResponse) -> str:
"""
return self.llm.extract_text(response)

def _get_todo_context(self) -> Optional[str]:
"""Get current todo list state for memory compression.

Returns formatted todo list if items exist, None otherwise.
This is used by MemoryManager to inject todo state into summaries.
"""
items = self.todo_list.get_current()
if not items:
return None
return self.todo_list.format_list()

async def _react_loop(
self,
messages: List[LLMMessage],
Expand Down Expand Up @@ -135,11 +158,19 @@ async def _react_loop(
context = self.memory.get_context_for_llm() if use_memory else messages

# Call LLM with tools
response = await self._call_llm(
messages=context,
tools=tools,
spinner_message="Analyzing request...",
)
if use_memory:
response = await self._call_with_overflow_recovery(
tools=tools,
spinner_message="Analyzing request...",
)
else:
normalized = self.memory.ensure_call_outputs_present(context)
normalized = self.memory.remove_orphan_outputs(normalized)
response = await self._call_llm(
messages=normalized,
tools=tools,
spinner_message="Analyzing request...",
)

# Save assistant response using response.to_message() for proper format
assistant_msg = response.to_message()
Expand Down
33 changes: 33 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,39 @@ class Config:
MEMORY_SHORT_TERM_MIN_SIZE = int(os.getenv("MEMORY_SHORT_TERM_MIN_SIZE", "6"))
MEMORY_COMPRESSION_RATIO = float(os.getenv("MEMORY_COMPRESSION_RATIO", "0.3"))
MEMORY_PRESERVE_SYSTEM_PROMPTS = True
# --- Write-time tool output truncation ---
# Truncation policy applied to tool outputs before they are stored:
# "none", "bytes", or "tokens" (normalized to lowercase).
TOOL_OUTPUT_TRUNCATION_POLICY = os.getenv("TOOL_OUTPUT_TRUNCATION_POLICY", "tokens").lower()
# Per-output token budget under the "tokens" policy.
TOOL_OUTPUT_MAX_TOKENS = int(os.getenv("TOOL_OUTPUT_MAX_TOKENS", "5000"))
# Rough characters-per-token ratio used to convert between budgets.
APPROX_CHARS_PER_TOKEN = int(os.getenv("APPROX_CHARS_PER_TOKEN", "4"))
# Per-output byte budget under the "bytes" policy; defaults to the token
# budget converted via APPROX_CHARS_PER_TOKEN.
TOOL_OUTPUT_MAX_BYTES = int(
    os.getenv("TOOL_OUTPUT_MAX_BYTES", str(TOOL_OUTPUT_MAX_TOKENS * APPROX_CHARS_PER_TOKEN))
)
# Multiplier applied to size estimates — presumably headroom for
# serialization overhead (e.g. JSON escaping); TODO confirm against usage.
TOOL_OUTPUT_SERIALIZATION_BUFFER = float(os.getenv("TOOL_OUTPUT_SERIALIZATION_BUFFER", "1.2"))

# --- Compaction and context-overflow recovery ---
# Token budget for recent user messages preserved during compaction.
COMPACT_USER_MESSAGE_MAX_TOKENS = int(os.getenv("COMPACT_USER_MESSAGE_MAX_TOKENS", "20000"))
# Extra retry attempts after a context-length-exceeded error (0 disables).
CONTEXT_OVERFLOW_MAX_RETRIES = int(os.getenv("CONTEXT_OVERFLOW_MAX_RETRIES", "3"))
# Comma-separated tool names whose results are kept during compaction;
# blank entries are stripped out.
PROTECTED_TOOLS = [
    name.strip()
    for name in os.getenv("PROTECTED_TOOLS", "manage_todo_list").split(",")
    if name.strip()
]
# Instructions given to the LLM when summarizing history into a
# compaction checkpoint.
COMPACT_SUMMARIZATION_PROMPT = os.getenv(
    "COMPACT_SUMMARIZATION_PROMPT",
    """You are performing a CONTEXT CHECKPOINT COMPACTION.
Create a handoff summary for another LLM that will resume the task.

Include:
- Current progress and key decisions made
- Important context, constraints, or user preferences
- What remains to be done (clear next steps)
- Any critical data needed to continue

Be concise and focused on helping the next LLM seamlessly continue.""",
)
# Text prepended to the generated summary when it is injected back into
# the conversation.
COMPACT_SUMMARY_PREFIX = os.getenv(
    "COMPACT_SUMMARY_PREFIX",
    """Another language model started this task and produced
a summary. Use this to build on existing work and avoid duplication:
""",
)

# Logging Configuration
# Note: Logging is now controlled via --verbose flag
Expand Down
11 changes: 11 additions & 0 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,17 @@ MEMORY_ENABLED=true
MEMORY_COMPRESSION_THRESHOLD=25000
MEMORY_SHORT_TERM_SIZE=100
MEMORY_COMPRESSION_RATIO=0.3
MEMORY_SHORT_TERM_MIN_SIZE=6
COMPACT_USER_MESSAGE_MAX_TOKENS=20000
CONTEXT_OVERFLOW_MAX_RETRIES=3
TOOL_OUTPUT_TRUNCATION_POLICY=tokens
TOOL_OUTPUT_MAX_TOKENS=5000
TOOL_OUTPUT_MAX_BYTES=20000
TOOL_OUTPUT_SERIALIZATION_BUFFER=1.2
APPROX_CHARS_PER_TOKEN=4
PROTECTED_TOOLS=manage_todo_list
COMPACT_SUMMARIZATION_PROMPT="You are performing a CONTEXT CHECKPOINT COMPACTION..."
COMPACT_SUMMARY_PREFIX="Another language model started this task and produced..."
```

## Retry Configuration
Expand Down
15 changes: 15 additions & 0 deletions docs/memory-management.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,21 @@ The memory system addresses this by:
- **Configurable**: Multiple strategies and settings
- **Multi-Provider**: Works with Anthropic, OpenAI, Gemini

## Context Compaction Enhancements

The memory system includes additional safeguards for large outputs and long-running sessions:

- **Write-time tool output truncation**: Large tool outputs are truncated before being stored to protect context.
- Controls: `TOOL_OUTPUT_TRUNCATION_POLICY`, `TOOL_OUTPUT_MAX_TOKENS`, `TOOL_OUTPUT_MAX_BYTES`,
`TOOL_OUTPUT_SERIALIZATION_BUFFER`, `APPROX_CHARS_PER_TOKEN`
- **Context overflow recovery**: On `context_length_exceeded` errors, the agent removes the oldest message
(one per retry attempt, maintaining tool call/result pair integrity) and retries.
- Control: `CONTEXT_OVERFLOW_MAX_RETRIES`
- **User message preservation**: Compaction keeps recent user messages up to a configurable token budget.
- Control: `COMPACT_USER_MESSAGE_MAX_TOKENS`
- **Protected tools**: Tool results like `manage_todo_list` are preserved during compaction.
- Control: `PROTECTED_TOOLS`

## Quick Start

### 1. Enable Memory Management
Expand Down
22 changes: 21 additions & 1 deletion interactive.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def __init__(self, agent):
"theme",
"verbose",
"compact",
"compact-output",
"exit",
"quit",
],
Expand Down Expand Up @@ -101,7 +102,10 @@ def _show_help(self) -> None:
f" [{colors.primary}]/verbose[/{colors.primary}] - Toggle verbose thinking display"
)
terminal_ui.console.print(
f" [{colors.primary}]/compact[/{colors.primary}] - Toggle compact output mode"
f" [{colors.primary}]/compact[/{colors.primary}] - Compact memory now"
)
terminal_ui.console.print(
f" [{colors.primary}]/compact-output[/{colors.primary}] - Toggle compact output mode"
)
terminal_ui.console.print(
f" [{colors.primary}]/exit[/{colors.primary}] - Exit interactive mode"
Expand Down Expand Up @@ -264,6 +268,19 @@ def _toggle_compact(self) -> None:
status = "enabled" if self.compact_mode else "disabled"
terminal_ui.print_info(f"Compact mode {status}")

async def _compact_memory(self) -> None:
    """Run an immediate memory compaction and report the token savings."""
    terminal_ui.print_info("Compacting memory...")
    result = await self.agent.memory.compress()
    if result:
        summary = (
            f"Compaction complete: {result.original_tokens} → {result.compressed_tokens} tokens "
            f"({result.savings_percentage:.1f}% saved)"
        )
        terminal_ui.print_success(summary)
    else:
        terminal_ui.print_info("No messages to compact.")

def _update_status_bar(self) -> None:
"""Update status bar with current stats."""
stats = self.agent.memory.get_stats()
Expand Down Expand Up @@ -326,6 +343,9 @@ async def _handle_command(self, user_input: str) -> bool:
self._toggle_verbose()

elif command == "/compact":
await self._compact_memory()

elif command == "/compact-output":
self._toggle_compact()

else:
Expand Down
24 changes: 24 additions & 0 deletions llm/retry.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,30 @@ def is_rate_limit_error(error: BaseException) -> bool:
return any(indicator in error_str for indicator in rate_limit_indicators)


def is_context_length_error(error: BaseException) -> bool:
    """Check if an error is a context length overflow error.

    Matches on the exception type name first, then falls back to
    case-insensitive substring matching on the error message. Callers use
    this to decide whether dropping old messages and retrying is useful,
    so false positives are destructive (history gets discarded).

    Args:
        error: The exception raised by an LLM provider call.

    Returns:
        True if the error indicates the prompt exceeded the model's
        context window, False otherwise.
    """
    # Provider SDKs often expose dedicated exception classes; check the
    # type name before stringifying the error.
    error_type = type(error).__name__
    if "ContextLengthExceeded" in error_type or "TokenLimit" in error_type:
        return True

    error_str = str(error).lower()
    indicators = [
        "context_length_exceeded",
        "context length",
        "maximum context",
        "max context",
        "prompt is too long",
        "input is too long",
        "input token count",
        "too many tokens",
        "token limit",
        "maximum tokens",
        # NOTE: "max_tokens" is deliberately NOT matched. It is the name of
        # the *output* token request parameter (this codebase passes
        # max_tokens=4096), so validation errors such as "max_tokens must
        # be <= 8192" would be misclassified as context overflow and cause
        # the caller to discard conversation history for an unrelated error.
    ]
    return any(indicator in error_str for indicator in indicators)


def is_retryable_error(error: BaseException) -> bool:
"""Check if an error is retryable."""
if isinstance(error, asyncio.CancelledError):
Expand Down
Loading