
Commit 9975b99

🤖 fix: exclude reasoning tokens from context threshold calculation (#1275)
## Summary

Fixes a compaction loop with Anthropic Extended Thinking models.

## Problem

The `getTotalTokens()` function included `reasoning.tokens` in the context usage calculation, but reasoning tokens are **output** from the current response, not part of the input context window. Extended Thinking can generate 50k+ reasoning tokens per response, incorrectly inflating the context percentage and triggering force-compaction even when the actual context is low (e.g., 20k input + 50k reasoning = 70k "context" = 35% of a 200k window, and 75k would already put it at 37.5%).

Per the Anthropic docs:

> "The Anthropic API automatically ignores thinking blocks from previous turns and they are not included when calculating context usage."

So `inputTokens` already correctly reflects the context size.

## Changes

1. **Primary fix**: Rename `getTotalTokens` → `getContextTokens` and exclude output/reasoning tokens.
2. **Secondary fix**: Preserve signatures when coalescing consecutive reasoning parts (signatures arrive at the end of streaming and were being lost during the merge).
3. **Tests**: Add a test case verifying that reasoning tokens don't trigger compaction.

## Testing

- `bun test src/browser/utils/compaction/autoCompactionCheck.test.ts` - 34 pass
- `bun test src/browser/utils/messages/modelMessageTransform.test.ts` - 46 pass
- `make typecheck` - pass

---

_Generated with `mux` • Model: `anthropic:claude-opus-4-5` • Thinking: `high`_
1 parent db460aa commit 9975b99
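
To make the arithmetic in the summary concrete, here is a minimal TypeScript sketch of the before/after percentage calculation. The `UsageSketch` type, the `CONTEXT_WINDOW` constant, and the token counts are illustrative assumptions mirroring the `ChatUsageDisplay` shape and the test values in the diff below; this is not code from the repository.

```ts
// Illustrative sketch of the threshold arithmetic described in the summary.
interface UsageSketch {
  input: { tokens: number };
  cached: { tokens: number };
  cacheCreate: { tokens: number };
  output: { tokens: number };
  reasoning: { tokens: number };
}

const CONTEXT_WINDOW = 200_000; // assumed window size from the example

const usage: UsageSketch = {
  input: { tokens: 20_000 },     // actual context sent to the model
  cached: { tokens: 0 },
  cacheCreate: { tokens: 0 },
  output: { tokens: 5_000 },     // response text
  reasoning: { tokens: 50_000 }, // Extended Thinking output
};

// Old behavior: every component counted toward "context"
const oldPct =
  ((usage.input.tokens + usage.cached.tokens + usage.cacheCreate.tokens +
    usage.output.tokens + usage.reasoning.tokens) / CONTEXT_WINDOW) * 100; // 37.5%

// New behavior: only input-side tokens count
const newPct =
  ((usage.input.tokens + usage.cached.tokens + usage.cacheCreate.tokens) /
    CONTEXT_WINDOW) * 100; // 10%

console.log({ oldPct, newPct });
```

With these numbers the old sum reports 37.5% of the window while the actual context is only 10%, which is the gap that caused the force-compaction loop.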

File tree

3 files changed, +64 -19 lines

src/browser/utils/compaction/autoCompactionCheck.test.ts

Lines changed: 35 additions & 8 deletions
```diff
@@ -5,20 +5,21 @@ import type { ChatUsageDisplay } from "@/common/utils/tokens/usageAggregator";
 import { KNOWN_MODELS } from "@/common/constants/knownModels";
 
 // Helper to create a mock usage entry
+// The tokens parameter represents CONTEXT tokens (input + cached + cacheCreate).
+// Output and reasoning are set separately since they don't count toward context.
 const createUsageEntry = (
-  tokens: number,
+  contextTokens: number,
   model: string = KNOWN_MODELS.SONNET.id
 ): ChatUsageDisplay => {
-  // Distribute tokens across different types (realistic pattern)
-  const inputTokens = Math.floor(tokens * 0.6); // 60% input
-  const outputTokens = Math.floor(tokens * 0.3); // 30% output
-  const cachedTokens = Math.floor(tokens * 0.1); // 10% cached
+  // Distribute context tokens (only these count toward compaction threshold)
+  const inputTokens = Math.floor(contextTokens * 0.9); // 90% input
+  const cachedTokens = Math.floor(contextTokens * 0.1); // 10% cached
 
   return {
     input: { tokens: inputTokens },
     cached: { tokens: cachedTokens },
     cacheCreate: { tokens: 0 },
-    output: { tokens: outputTokens },
+    output: { tokens: 1_000 }, // Some output (doesn't affect context calculation)
     reasoning: { tokens: 0 },
     model,
   };
@@ -144,8 +145,34 @@ describe("checkAutoCompaction", () => {
 
     const result = checkAutoCompaction(usage, KNOWN_MODELS.SONNET.id, false);
 
-    // Total: 10k + 5k + 2k + 3k + 1k = 21k tokens = 10.5%
-    expect(result.usagePercentage).toBe(10.5);
+    // Context = input + cached + cacheCreate = 10k + 5k + 2k = 17k tokens = 8.5%
+    // Output and reasoning are excluded (they're response tokens, not context)
+    expect(result.usagePercentage).toBe(8.5);
+  });
+
+  test("excludes output and reasoning tokens from context calculation (prevents compaction loops)", () => {
+    // Extended Thinking can generate 50k+ reasoning tokens. These should NOT
+    // count toward context window limits or trigger compaction loops.
+    const usageEntry = {
+      input: { tokens: 20_000 }, // Low actual context
+      cached: { tokens: 0 },
+      cacheCreate: { tokens: 0 },
+      output: { tokens: 5_000 },
+      reasoning: { tokens: 50_000 }, // High reasoning from Extended Thinking
+      model: KNOWN_MODELS.SONNET.id,
+    };
+    const usage: WorkspaceUsageState = {
+      lastContextUsage: usageEntry,
+      totalTokens: 0,
+    };
+
+    const result = checkAutoCompaction(usage, KNOWN_MODELS.SONNET.id, false);
+
+    // Only input tokens count: 20k = 10% of 200k context
+    // NOT 75k (37.5%) which would incorrectly trigger compaction
+    expect(result.usagePercentage).toBe(10);
+    expect(result.shouldShowWarning).toBe(false);
+    expect(result.shouldForceCompact).toBe(false);
   });
 });
```
src/browser/utils/compaction/autoCompactionCheck.ts

Lines changed: 11 additions & 11 deletions
```diff
@@ -24,15 +24,15 @@ import {
   FORCE_COMPACTION_BUFFER_PERCENT,
 } from "@/common/constants/ui";
 
-/** Sum all token components from a ChatUsageDisplay */
-function getTotalTokens(usage: ChatUsageDisplay): number {
-  return (
-    usage.input.tokens +
-    usage.cached.tokens +
-    usage.cacheCreate.tokens +
-    usage.output.tokens +
-    usage.reasoning.tokens
-  );
+/**
+ * Get context window tokens (input only).
+ * Output and reasoning tokens are excluded because they represent the model's
+ * response, not the context window size. This prevents compaction loops with
+ * Extended Thinking models where high reasoning token counts (50k+) would
+ * incorrectly inflate context usage calculations.
+ */
+function getContextTokens(usage: ChatUsageDisplay): number {
+  return usage.input.tokens + usage.cached.tokens + usage.cacheCreate.tokens;
 }
 
 export interface AutoCompactionCheckResult {
@@ -100,15 +100,15 @@ export function checkAutoCompaction(
   const currentUsage = usage.liveUsage ?? lastUsage;
 
   // Usage percentage from current context (live when streaming, otherwise last completed)
-  const usagePercentage = currentUsage ? (getTotalTokens(currentUsage) / maxTokens) * 100 : 0;
+  const usagePercentage = currentUsage ? (getContextTokens(currentUsage) / maxTokens) * 100 : 0;
 
   // Force-compact when usage exceeds threshold + buffer
   const forceCompactThreshold = thresholdPercentage + FORCE_COMPACTION_BUFFER_PERCENT;
   const shouldForceCompact = usagePercentage >= forceCompactThreshold;
 
   // Warning uses max of last completed and current (live when streaming)
   // This ensures warning shows when live usage spikes above threshold mid-stream
-  const lastUsagePercentage = lastUsage ? (getTotalTokens(lastUsage) / maxTokens) * 100 : 0;
+  const lastUsagePercentage = lastUsage ? (getContextTokens(lastUsage) / maxTokens) * 100 : 0;
   const shouldShowWarning =
     Math.max(lastUsagePercentage, usagePercentage) >= thresholdPercentage - warningAdvancePercent;
 
```
src/browser/utils/messages/modelMessageTransform.ts

Lines changed: 18 additions & 0 deletions
```diff
@@ -582,6 +582,24 @@ function coalesceConsecutiveParts(messages: ModelMessage[]): ModelMessage[] {
       // Merge consecutive reasoning parts (extended thinking)
       if (part.type === "reasoning" && lastPart?.type === "reasoning") {
         lastPart.text += part.text;
+        // Preserve signature from later parts - during streaming, the signature
+        // arrives at the end and is attached to the last reasoning part.
+        // Cast needed because AI SDK's ReasoningPart doesn't have signature,
+        // but our MuxReasoningPart (which flows through convertToModelMessages) does.
+        const partWithSig = part as typeof part & {
+          signature?: string;
+          providerOptions?: { anthropic?: { signature?: string } };
+        };
+        const lastWithSig = lastPart as typeof lastPart & {
+          signature?: string;
+          providerOptions?: { anthropic?: { signature?: string } };
+        };
+        if (partWithSig.signature) {
+          lastWithSig.signature = partWithSig.signature;
+        }
+        if (partWithSig.providerOptions) {
+          lastWithSig.providerOptions = partWithSig.providerOptions;
+        }
         continue;
       }
 
```
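
As a side note on the secondary fix, the sketch below (assumed part shapes and a hypothetical `sig_abc` value, not repository code) shows why the later part's signature data has to be carried forward when consecutive reasoning parts are merged: a text-only merge keeps the first part and silently drops the signature that streaming attaches to the last one.

```ts
// Minimal sketch of coalescing consecutive reasoning parts while keeping the
// signature that arrives on the LAST part during streaming.
type ReasoningPartSketch = {
  type: "reasoning";
  text: string;
  providerOptions?: { anthropic?: { signature?: string } };
};

const parts: ReasoningPartSketch[] = [
  { type: "reasoning", text: "Step 1..." },
  {
    type: "reasoning",
    text: "Step 2.",
    providerOptions: { anthropic: { signature: "sig_abc" } }, // hypothetical value
  },
];

const merged: ReasoningPartSketch = { ...parts[0] };
for (const part of parts.slice(1)) {
  merged.text += part.text;
  // Carry forward signature data from later parts, as the diff above does.
  if (part.providerOptions) {
    merged.providerOptions = part.providerOptions;
  }
}

console.log(merged);
// => { type: "reasoning", text: "Step 1...Step 2.",
//      providerOptions: { anthropic: { signature: "sig_abc" } } }
```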
