
Commit 9975b99

🤖 fix: exclude reasoning tokens from context threshold calculation (#1275)
## Summary

Fixes a compaction loop with Anthropic Extended Thinking models.

## Problem

The `getTotalTokens()` function included `reasoning.tokens` in the context usage calculation, but reasoning tokens are **output** from the current response, not part of the input context window. Extended Thinking can generate 50k+ reasoning tokens per response, incorrectly inflating the context percentage and triggering force-compaction even when the actual context is low (e.g., 20k input + 50k reasoning = 70k "context" = 35% of a 200k window, and 75k would already put it at 37.5%).

Per the Anthropic docs:

> "The Anthropic API automatically ignores thinking blocks from previous turns and they are not included when calculating context usage."

So `inputTokens` already correctly reflects the context size.

## Changes

1. **Primary fix**: Rename `getTotalTokens` → `getContextTokens` and exclude output/reasoning tokens.
2. **Secondary fix**: Preserve signatures when coalescing consecutive reasoning parts (signatures arrive at the end of streaming and were being lost during the merge).
3. **Tests**: Add a test case verifying that reasoning tokens don't trigger compaction.

## Testing

- `bun test src/browser/utils/compaction/autoCompactionCheck.test.ts` - 34 pass
- `bun test src/browser/utils/messages/modelMessageTransform.test.ts` - 46 pass
- `make typecheck` - pass

---

_Generated with `mux` • Model: `anthropic:claude-opus-4-5` • Thinking: `high`_
1 parent db460aa commit 9975b99
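
To make the arithmetic in the summary concrete, here is a minimal TypeScript sketch of the before/after percentage calculation. The `UsageSketch` type, the `CONTEXT_WINDOW` constant, and the token counts are illustrative assumptions mirroring the `ChatUsageDisplay` shape and the test values in the diff below; this is not code from the repository.

```ts
// Illustrative sketch of the threshold arithmetic described in the summary.
interface UsageSketch {
  input: { tokens: number };
  cached: { tokens: number };
  cacheCreate: { tokens: number };
  output: { tokens: number };
  reasoning: { tokens: number };
}

const CONTEXT_WINDOW = 200_000; // assumed window size from the example

const usage: UsageSketch = {
  input: { tokens: 20_000 },     // actual context sent to the model
  cached: { tokens: 0 },
  cacheCreate: { tokens: 0 },
  output: { tokens: 5_000 },     // response text
  reasoning: { tokens: 50_000 }, // Extended Thinking output
};

// Old behavior: every component counted toward "context"
const oldPct =
  ((usage.input.tokens + usage.cached.tokens + usage.cacheCreate.tokens +
    usage.output.tokens + usage.reasoning.tokens) / CONTEXT_WINDOW) * 100; // 37.5%

// New behavior: only input-side tokens count
const newPct =
  ((usage.input.tokens + usage.cached.tokens + usage.cacheCreate.tokens) /
    CONTEXT_WINDOW) * 100; // 10%

console.log({ oldPct, newPct });
```

With these numbers the old sum reports 37.5% of the window while the actual context is only 10%, which is the gap that caused the force-compaction loop.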

File tree

3 files changed, +64 -19 lines

src/browser/utils/compaction/autoCompactionCheck.test.ts

Lines changed: 35 additions & 8 deletions
```diff
@@ -5,20 +5,21 @@ import type { ChatUsageDisplay } from "@/common/utils/tokens/usageAggregator";
 import { KNOWN_MODELS } from "@/common/constants/knownModels";
 
 // Helper to create a mock usage entry
+// The tokens parameter represents CONTEXT tokens (input + cached + cacheCreate).
+// Output and reasoning are set separately since they don't count toward context.
 const createUsageEntry = (
-  tokens: number,
+  contextTokens: number,
   model: string = KNOWN_MODELS.SONNET.id
 ): ChatUsageDisplay => {
-  // Distribute tokens across different types (realistic pattern)
-  const inputTokens = Math.floor(tokens * 0.6); // 60% input
-  const outputTokens = Math.floor(tokens * 0.3); // 30% output
-  const cachedTokens = Math.floor(tokens * 0.1); // 10% cached
+  // Distribute context tokens (only these count toward compaction threshold)
+  const inputTokens = Math.floor(contextTokens * 0.9); // 90% input
+  const cachedTokens = Math.floor(contextTokens * 0.1); // 10% cached
 
   return {
     input: { tokens: inputTokens },
     cached: { tokens: cachedTokens },
     cacheCreate: { tokens: 0 },
-    output: { tokens: outputTokens },
+    output: { tokens: 1_000 }, // Some output (doesn't affect context calculation)
     reasoning: { tokens: 0 },
     model,
   };
@@ -144,8 +145,34 @@ describe("checkAutoCompaction", () => {
 
     const result = checkAutoCompaction(usage, KNOWN_MODELS.SONNET.id, false);
 
-    // Total: 10k + 5k + 2k + 3k + 1k = 21k tokens = 10.5%
-    expect(result.usagePercentage).toBe(10.5);
+    // Context = input + cached + cacheCreate = 10k + 5k + 2k = 17k tokens = 8.5%
+    // Output and reasoning are excluded (they're response tokens, not context)
+    expect(result.usagePercentage).toBe(8.5);
+  });
+
+  test("excludes output and reasoning tokens from context calculation (prevents compaction loops)", () => {
+    // Extended Thinking can generate 50k+ reasoning tokens. These should NOT
+    // count toward context window limits or trigger compaction loops.
+    const usageEntry = {
+      input: { tokens: 20_000 }, // Low actual context
+      cached: { tokens: 0 },
+      cacheCreate: { tokens: 0 },
+      output: { tokens: 5_000 },
+      reasoning: { tokens: 50_000 }, // High reasoning from Extended Thinking
+      model: KNOWN_MODELS.SONNET.id,
+    };
+    const usage: WorkspaceUsageState = {
+      lastContextUsage: usageEntry,
+      totalTokens: 0,
+    };
+
+    const result = checkAutoCompaction(usage, KNOWN_MODELS.SONNET.id, false);
+
+    // Only input tokens count: 20k = 10% of 200k context
+    // NOT 75k (37.5%) which would incorrectly trigger compaction
+    expect(result.usagePercentage).toBe(10);
+    expect(result.shouldShowWarning).toBe(false);
+    expect(result.shouldForceCompact).toBe(false);
   });
 });
```
src/browser/utils/compaction/autoCompactionCheck.ts

Lines changed: 11 additions & 11 deletions
```diff
@@ -24,15 +24,15 @@ import {
   FORCE_COMPACTION_BUFFER_PERCENT,
 } from "@/common/constants/ui";
 
-/** Sum all token components from a ChatUsageDisplay */
-function getTotalTokens(usage: ChatUsageDisplay): number {
-  return (
-    usage.input.tokens +
-    usage.cached.tokens +
-    usage.cacheCreate.tokens +
-    usage.output.tokens +
-    usage.reasoning.tokens
-  );
+/**
+ * Get context window tokens (input only).
+ * Output and reasoning tokens are excluded because they represent the model's
+ * response, not the context window size. This prevents compaction loops with
+ * Extended Thinking models where high reasoning token counts (50k+) would
+ * incorrectly inflate context usage calculations.
+ */
+function getContextTokens(usage: ChatUsageDisplay): number {
+  return usage.input.tokens + usage.cached.tokens + usage.cacheCreate.tokens;
 }
 
 export interface AutoCompactionCheckResult {
@@ -100,15 +100,15 @@ export function checkAutoCompaction(
   const currentUsage = usage.liveUsage ?? lastUsage;
 
   // Usage percentage from current context (live when streaming, otherwise last completed)
-  const usagePercentage = currentUsage ? (getTotalTokens(currentUsage) / maxTokens) * 100 : 0;
+  const usagePercentage = currentUsage ? (getContextTokens(currentUsage) / maxTokens) * 100 : 0;
 
   // Force-compact when usage exceeds threshold + buffer
   const forceCompactThreshold = thresholdPercentage + FORCE_COMPACTION_BUFFER_PERCENT;
   const shouldForceCompact = usagePercentage >= forceCompactThreshold;
 
   // Warning uses max of last completed and current (live when streaming)
   // This ensures warning shows when live usage spikes above threshold mid-stream
-  const lastUsagePercentage = lastUsage ? (getTotalTokens(lastUsage) / maxTokens) * 100 : 0;
+  const lastUsagePercentage = lastUsage ? (getContextTokens(lastUsage) / maxTokens) * 100 : 0;
   const shouldShowWarning =
     Math.max(lastUsagePercentage, usagePercentage) >= thresholdPercentage - warningAdvancePercent;
 
```
src/browser/utils/messages/modelMessageTransform.ts

Lines changed: 18 additions & 0 deletions
```diff
@@ -582,6 +582,24 @@ function coalesceConsecutiveParts(messages: ModelMessage[]): ModelMessage[] {
       // Merge consecutive reasoning parts (extended thinking)
       if (part.type === "reasoning" && lastPart?.type === "reasoning") {
         lastPart.text += part.text;
+        // Preserve signature from later parts - during streaming, the signature
+        // arrives at the end and is attached to the last reasoning part.
+        // Cast needed because AI SDK's ReasoningPart doesn't have signature,
+        // but our MuxReasoningPart (which flows through convertToModelMessages) does.
+        const partWithSig = part as typeof part & {
+          signature?: string;
+          providerOptions?: { anthropic?: { signature?: string } };
+        };
+        const lastWithSig = lastPart as typeof lastPart & {
+          signature?: string;
+          providerOptions?: { anthropic?: { signature?: string } };
+        };
+        if (partWithSig.signature) {
+          lastWithSig.signature = partWithSig.signature;
+        }
+        if (partWithSig.providerOptions) {
+          lastWithSig.providerOptions = partWithSig.providerOptions;
+        }
         continue;
       }
 
```
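
As a side note on the secondary fix, the sketch below (assumed part shapes and a hypothetical `sig_abc` value, not repository code) shows why the later part's signature data has to be carried forward when consecutive reasoning parts are merged: a text-only merge keeps the first part and silently drops the signature that streaming attaches to the last one.

```ts
// Minimal sketch of coalescing consecutive reasoning parts while keeping the
// signature that arrives on the LAST part during streaming.
type ReasoningPartSketch = {
  type: "reasoning";
  text: string;
  providerOptions?: { anthropic?: { signature?: string } };
};

const parts: ReasoningPartSketch[] = [
  { type: "reasoning", text: "Step 1..." },
  {
    type: "reasoning",
    text: "Step 2.",
    providerOptions: { anthropic: { signature: "sig_abc" } }, // hypothetical value
  },
];

const merged: ReasoningPartSketch = { ...parts[0] };
for (const part of parts.slice(1)) {
  merged.text += part.text;
  // Carry forward signature data from later parts, as the diff above does.
  if (part.providerOptions) {
    merged.providerOptions = part.providerOptions;
  }
}

console.log(merged);
// => { type: "reasoning", text: "Step 1...Step 2.",
//      providerOptions: { anthropic: { signature: "sig_abc" } } }
```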
