From f7eb8878453b71fd8054bdec0e2a21b57ad25a13 Mon Sep 17 00:00:00 2001
From: Murali Varadarajan <muraalee@gmail.com>
Date: Mon, 2 Feb 2026 18:03:23 -0800
Subject: [PATCH] fix(reference): improve line attribution accuracy and add
 Claude Code model extraction

The reference implementation had two issues affecting trace accuracy:

1. Line Attribution: When processing edits with context lines (common in
   Claude Code's Edit tool), the entire new_string was attributed to AI,
   including unchanged surrounding lines. This produced inflated attribution
   ranges.

2. Model Identification: Claude Code does not include the model identifier
   in hook payloads. Traces were created with missing model_id, making it
   impossible to distinguish which model produced the code.

Changes:
- Add diffToFindChangedLines() to compute actual changed lines by comparing
  old_string and new_string, excluding context lines from attribution
- Add extractModelFromTranscript() to parse Claude Code's JSONL transcript
  files and extract the model identifier from message entries
- Add resolveModel() helper to transparently handle model resolution for
  both Cursor (direct payload) and Claude Code (transcript extraction)
- Update PostToolUse, SessionStart, and SessionEnd handlers to use the
  new model resolution logic
---
 reference/trace-hook.ts  |  39 ++++++++-
 reference/trace-store.ts | 169 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 198 insertions(+), 10 deletions(-)

diff --git a/reference/trace-hook.ts b/reference/trace-hook.ts
index 37977dc..ac9018e 100644
--- a/reference/trace-hook.ts
+++ b/reference/trace-hook.ts
@@ -1,11 +1,23 @@
 #!/usr/bin/env bun
 
+/**
+ * Agent Trace Hook Handler
+ *
+ * This script processes hook events from AI coding tools (Cursor, Claude Code)
+ * and generates trace records for attribution tracking. It reads JSON input
+ * from stdin and dispatches to the appropriate handler based on hook_event_name.
+ *
+ * Supported tools:
+ * - Cursor: afterFileEdit, afterTabFileEdit, afterShellExecution, sessionStart, sessionEnd
+ * - Claude Code: PostToolUse, SessionStart, SessionEnd
+ */
+
 import {
   createTrace,
   appendTrace,
   computeRangePositions,
   tryReadFile,
-  type ContributorType,
+  extractModelFromTranscript,
   type FileEdit,
 } from "./trace-store";
 
@@ -32,6 +44,25 @@ interface HookInput {
   cwd?: string;
 }
 
+/**
+ * Picks the model identifier to record for a hook event.
+ *
+ * Tools differ in where the model is available:
+ * - Cursor: the hook payload carries it directly as `input.model`
+ * - Claude Code: the payload has no model, so it is looked up in the transcript
+ *   file named by `input.transcript_path`
+ *
+ * Returns undefined when neither source yields a model.
+ */
+function resolveModel(input: HookInput): string | undefined {
+  const { model, transcript_path: transcriptPath } = input;
+
+  // A model carried by the payload takes precedence over the transcript
+  if (model) return model;
+
+  return transcriptPath ? extractModelFromTranscript(transcriptPath) : undefined;
+}
+
 const handlers: Record<string, (input: HookInput) => void> = {
   afterFileEdit: (input) => {
     const rangePositions = computeRangePositions(input.edits ?? [], tryReadFile(input.file_path!));
@@ -108,7 +139,7 @@ const handlers: Record<string, (input: HookInput) => void> = {
       : undefined;
 
     appendTrace(createTrace("ai", file, {
-      model: input.model,
+      model: resolveModel(input),
       rangePositions,
       transcript: input.transcript_path,
       metadata: {
@@ -122,14 +153,14 @@ const handlers: Record<string, (input: HookInput) => void> = {
 
   SessionStart: (input) => {
     appendTrace(createTrace("ai", ".sessions", {
-      model: input.model,
+      model: resolveModel(input),
       metadata: { event: "session_start", session_id: input.session_id, source: input.source },
     }));
   },
 
   SessionEnd: (input) => {
     appendTrace(createTrace("ai", ".sessions", {
-      model: input.model,
+      model: resolveModel(input),
       metadata: { event: "session_end", session_id: input.session_id, reason: input.reason },
     }));
   },
diff --git a/reference/trace-store.ts b/reference/trace-store.ts
index c9ccf55..3536722 100644
--- a/reference/trace-store.ts
+++ b/reference/trace-store.ts
@@ -1,5 +1,5 @@
 import { execFileSync } from "child_process";
-import { existsSync, mkdirSync, appendFileSync, readFileSync } from "fs";
+import { existsSync, mkdirSync, appendFileSync, readFileSync, openSync, fstatSync, readSync, closeSync } from "fs";
 import { join, relative } from "path";
 
 export interface Range {
@@ -94,30 +94,187 @@ export function normalizeModelId(model?: string): string | undefined {
   return model;
 }
 
+/**
+ * Extracts the model identifier from a Claude Code transcript file.
+ *
+ * Claude Code stores conversation transcripts as JSONL; the model identifier
+ * is read from `entry.message.model`. Only the tail of the file is read, in
+ * chunks that double until a model is found, so the most recent model wins
+ * (the model may change during a session).
+ *
+ * The file descriptor is released on every exit path, including failed reads.
+ *
+ * @param transcriptPath - Absolute path to the Claude Code transcript JSONL file
+ * @returns The model identifier (e.g., "claude-opus-4-5-20251101"), or undefined
+ *   if the file cannot be read or no entry carries a string model
+ *
+ * @example
+ * ```typescript
+ * const model = extractModelFromTranscript("/path/to/transcript.jsonl");
+ * // Returns: "claude-opus-4-5-20251101"
+ * ```
+ */
+export function extractModelFromTranscript(transcriptPath: string): string | undefined {
+  let fd: number | undefined;
+  try {
+    fd = openSync(transcriptPath, "r");
+    const { size } = fstatSync(fd);
+
+    // Start with 4KB and double until a model is found or the whole file is read
+    let readSize = Math.min(size, 4 * 1024);
+
+    while (readSize > 0) {
+      const buffer = Buffer.alloc(readSize);
+      const bytesRead = readSync(fd, buffer, 0, readSize, size - readSize);
+      const lines = buffer.toString("utf-8", 0, bytesRead).split("\n");
+
+      // Iterate from the end so the most recent model wins
+      for (let i = lines.length - 1; i >= 0; i--) {
+        const line = lines[i].trim();
+        if (!line) continue;
+
+        try {
+          const model: unknown = JSON.parse(line)?.message?.model;
+          if (typeof model === "string" && model) return model;
+        } catch {
+          // Skip malformed/partial JSON lines (e.g. a line cut by the chunk boundary)
+        }
+      }
+
+      // No model found in this chunk; stop once the whole file has been scanned
+      if (readSize >= size) break;
+      readSize = Math.min(size, readSize * 2);
+    }
+
+    return undefined;
+  } catch {
+    // File doesn't exist or isn't readable
+    return undefined;
+  } finally {
+    // Always release the descriptor, including when fstatSync/readSync throws
+    if (fd !== undefined) closeSync(fd);
+  }
+}
+
 export interface RangePosition {
   start_line: number;
   end_line: number;
 }
 
+/**
+ * Computes which lines in `newStr` are new or modified compared to `oldStr`.
+ *
+ * Some tools (like Claude Code's Edit tool) repeat surrounding context lines in
+ * both `old_string` and `new_string`; those must not be attributed to the AI.
+ * Lines on a longest common subsequence (LCS) of the two line lists count as
+ * context and every other line of `newStr` is reported. A greedy first-match
+ * scan is not enough: matching an early "}" or blank line against a late one
+ * in `oldStr` would report the real context lines after it as changed.
+ *
+ * @param oldStr - The original string before the edit
+ * @param newStr - The new string after the edit
+ * @returns Ascending 0-indexed line offsets within `newStr` that are new or modified
+ *
+ * @example
+ * ```typescript
+ * diffToFindChangedLines("line1\nline2\nline3", "line1\nNEW LINE\nline3"); // [1]
+ * ```
+ */
+function diffToFindChangedLines(oldStr: string, newStr: string): number[] {
+  const oldLines = oldStr.split("\n");
+  const newLines = newStr.split("\n");
+
+  // lcs[i][j] = LCS length of oldLines[i..] and newLines[j..]; the extra zero row
+  // and column are the empty suffixes. O(old x new) memory, fine for edit-sized input.
+  const lcs = Array.from({ length: oldLines.length + 1 }, () => new Array<number>(newLines.length + 1).fill(0));
+  for (let i = oldLines.length - 1; i >= 0; i--) {
+    for (let j = newLines.length - 1; j >= 0; j--) {
+      const skip = Math.max(lcs[i + 1][j], lcs[i][j + 1]);
+      lcs[i][j] = oldLines[i] === newLines[j] ? lcs[i + 1][j + 1] + 1 : skip;
+    }
+  }
+
+  // Walk both lists from the top, following the table to split context from changes
+  const changedOffsets: number[] = [];
+  let oldIdx = 0;
+  let newIdx = 0;
+  while (newIdx < newLines.length) {
+    if (oldIdx < oldLines.length && oldLines[oldIdx] === newLines[newIdx]) {
+      // Matching line on the LCS path - this is context, not a change
+      oldIdx++;
+      newIdx++;
+    } else if (oldIdx < oldLines.length && lcs[oldIdx + 1][newIdx] >= lcs[oldIdx][newIdx + 1]) {
+      // Old line was deleted or replaced - it occupies no line in newStr
+      oldIdx++;
+    } else {
+      // Line is genuinely new or modified - attribute to AI
+      changedOffsets.push(newIdx++);
+    }
+  }
+
+  return changedOffsets;
+}
+
 export function computeRangePositions(edits: FileEdit[], fileContent?: string): RangePosition[] {
   return edits
     .filter((e) => e.new_string)
-    .map((edit) => {
+    .flatMap((edit) => {
+      // Case 1: the tool reported an explicit line range, so use it as-is
       if (edit.range) {
-        return {
+        return [{
           start_line: edit.range.start_line_number,
           end_line: edit.range.end_line_number,
-        };
+        }];
       }
+
+      // Case 2: old and new text are both known; diff them so unchanged context lines are not attributed
       if (edit.old_string && edit.new_string && fileContent) {
+        const idx = fileContent.indexOf(edit.new_string);
+        if (idx !== -1) {
+          const startLine = fileContent.substring(0, idx).split("\n").length;
+          const changedOffsets = diffToFindChangedLines(edit.old_string, edit.new_string);
+
+          if (changedOffsets.length === 0) {
+            return [];
+          }
+
+          // Merge consecutive 0-based offsets into ranges; startLine is the 1-based line where new_string begins
+          const ranges: RangePosition[] = [];
+          let rangeStart = changedOffsets[0];
+          let rangeEnd = changedOffsets[0];
+
+          for (let i = 1; i < changedOffsets.length; i++) {
+            if (changedOffsets[i] === rangeEnd + 1) {
+              rangeEnd = changedOffsets[i];
+            } else {
+              ranges.push({
+                start_line: startLine + rangeStart,
+                end_line: startLine + rangeEnd,
+              });
+              rangeStart = changedOffsets[i];
+              rangeEnd = changedOffsets[i];
+            }
+          }
+
+          ranges.push({
+            start_line: startLine + rangeStart,
+            end_line: startLine + rangeEnd,
+          });
+
+          return ranges;
+        }
+      }
+
+      // Case 3: fallback - attribute every line of new_string (the pre-existing behavior)
       const lineCount = edit.new_string.split("\n").length;
       if (fileContent) {
         const idx = fileContent.indexOf(edit.new_string);
         if (idx !== -1) {
           const startLine = fileContent.substring(0, idx).split("\n").length;
-          return { start_line: startLine, end_line: startLine + lineCount - 1 };
+          return [{ start_line: startLine, end_line: startLine + lineCount - 1 }];
         }
       }
-      return { start_line: 1, end_line: lineCount };
+      return [{ start_line: 1, end_line: lineCount }];
     });
 }