From 12442132ffb231ceddad304cfb47c7ccc3affd8b Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Sat, 21 Feb 2026 03:14:55 +0000
Subject: [PATCH 1/5] fix: reduce memory overflow from checkpoint reads and
 writes

- Make append_checkpoint truly append-only (O(1) instead of O(N) read-write-all)
- Use BufReader for streaming JSONL reads instead of read_to_string
- Eliminate 3 redundant read_all_checkpoints() calls in checkpoint::run()
- Pass pre-loaded checkpoints to get_all_tracked_files
- Defer char-level attribution pruning to write_all_checkpoints
- Use BufWriter for efficient checkpoint serialization

Addresses #344

Co-Authored-By: Sasha Varlamov
---
 src/authorship/post_commit.rs        |   2 +-
 src/commands/checkpoint.rs           | 105 +++++++++++++++------------
 src/commands/hooks/checkout_hooks.rs |   4 +-
 src/git/repo_storage.rs              |  65 ++++++++++-------
 4 files changed, 99 insertions(+), 77 deletions(-)

diff --git a/src/authorship/post_commit.rs b/src/authorship/post_commit.rs
index 6c18c8ce4..f87cd8582 100644
--- a/src/authorship/post_commit.rs
+++ b/src/authorship/post_commit.rs
@@ -92,7 +92,7 @@ pub fn post_commit(
         );
     }
 
-    working_log.write_all_checkpoints(&parent_working_log)?;
+    working_log.write_all_checkpoints(&mut parent_working_log)?;
 
     // Create VirtualAttributions from working log (fast path - no blame)
     // We don't need to run blame because we only care about the working log data
diff --git a/src/commands/checkpoint.rs b/src/commands/checkpoint.rs
index 3d7df0bd8..1e38acf68 100644
--- a/src/commands/checkpoint.rs
+++ b/src/commands/checkpoint.rs
@@ -156,12 +156,29 @@ pub fn run(
         storage_start.elapsed()
     ));
 
+    // Read checkpoints once and reuse throughout the function.
+    // This eliminates multiple redundant read_all_checkpoints() calls that were
+    // previously the #1 cause of memory overflow (each read deserializes the entire
+    // JSONL file into memory).
+    let read_checkpoints_start = Instant::now();
+    let mut checkpoints = if reset {
+        working_log.reset_working_log()?;
+        Vec::new()
+    } else {
+        working_log.read_all_checkpoints()?
+    };
+    debug_log(&format!(
+        "[BENCHMARK] Reading {} checkpoints took {:?}",
+        checkpoints.len(),
+        read_checkpoints_start.elapsed()
+    ));
+
     // Early exit for human only
     if is_pre_commit {
-        let has_no_ai_edits = working_log
-            .all_ai_touched_files()
-            .map(|files| files.is_empty())
-            .unwrap_or(true);
+        let has_no_ai_edits = checkpoints.iter().all(|cp| {
+            cp.entries.is_empty()
+                || (cp.kind != CheckpointKind::AiAgent && cp.kind != CheckpointKind::AiTab)
+        });
 
         // Also check for INITIAL attributions - these are AI attributions from previous
         // commits that weren't staged (e.g., after an amend). We must process these.
@@ -262,6 +279,7 @@ pub fn run(
         pathspec_start.elapsed()
     ));
 
+    // Pass pre-loaded checkpoints to avoid redundant reads inside get_all_tracked_files
     let files_start = Instant::now();
    let files = get_all_tracked_files(
        repo,
@@ -270,6 +288,7 @@
         pathspec_filter,
         is_pre_commit,
         &ignore_matcher,
+        Some(&checkpoints),
     )?;
     debug_log(&format!(
         "[BENCHMARK] get_all_tracked_files found {} files, took {:?}",
@@ -277,20 +296,6 @@
         files_start.elapsed()
     ));
 
-    let read_checkpoints_start = Instant::now();
-    let mut checkpoints = if reset {
-        // If reset flag is set, start with an empty working log
-        working_log.reset_working_log()?;
-        Vec::new()
-    } else {
-        working_log.read_all_checkpoints()?
-    };
-    debug_log(&format!(
-        "[BENCHMARK] Reading {} checkpoints took {:?}",
-        checkpoints.len(),
-        read_checkpoints_start.elapsed()
-    ));
-
     if show_working_log {
         if checkpoints.is_empty() {
             eprintln!("No working log entries found.");
@@ -607,6 +612,7 @@ fn get_all_tracked_files(
     edited_filepaths: Option<&Vec<String>>,
     is_pre_commit: bool,
     ignore_matcher: &IgnoreMatcher,
+    preloaded_checkpoints: Option<&[Checkpoint]>,
 ) -> Result<Vec<String>, GitAiError> {
     let mut files: HashSet<String> = edited_filepaths
         .map(|paths| {
@@ -659,28 +665,38 @@ fn get_all_tracked_files(
         initial_read_start.elapsed()
     ));
 
+    // Use pre-loaded checkpoints if available, otherwise read from disk.
+    // This eliminates redundant read_all_checkpoints() calls when the caller
+    // already has the data loaded (e.g., checkpoint::run reads once and passes it through).
+    let owned_checkpoints;
+    let checkpoint_data: &[Checkpoint] = match preloaded_checkpoints {
+        Some(data) => data,
+        None => {
+            owned_checkpoints = working_log.read_all_checkpoints().unwrap_or_default();
+            &owned_checkpoints
+        }
+    };
+
     let checkpoints_read_start = Instant::now();
-    if let Ok(working_log_data) = working_log.read_all_checkpoints() {
-        for checkpoint in &working_log_data {
-            for entry in &checkpoint.entries {
-                // Normalize path separators to forward slashes
-                let normalized_path = normalize_to_posix(&entry.file);
-                // Filter out paths outside the repository to prevent git command failures
-                if !is_path_in_repo(&normalized_path) {
-                    debug_log(&format!(
-                        "Skipping checkpoint file outside repository: {}",
-                        normalized_path
-                    ));
-                    continue;
-                }
-                if should_ignore_file_with_matcher(&normalized_path, ignore_matcher) {
-                    continue;
-                }
-                if !files.contains(&normalized_path) {
-                    // Check if it's a text file before adding
-                    if is_text_file(working_log, &normalized_path) {
-                        files.insert(normalized_path);
-                    }
+    for checkpoint in checkpoint_data {
+        for entry in &checkpoint.entries {
+            // Normalize path separators to forward slashes
+            let normalized_path = normalize_to_posix(&entry.file);
+            // Filter out paths outside the repository to prevent git command failures
+            if !is_path_in_repo(&normalized_path) {
+                debug_log(&format!(
+                    "Skipping checkpoint file outside repository: {}",
+                    normalized_path
+                ));
+                continue;
+            }
+            if should_ignore_file_with_matcher(&normalized_path, ignore_matcher) {
+                continue;
+            }
+            if !files.contains(&normalized_path) {
+                // Check if it's a text file before adding
+                if is_text_file(working_log, &normalized_path) {
+                    files.insert(normalized_path);
                 }
             }
         }
     }
     debug_log(&format!(
         checkpoints_read_start.elapsed()
     ));
 
-    let has_ai_checkpoints = if let Ok(working_log_data) = working_log.read_all_checkpoints() {
-        working_log_data.iter().any(|checkpoint| {
-            checkpoint.kind == CheckpointKind::AiAgent || checkpoint.kind == CheckpointKind::AiTab
-        })
-    } else {
-        false
-    };
+    // Use the same checkpoint data to check for AI checkpoints (no extra read)
+    let has_ai_checkpoints = checkpoint_data.iter().any(|checkpoint| {
+        checkpoint.kind == CheckpointKind::AiAgent || checkpoint.kind == CheckpointKind::AiTab
+    });
 
     let status_files_start = Instant::now();
     let mut results_for_tracked_files = if is_pre_commit && !has_ai_checkpoints {
diff --git a/src/commands/hooks/checkout_hooks.rs b/src/commands/hooks/checkout_hooks.rs
index 49804e0ec..69736e307 100644
--- a/src/commands/hooks/checkout_hooks.rs
+++ b/src/commands/hooks/checkout_hooks.rs
@@ -166,7 +166,7 @@ fn remove_attributions_for_pathspecs(repository: &Repository, head: &str, pathsp
     // Filter checkpoints
     if let Ok(checkpoints) = working_log.read_all_checkpoints() {
-        let filtered: Vec<_> = checkpoints
+        let mut filtered: Vec<_> = checkpoints
             .into_iter()
             .map(|mut cp| {
                 cp.entries
@@ -175,7 +175,7 @@ fn remove_attributions_for_pathspecs(repository: &Repository, head: &str, pathsp
             })
             .filter(|cp| !cp.entries.is_empty())
             .collect();
-        let _ = working_log.write_all_checkpoints(&filtered);
+        let _ = working_log.write_all_checkpoints(&mut filtered);
     }
 }
diff --git a/src/git/repo_storage.rs b/src/git/repo_storage.rs
index f46c8b994..9a953bc1a 100644
--- a/src/git/repo_storage.rs
+++ b/src/git/repo_storage.rs
@@ -9,6 +9,7 @@ use serde::{Deserialize, Serialize};
 use sha2::{Digest, Sha256};
 use std::collections::{HashMap, HashSet};
 use std::fs;
+use std::io::{BufRead, BufReader, Write};
 use std::path::{Path, PathBuf};
 
 /// Initial attributions data structure stored in the INITIAL file
@@ -321,8 +322,7 @@ impl PersistedWorkingLog {
     /* append checkpoint */
     pub fn append_checkpoint(&self, checkpoint: &Checkpoint) -> Result<(), GitAiError> {
-        // Read existing checkpoints
-        let mut checkpoints = self.read_all_checkpoints().unwrap_or_default();
+        let checkpoints_file = self.dir.join("checkpoints.jsonl");
 
         // Create a copy, potentially without transcript to reduce storage size.
         // Transcripts are refetched in update_prompts_to_latest() before post-commit
@@ -366,15 +366,18 @@ impl PersistedWorkingLog {
             storage_checkpoint.transcript = None;
         }
 
-        // Add the new checkpoint
-        checkpoints.push(storage_checkpoint);
-
-        // Prune char-level attributions from older checkpoints for the same files
-        // Only the most recent checkpoint per file needs char-level precision
-        self.prune_old_char_attributions(&mut checkpoints);
+        // Append only the new checkpoint to the file (O(1) instead of O(N))
+        // This avoids reading all existing checkpoints just to add one.
+        // Char-level attribution pruning is deferred to write_all_checkpoints(),
+        // which is called by post_commit after reading all checkpoints anyway.
+        let json_line = serde_json::to_string(&storage_checkpoint)?;
+        let mut file = fs::OpenOptions::new()
+            .create(true)
+            .append(true)
+            .open(&checkpoints_file)?;
+        writeln!(file, "{}", json_line)?;
 
-        // Write all checkpoints back
-        self.write_all_checkpoints(&checkpoints)
+        Ok(())
     }
 
     pub fn read_all_checkpoints(&self) -> Result<Vec<Checkpoint>, GitAiError> {
@@ -384,16 +387,19 @@
             return Ok(Vec::new());
         }
 
-        let content = fs::read_to_string(&checkpoints_file)?;
+        // Use BufReader to stream line-by-line instead of loading entire file into memory.
+        // This avoids holding both the full file string AND the parsed structs simultaneously.
+        let file = fs::File::open(&checkpoints_file)?;
+        let reader = BufReader::new(file);
         let mut checkpoints = Vec::new();
 
-        // Parse JSONL file - each line is a separate JSON object
-        for line in content.lines() {
+        for line_result in reader.lines() {
+            let line = line_result?;
             if line.trim().is_empty() {
                 continue;
             }
 
-            let checkpoint: Checkpoint = serde_json::from_str(line)
+            let checkpoint: Checkpoint = serde_json::from_str(&line)
                 .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
 
             if checkpoint.api_version != CHECKPOINT_API_VERSION {
@@ -482,26 +488,29 @@
         }
     }
 
-    /// Write all checkpoints to the JSONL file, replacing any existing content
+    /// Write all checkpoints to the JSONL file, replacing any existing content.
+    /// Also prunes char-level attributions from older checkpoints (deferred from append_checkpoint).
     /// Note: Unlike append_checkpoint(), this preserves transcripts because it's used
     /// by post-commit after transcripts have been refetched and need to be preserved
     /// for from_just_working_log() to read them.
-    pub fn write_all_checkpoints(&self, checkpoints: &[Checkpoint]) -> Result<(), GitAiError> {
+    pub fn write_all_checkpoints(
+        &self,
+        checkpoints: &mut [Checkpoint],
+    ) -> Result<(), GitAiError> {
         let checkpoints_file = self.dir.join("checkpoints.jsonl");
 
-        // Serialize all checkpoints to JSONL
-        let mut lines = Vec::new();
-        for checkpoint in checkpoints {
-            let json_line = serde_json::to_string(checkpoint)?;
-            lines.push(json_line);
-        }
+        // Prune char-level attributions from older checkpoints for the same files.
+        // Only the most recent checkpoint per file needs char-level precision.
+        // This was previously done in append_checkpoint but is now deferred here
+        // to avoid the read-all-then-write-all pattern on every append.
+        self.prune_old_char_attributions(checkpoints);
 
-        // Write all lines to file
-        let content = lines.join("\n");
-        if !content.is_empty() {
-            fs::write(&checkpoints_file, format!("{}\n", content))?;
-        } else {
-            fs::write(&checkpoints_file, "")?;
+        // Use BufWriter for efficient serialization directly to file
+        let file = fs::File::create(&checkpoints_file)?;
+        let mut writer = std::io::BufWriter::new(file);
+        for checkpoint in checkpoints.iter() {
+            let json_line = serde_json::to_string(checkpoint)?;
+            writeln!(writer, "{}", json_line)?;
         }
 
         Ok(())

From 365e555025a6573fcd8f8057e90ae3a7173c6c69 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Sat, 21 Feb 2026 03:38:45 +0000
Subject: [PATCH 2/5] style: fix rustfmt formatting

Co-Authored-By: Sasha Varlamov
---
 src/git/repo_storage.rs | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/git/repo_storage.rs b/src/git/repo_storage.rs
index 9a953bc1a..8a9a99cb6 100644
--- a/src/git/repo_storage.rs
+++ b/src/git/repo_storage.rs
@@ -493,10 +493,7 @@ impl PersistedWorkingLog {
     /// Note: Unlike append_checkpoint(), this preserves transcripts because it's used
     /// by post-commit after transcripts have been refetched and need to be preserved
     /// for from_just_working_log() to read them.
-    pub fn write_all_checkpoints(
-        &self,
-        checkpoints: &mut [Checkpoint],
-    ) -> Result<(), GitAiError> {
+    pub fn write_all_checkpoints(&self, checkpoints: &mut [Checkpoint]) -> Result<(), GitAiError> {
         let checkpoints_file = self.dir.join("checkpoints.jsonl");
 
         // Prune char-level attributions from older checkpoints for the same files.
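
[Series note] Patch 3 below exists because the BufWriter introduced in patch 1
flushes its buffer in Drop, and Drop cannot propagate I/O errors: a failed
final write (for example on a full disk) would be silently ignored. A minimal
standalone sketch of the failure mode and the fix; write_lines and its
arguments are illustrative helpers, not part of the patches:

    use std::fs::File;
    use std::io::{BufWriter, Write};
    use std::path::Path;

    // Mirrors the write path of write_all_checkpoints: buffer JSONL lines,
    // then flush explicitly so write errors surface as a Result.
    fn write_lines(path: &Path, lines: &[String]) -> std::io::Result<()> {
        let file = File::create(path)?;
        let mut writer = BufWriter::new(file);
        for line in lines {
            writeln!(writer, "{}", line)?; // may still sit in the buffer
        }
        // Without this call, the final flush happens in BufWriter's Drop,
        // where any error is swallowed rather than returned to the caller.
        writer.flush()
    }
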
From a3b78a283f5bc6bf731d2da135e1359278cd591b Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Sat, 21 Feb 2026 05:10:52 +0000
Subject: [PATCH 3/5] fix: explicitly flush BufWriter in write_all_checkpoints
 to avoid silent data loss

Co-Authored-By: Sasha Varlamov
---
 src/git/repo_storage.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/git/repo_storage.rs b/src/git/repo_storage.rs
index 8a9a99cb6..448ca661d 100644
--- a/src/git/repo_storage.rs
+++ b/src/git/repo_storage.rs
@@ -509,6 +509,7 @@ impl PersistedWorkingLog {
             let json_line = serde_json::to_string(checkpoint)?;
             writeln!(writer, "{}", json_line)?;
         }
+        writer.flush()?;
 
         Ok(())
     }

From b4c90f1ef4780170e1ac47fb662fb28fa95c751b Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Sat, 21 Feb 2026 23:31:27 +0000
Subject: [PATCH 4/5] fix: restore streaming prune-on-append to keep
 checkpoint file small during agent loops

Co-Authored-By: Sasha Varlamov
---
 src/authorship/post_commit.rs        |  2 +-
 src/commands/hooks/checkout_hooks.rs |  4 +-
 src/git/repo_storage.rs              | 80 +++++++++++++++++++++-------
 3 files changed, 64 insertions(+), 22 deletions(-)

diff --git a/src/authorship/post_commit.rs b/src/authorship/post_commit.rs
index f87cd8582..6c18c8ce4 100644
--- a/src/authorship/post_commit.rs
+++ b/src/authorship/post_commit.rs
@@ -92,7 +92,7 @@ pub fn post_commit(
         );
     }
 
-    working_log.write_all_checkpoints(&mut parent_working_log)?;
+    working_log.write_all_checkpoints(&parent_working_log)?;
 
     // Create VirtualAttributions from working log (fast path - no blame)
     // We don't need to run blame because we only care about the working log data
diff --git a/src/commands/hooks/checkout_hooks.rs b/src/commands/hooks/checkout_hooks.rs
index 69736e307..49804e0ec 100644
--- a/src/commands/hooks/checkout_hooks.rs
+++ b/src/commands/hooks/checkout_hooks.rs
@@ -166,7 +166,7 @@ fn remove_attributions_for_pathspecs(repository: &Repository, head: &str, pathsp
     // Filter checkpoints
     if let Ok(checkpoints) = working_log.read_all_checkpoints() {
-        let mut filtered: Vec<_> = checkpoints
+        let filtered: Vec<_> = checkpoints
             .into_iter()
             .map(|mut cp| {
                 cp.entries
@@ -175,7 +175,7 @@ fn remove_attributions_for_pathspecs(repository: &Repository, head: &str, pathsp
             })
             .filter(|cp| !cp.entries.is_empty())
             .collect();
-        let _ = working_log.write_all_checkpoints(&mut filtered);
+        let _ = working_log.write_all_checkpoints(&filtered);
     }
 }
diff --git a/src/git/repo_storage.rs b/src/git/repo_storage.rs
index 448ca661d..a55784ac8 100644
--- a/src/git/repo_storage.rs
+++ b/src/git/repo_storage.rs
@@ -366,16 +366,66 @@ impl PersistedWorkingLog {
             storage_checkpoint.transcript = None;
         }
 
-        // Append only the new checkpoint to the file (O(1) instead of O(N))
-        // This avoids reading all existing checkpoints just to add one.
-        // Char-level attribution pruning is deferred to write_all_checkpoints(),
-        // which is called by post_commit after reading all checkpoints anyway.
-        let json_line = serde_json::to_string(&storage_checkpoint)?;
-        let mut file = fs::OpenOptions::new()
-            .create(true)
-            .append(true)
-            .open(&checkpoints_file)?;
-        writeln!(file, "{}", json_line)?;
+        // Collect files from the new checkpoint to know which older attributions to prune
+        let new_files: HashSet<String> = storage_checkpoint
+            .entries
+            .iter()
+            .map(|e| e.file.clone())
+            .collect();
+
+        // If the checkpoint file doesn't exist yet, or the new checkpoint touches no
+        // files, there is nothing to prune and a plain append suffices. Otherwise,
+        // rewrite the file, pruning as we stream, to keep it small during agent loops.
+        let needs_prune = !new_files.is_empty() && checkpoints_file.exists();
+
+        if needs_prune {
+            // Streaming prune-and-rewrite: read existing checkpoints one at a time,
+            // prune char attributions for files that the new checkpoint supersedes,
+            // write to a temp file, then append the new checkpoint and rename.
+            // Peak memory: one checkpoint at a time plus the new checkpoint.
+            let tmp_file = self.dir.join("checkpoints.jsonl.tmp");
+            {
+                let existing = fs::File::open(&checkpoints_file)?;
+                let reader = BufReader::new(existing);
+                let out = fs::File::create(&tmp_file)?;
+                let mut writer = std::io::BufWriter::new(out);
+
+                for line_result in reader.lines() {
+                    let line = line_result?;
+                    if line.trim().is_empty() {
+                        continue;
+                    }
+                    let mut cp: Checkpoint = serde_json::from_str(&line)
+                        .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
+
+                    // Prune char-level attributions from entries whose file is superseded
+                    // by the new checkpoint. Only the newest checkpoint per file needs
+                    // char-level precision.
+                    for entry in &mut cp.entries {
+                        if new_files.contains(&entry.file) {
+                            entry.attributions.clear();
+                        }
+                    }
+
+                    let json_line = serde_json::to_string(&cp)?;
+                    writeln!(writer, "{}", json_line)?;
+                }
+
+                // Append the new checkpoint
+                let json_line = serde_json::to_string(&storage_checkpoint)?;
+                writeln!(writer, "{}", json_line)?;
+                writer.flush()?;
+            }
+            fs::rename(&tmp_file, &checkpoints_file)?;
+        } else {
+            // No existing file, or the new checkpoint touches no files: simple append
+            let json_line = serde_json::to_string(&storage_checkpoint)?;
+            let mut file = fs::OpenOptions::new()
+                .create(true)
+                .append(true)
+                .open(&checkpoints_file)?;
+            writeln!(file, "{}", json_line)?;
+        }
 
         Ok(())
     }
@@ -489,20 +539,12 @@ impl PersistedWorkingLog {
     }
 
     /// Write all checkpoints to the JSONL file, replacing any existing content.
-    /// Also prunes char-level attributions from older checkpoints (deferred from append_checkpoint).
     /// Note: Unlike append_checkpoint(), this preserves transcripts because it's used
     /// by post-commit after transcripts have been refetched and need to be preserved
     /// for from_just_working_log() to read them.
-    pub fn write_all_checkpoints(&self, checkpoints: &mut [Checkpoint]) -> Result<(), GitAiError> {
+    pub fn write_all_checkpoints(&self, checkpoints: &[Checkpoint]) -> Result<(), GitAiError> {
         let checkpoints_file = self.dir.join("checkpoints.jsonl");
 
-        // Prune char-level attributions from older checkpoints for the same files.
-        // Only the most recent checkpoint per file needs char-level precision.
-        // This was previously done in append_checkpoint but is now deferred here
-        // to avoid the read-all-then-write-all pattern on every append.
-        self.prune_old_char_attributions(checkpoints);
-
-        // Use BufWriter for efficient serialization directly to file
         let file = fs::File::create(&checkpoints_file)?;
         let mut writer = std::io::BufWriter::new(file);
         for checkpoint in checkpoints.iter() {
             let json_line = serde_json::to_string(checkpoint)?;
             writeln!(writer, "{}", json_line)?;
         }

From d9ce41717a421a39f0346121b5aa043dde7e38c2 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Sat, 21 Feb 2026 23:33:06 +0000
Subject: [PATCH 5/5] fix: remove unused prune_old_char_attributions (now
 inlined in append_checkpoint)

Co-Authored-By: Sasha Varlamov
---
 src/git/repo_storage.rs | 28 ----------------------------
 1 file changed, 28 deletions(-)

diff --git a/src/git/repo_storage.rs b/src/git/repo_storage.rs
index a55784ac8..123f6820c 100644
--- a/src/git/repo_storage.rs
+++ b/src/git/repo_storage.rs
@@ -510,34 +510,6 @@ impl PersistedWorkingLog {
         Ok(migrated_checkpoints)
     }
 
-    /// Remove char-level attributions from all but the most recent checkpoint per file.
-    /// This reduces storage size while preserving precision for the entries that matter.
-    /// Only the most recent checkpoint entry for each file is used when computing new entries.
-    fn prune_old_char_attributions(&self, checkpoints: &mut [Checkpoint]) {
-        // Track which checkpoint index has the most recent entry for each file
-        // Iterate from newest to oldest
-        let mut newest_for_file: HashMap<String, usize> = HashMap::new();
-
-        for (checkpoint_idx, checkpoint) in checkpoints.iter().enumerate().rev() {
-            for entry in &checkpoint.entries {
-                newest_for_file
-                    .entry(entry.file.clone())
-                    .or_insert(checkpoint_idx);
-            }
-        }
-
-        // Clear attributions from entries that aren't the most recent for their file
-        for (checkpoint_idx, checkpoint) in checkpoints.iter_mut().enumerate() {
-            for entry in &mut checkpoint.entries {
-                if let Some(&newest_idx) = newest_for_file.get(&entry.file)
-                    && checkpoint_idx != newest_idx
-                {
-                    entry.attributions.clear();
-                }
-            }
-        }
-    }
-
     /// Write all checkpoints to the JSONL file, replacing any existing content.
     /// Note: Unlike append_checkpoint(), this preserves transcripts because it's used
     /// by post-commit after transcripts have been refetched and need to be preserved
     /// for from_just_working_log() to read them.
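
[Series note] For reference, the invariant the streaming prune in patch 4
maintains, shown as an in-memory sketch with simplified stand-in types (Entry
and Checkpoint here are illustrative, not the crate's real definitions): after
every append, only the newest checkpoint touching a given file keeps its
char-level attributions.

    use std::collections::HashSet;

    struct Entry {
        file: String,
        attributions: Vec<u32>, // stand-in for char-level attribution data
    }

    struct Checkpoint {
        entries: Vec<Entry>,
    }

    // Append `new` to `log`, clearing char-level attributions on any older
    // entry whose file the new checkpoint supersedes. The on-disk version in
    // patch 4 maintains the same invariant one JSONL line at a time.
    fn append_with_prune(log: &mut Vec<Checkpoint>, new: Checkpoint) {
        let touched: HashSet<&str> = new.entries.iter().map(|e| e.file.as_str()).collect();
        for cp in log.iter_mut() {
            for e in &mut cp.entries {
                if touched.contains(e.file.as_str()) {
                    e.attributions.clear();
                }
            }
        }
        log.push(new);
    }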