diff --git a/src/commands/checkpoint.rs b/src/commands/checkpoint.rs
index 3d7df0bd..1e38acf6 100644
--- a/src/commands/checkpoint.rs
+++ b/src/commands/checkpoint.rs
@@ -156,12 +156,29 @@ pub fn run(
         storage_start.elapsed()
     ));
 
+    // Read checkpoints once and reuse throughout the function.
+    // This eliminates multiple redundant read_all_checkpoints() calls that were
+    // previously the #1 cause of memory overflow (each read deserializes the entire
+    // JSONL file into memory).
+    let read_checkpoints_start = Instant::now();
+    let mut checkpoints = if reset {
+        working_log.reset_working_log()?;
+        Vec::new()
+    } else {
+        working_log.read_all_checkpoints()?
+    };
+    debug_log(&format!(
+        "[BENCHMARK] Reading {} checkpoints took {:?}",
+        checkpoints.len(),
+        read_checkpoints_start.elapsed()
+    ));
+
     // Early exit for human only
     if is_pre_commit {
-        let has_no_ai_edits = working_log
-            .all_ai_touched_files()
-            .map(|files| files.is_empty())
-            .unwrap_or(true);
+        let has_no_ai_edits = checkpoints.iter().all(|cp| {
+            cp.entries.is_empty()
+                || (cp.kind != CheckpointKind::AiAgent && cp.kind != CheckpointKind::AiTab)
+        });
 
         // Also check for INITIAL attributions - these are AI attributions from previous
         // commits that weren't staged (e.g., after an amend). We must process these.
@@ -262,6 +279,7 @@ pub fn run(
         pathspec_start.elapsed()
     ));
 
+    // Pass pre-loaded checkpoints to avoid redundant reads inside get_all_tracked_files
     let files_start = Instant::now();
     let files = get_all_tracked_files(
         repo,
@@ -270,6 +288,7 @@ pub fn run(
         pathspec_filter,
         is_pre_commit,
         &ignore_matcher,
+        Some(&checkpoints),
     )?;
     debug_log(&format!(
         "[BENCHMARK] get_all_tracked_files found {} files, took {:?}",
@@ -277,20 +296,6 @@ pub fn run(
         files.len(),
         files_start.elapsed()
     ));
 
-    let read_checkpoints_start = Instant::now();
-    let mut checkpoints = if reset {
-        // If reset flag is set, start with an empty working log
-        working_log.reset_working_log()?;
-        Vec::new()
-    } else {
-        working_log.read_all_checkpoints()?
-    };
-    debug_log(&format!(
-        "[BENCHMARK] Reading {} checkpoints took {:?}",
-        checkpoints.len(),
-        read_checkpoints_start.elapsed()
-    ));
-
     if show_working_log {
         if checkpoints.is_empty() {
             eprintln!("No working log entries found.");
@@ -607,6 +612,7 @@ fn get_all_tracked_files(
     edited_filepaths: Option<&Vec<String>>,
     is_pre_commit: bool,
     ignore_matcher: &IgnoreMatcher,
+    preloaded_checkpoints: Option<&[Checkpoint]>,
 ) -> Result<HashSet<String>, GitAiError> {
     let mut files: HashSet<String> = edited_filepaths
         .map(|paths| {
@@ -659,28 +665,38 @@ fn get_all_tracked_files(
         initial_read_start.elapsed()
     ));
 
+    // Use pre-loaded checkpoints if available, otherwise read from disk.
+    // This eliminates redundant read_all_checkpoints() calls when the caller
+    // already has the data loaded (e.g., checkpoint::run reads once and passes it through).
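+    // `owned_checkpoints` is assigned only on the fallback path below; declaring
+    // it before the match lets `checkpoint_data` borrow from either source for
+    // the rest of the function.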
+    let owned_checkpoints;
+    let checkpoint_data: &[Checkpoint] = match preloaded_checkpoints {
+        Some(data) => data,
+        None => {
+            owned_checkpoints = working_log.read_all_checkpoints().unwrap_or_default();
+            &owned_checkpoints
+        }
+    };
+
     let checkpoints_read_start = Instant::now();
-    if let Ok(working_log_data) = working_log.read_all_checkpoints() {
-        for checkpoint in &working_log_data {
-            for entry in &checkpoint.entries {
-                // Normalize path separators to forward slashes
-                let normalized_path = normalize_to_posix(&entry.file);
-                // Filter out paths outside the repository to prevent git command failures
-                if !is_path_in_repo(&normalized_path) {
-                    debug_log(&format!(
-                        "Skipping checkpoint file outside repository: {}",
-                        normalized_path
-                    ));
-                    continue;
-                }
-                if should_ignore_file_with_matcher(&normalized_path, ignore_matcher) {
-                    continue;
-                }
-                if !files.contains(&normalized_path) {
-                    // Check if it's a text file before adding
-                    if is_text_file(working_log, &normalized_path) {
-                        files.insert(normalized_path);
-                    }
+    for checkpoint in checkpoint_data {
+        for entry in &checkpoint.entries {
+            // Normalize path separators to forward slashes
+            let normalized_path = normalize_to_posix(&entry.file);
+            // Filter out paths outside the repository to prevent git command failures
+            if !is_path_in_repo(&normalized_path) {
+                debug_log(&format!(
+                    "Skipping checkpoint file outside repository: {}",
+                    normalized_path
+                ));
+                continue;
+            }
+            if should_ignore_file_with_matcher(&normalized_path, ignore_matcher) {
+                continue;
+            }
+            if !files.contains(&normalized_path) {
+                // Check if it's a text file before adding
+                if is_text_file(working_log, &normalized_path) {
+                    files.insert(normalized_path);
                 }
             }
         }
     }
@@ -690,13 +706,10 @@ fn get_all_tracked_files(
         checkpoints_read_start.elapsed()
     ));
 
-    let has_ai_checkpoints = if let Ok(working_log_data) = working_log.read_all_checkpoints() {
-        working_log_data.iter().any(|checkpoint| {
-            checkpoint.kind == CheckpointKind::AiAgent || checkpoint.kind == CheckpointKind::AiTab
-        })
-    } else {
-        false
-    };
+    // Use same checkpoint data to check for AI checkpoints (no extra read)
+    let has_ai_checkpoints = checkpoint_data.iter().any(|checkpoint| {
+        checkpoint.kind == CheckpointKind::AiAgent || checkpoint.kind == CheckpointKind::AiTab
+    });
 
     let status_files_start = Instant::now();
     let mut results_for_tracked_files = if is_pre_commit && !has_ai_checkpoints {
diff --git a/src/git/repo_storage.rs b/src/git/repo_storage.rs
index f46c8b99..123f6820 100644
--- a/src/git/repo_storage.rs
+++ b/src/git/repo_storage.rs
@@ -9,6 +9,7 @@ use serde::{Deserialize, Serialize};
 use sha2::{Digest, Sha256};
 use std::collections::{HashMap, HashSet};
 use std::fs;
+use std::io::{BufRead, BufReader, Write};
 use std::path::{Path, PathBuf};
 
 /// Initial attributions data structure stored in the INITIAL file
@@ -321,8 +322,7 @@ impl PersistedWorkingLog {
     /* append checkpoint */
     pub fn append_checkpoint(&self, checkpoint: &Checkpoint) -> Result<(), GitAiError> {
-        // Read existing checkpoints
-        let mut checkpoints = self.read_all_checkpoints().unwrap_or_default();
+        let checkpoints_file = self.dir.join("checkpoints.jsonl");
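+        // Resolve the JSONL path once up front; the branches below stream this
+        // file directly instead of materializing every checkpoint in memory.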
 
         // Create a copy, potentially without transcript to reduce storage size.
         // Transcripts are refetched in update_prompts_to_latest() before post-commit
@@ -366,15 +366,68 @@ impl PersistedWorkingLog {
             storage_checkpoint.transcript = None;
         }
 
-        // Add the new checkpoint
-        checkpoints.push(storage_checkpoint);
+        // Collect files from the new checkpoint to know which older attributions to prune
+        let new_files: HashSet<String> = storage_checkpoint
+            .entries
+            .iter()
+            .map(|e| e.file.clone())
+            .collect();
+
+        // If the file doesn't exist yet, or the new checkpoint touches no files,
+        // there is nothing to prune and we can append directly. Otherwise rewrite
+        // the log, pruning as we go, to keep the file small during long agent loops.
+        let needs_prune = !new_files.is_empty() && checkpoints_file.exists();
+
+        if needs_prune {
+            // Streaming prune-and-rewrite: read existing checkpoints one at a time,
+            // prune char attributions for files that the new checkpoint supersedes,
+            // write to a temp file, then append the new checkpoint and rename.
+            // Peak memory: one checkpoint at a time + the new checkpoint.
+            let tmp_file = self.dir.join("checkpoints.jsonl.tmp");
+            {
+                let existing = fs::File::open(&checkpoints_file)?;
+                let reader = BufReader::new(existing);
+                let out = fs::File::create(&tmp_file)?;
+                let mut writer = std::io::BufWriter::new(out);
+
+                for line_result in reader.lines() {
+                    let line = line_result?;
+                    if line.trim().is_empty() {
+                        continue;
+                    }
+                    let mut cp: Checkpoint = serde_json::from_str(&line)
+                        .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
+
+                    // Prune char-level attributions from entries whose file is superseded
+                    // by the new checkpoint. Only the newest checkpoint per file needs
+                    // char-level precision.
+                    for entry in &mut cp.entries {
+                        if new_files.contains(&entry.file) {
+                            entry.attributions.clear();
+                        }
+                    }
+
+                    let json_line = serde_json::to_string(&cp)?;
+                    writeln!(writer, "{}", json_line)?;
+                }
 
-        // Prune char-level attributions from older checkpoints for the same files
-        // Only the most recent checkpoint per file needs char-level precision
-        self.prune_old_char_attributions(&mut checkpoints);
+                // Append the new checkpoint
+                let json_line = serde_json::to_string(&storage_checkpoint)?;
+                writeln!(writer, "{}", json_line)?;
+                writer.flush()?;
+            }
+            fs::rename(&tmp_file, &checkpoints_file)?;
+        } else {
+            // No existing file or no overlapping files - simple append
+            let json_line = serde_json::to_string(&storage_checkpoint)?;
+            let mut file = fs::OpenOptions::new()
+                .create(true)
+                .append(true)
+                .open(&checkpoints_file)?;
+            writeln!(file, "{}", json_line)?;
+        }
 
-        // Write all checkpoints back
-        self.write_all_checkpoints(&checkpoints)
+        Ok(())
     }
 
     pub fn read_all_checkpoints(&self) -> Result<Vec<Checkpoint>, GitAiError> {
@@ -384,16 +437,19 @@ impl PersistedWorkingLog {
             return Ok(Vec::new());
         }
 
-        let content = fs::read_to_string(&checkpoints_file)?;
+        // Use BufReader to stream line-by-line instead of loading the entire file into memory.
+        // This avoids holding both the full file string AND the parsed structs simultaneously.
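+        // The returned Vec<Checkpoint> still grows to hold every checkpoint; the
+        // saving is that only one raw line, not the whole file, is buffered
+        // alongside the parsed structs at any time.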
+        let file = fs::File::open(&checkpoints_file)?;
+        let reader = BufReader::new(file);
         let mut checkpoints = Vec::new();
 
-        // Parse JSONL file - each line is a separate JSON object
-        for line in content.lines() {
+        for line_result in reader.lines() {
+            let line = line_result?;
             if line.trim().is_empty() {
                 continue;
             }
 
-            let checkpoint: Checkpoint = serde_json::from_str(line)
+            let checkpoint: Checkpoint = serde_json::from_str(&line)
                 .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
 
             if checkpoint.api_version != CHECKPOINT_API_VERSION {
@@ -454,55 +510,20 @@ impl PersistedWorkingLog {
         Ok(migrated_checkpoints)
     }
 
-    /// Remove char-level attributions from all but the most recent checkpoint per file.
-    /// This reduces storage size while preserving precision for the entries that matter.
-    /// Only the most recent checkpoint entry for each file is used when computing new entries.
-    fn prune_old_char_attributions(&self, checkpoints: &mut [Checkpoint]) {
-        // Track which checkpoint index has the most recent entry for each file
-        // Iterate from newest to oldest
-        let mut newest_for_file: HashMap<String, usize> = HashMap::new();
-
-        for (checkpoint_idx, checkpoint) in checkpoints.iter().enumerate().rev() {
-            for entry in &checkpoint.entries {
-                newest_for_file
-                    .entry(entry.file.clone())
-                    .or_insert(checkpoint_idx);
-            }
-        }
-
-        // Clear attributions from entries that aren't the most recent for their file
-        for (checkpoint_idx, checkpoint) in checkpoints.iter_mut().enumerate() {
-            for entry in &mut checkpoint.entries {
-                if let Some(&newest_idx) = newest_for_file.get(&entry.file)
-                    && checkpoint_idx != newest_idx
-                {
-                    entry.attributions.clear();
-                }
-            }
-        }
-    }
-
-    /// Write all checkpoints to the JSONL file, replacing any existing content
+    /// Write all checkpoints to the JSONL file, replacing any existing content.
     /// Note: Unlike append_checkpoint(), this preserves transcripts because it's used
     /// by post-commit after transcripts have been refetched and need to be preserved
     /// for from_just_working_log() to read them.
     pub fn write_all_checkpoints(&self, checkpoints: &[Checkpoint]) -> Result<(), GitAiError> {
         let checkpoints_file = self.dir.join("checkpoints.jsonl");
 
-        // Serialize all checkpoints to JSONL
-        let mut lines = Vec::new();
-        for checkpoint in checkpoints {
+        let file = fs::File::create(&checkpoints_file)?;
+        let mut writer = std::io::BufWriter::new(file);
+        for checkpoint in checkpoints.iter() {
             let json_line = serde_json::to_string(checkpoint)?;
-            lines.push(json_line);
-        }
-
-        // Write all lines to file
-        let content = lines.join("\n");
-        if !content.is_empty() {
-            fs::write(&checkpoints_file, format!("{}\n", content))?;
-        } else {
-            fs::write(&checkpoints_file, "")?;
+            writeln!(writer, "{}", json_line)?;
         }
+        writer.flush()?;
 
         Ok(())
     }
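
For reference, the prune-and-rewrite in append_checkpoint() is an instance of a
generic stream-transform-rename pattern over a JSONL file. A minimal standalone
sketch, assuming serde_json::Value records and std::io::Result in place of the
repository's Checkpoint and GitAiError types (rewrite_jsonl is a hypothetical
helper for illustration, not part of this patch):

    use std::fs;
    use std::io::{BufRead, BufReader, BufWriter, Write};
    use std::path::Path;

    /// Sketch only: rewrite a JSONL file record by record, holding one record
    /// in memory at a time. Generic Values stand in for the real Checkpoint type.
    fn rewrite_jsonl<F>(path: &Path, mut transform: F) -> std::io::Result<()>
    where
        F: FnMut(serde_json::Value) -> Option<serde_json::Value>,
    {
        // Write to a sibling temp file so a crash mid-rewrite never corrupts the original.
        let tmp = path.with_extension("jsonl.tmp");
        {
            let reader = BufReader::new(fs::File::open(path)?);
            let mut writer = BufWriter::new(fs::File::create(&tmp)?);
            for line in reader.lines() {
                let line = line?;
                if line.trim().is_empty() {
                    continue;
                }
                let record: serde_json::Value = serde_json::from_str(&line)
                    .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
                // None drops the record; Some(v) writes the (possibly pruned) record back.
                if let Some(out) = transform(record) {
                    writeln!(writer, "{}", out)?;
                }
            }
            writer.flush()?;
        }
        // rename is atomic on the same filesystem, so readers see either the old
        // or the new file, never a half-written one.
        fs::rename(&tmp, path)?;
        Ok(())
    }

append_checkpoint() follows the same shape but never drops records: it clears
superseded attributions in place and appends the new checkpoint before the
rename, so the log stays complete while staying small.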