From 32f5041d5a3994ca7009595e181aab4e7c27a773 Mon Sep 17 00:00:00 2001 From: Zhe Yu Date: Sat, 7 Jun 2025 16:52:51 +0800 Subject: [PATCH 1/2] feat(cli): store file hash in metadata for change checking. --- src/vectorcode/subcommands/update.py | 2 +- src/vectorcode/subcommands/vectorise.py | 38 +++++++++++---- tests/subcommands/test_vectorise.py | 63 +++++++++++++++++++++---- 3 files changed, 84 insertions(+), 19 deletions(-) diff --git a/src/vectorcode/subcommands/update.py b/src/vectorcode/subcommands/update.py index 2044a2a7..901c605f 100644 --- a/src/vectorcode/subcommands/update.py +++ b/src/vectorcode/subcommands/update.py @@ -72,7 +72,7 @@ async def update(configs: Config) -> int: for task in asyncio.as_completed(tasks): await task bar.update(1) - except asyncio.CancelledError: + except asyncio.CancelledError: # pragma: nocover print("Abort.", file=sys.stderr) return 1 diff --git a/src/vectorcode/subcommands/vectorise.py b/src/vectorcode/subcommands/vectorise.py index a838124f..a45bc361 100644 --- a/src/vectorcode/subcommands/vectorise.py +++ b/src/vectorcode/subcommands/vectorise.py @@ -32,6 +32,19 @@ def hash_str(string: str) -> str: return hashlib.sha256(string.encode()).hexdigest() +def hash_file(path: str) -> str: + """return the sha-256 hash of a file.""" + hasher = hashlib.sha256() + with open(path, "rb") as file: + while True: + chunk = file.read(8192) + if chunk: + hasher.update(chunk) + else: + break + return hasher.hexdigest() + + def get_uuid() -> str: return uuid.uuid4().hex @@ -47,15 +60,21 @@ async def chunked_add( semaphore: asyncio.Semaphore, ): full_path_str = str(expand_path(str(file_path), True)) + orig_sha256 = None + new_sha256 = hash_file(full_path_str) async with collection_lock: - num_existing_chunks = len( - ( - await collection.get( - where={"path": full_path_str}, - include=[IncludeEnum.metadatas], - ) - )["ids"] + existing_chunks = await collection.get( + where={"path": full_path_str}, + include=[IncludeEnum.metadatas], ) + num_existing_chunks = len((existing_chunks)["ids"]) + if existing_chunks["metadatas"]: + orig_sha256 = existing_chunks["metadatas"][0].get("sha256") + if orig_sha256 and orig_sha256 == new_sha256: + logger.debug( + f"Skipping {full_path_str} because it's unchanged since last vectorisation." + ) + return if num_existing_chunks: logger.debug( @@ -78,7 +97,10 @@ async def chunked_add( logger.debug(f"Chunked into {len(chunks)} pieces.") metas = [] for chunk in chunks: - meta: dict[str, str | int] = {"path": full_path_str} + meta: dict[str, str | int] = { + "path": full_path_str, + "sha256": new_sha256, + } if isinstance(chunk, Chunk): meta["start"] = chunk.start.row meta["end"] = chunk.end.row diff --git a/tests/subcommands/test_vectorise.py b/tests/subcommands/test_vectorise.py index 24e60c62..027f43a8 100644 --- a/tests/subcommands/test_vectorise.py +++ b/tests/subcommands/test_vectorise.py @@ -3,6 +3,7 @@ import json import os import socket +import tempfile from contextlib import ExitStack from unittest.mock import AsyncMock, MagicMock, mock_open, patch @@ -17,6 +18,7 @@ chunked_add, exclude_paths_by_spec, get_uuid, + hash_file, hash_str, include_paths_by_spec, load_files_from_include, @@ -31,6 +33,36 @@ def test_hash_str(): assert hash_str(test_string) == expected_hash +def test_hash_file_basic(): + content = b"This is a test file for hashing." + expected_hash = hashlib.sha256(content).hexdigest() + + with tempfile.NamedTemporaryFile(delete=False) as tmp_file: + tmp_file.write(content) + tmp_file_path = tmp_file.name + + try: + actual_hash = hash_file(tmp_file_path) + assert actual_hash == expected_hash + finally: + os.remove(tmp_file_path) + + +def test_hash_file_empty(): + content = b"" + expected_hash = hashlib.sha256(content).hexdigest() + + with tempfile.NamedTemporaryFile(delete=False) as tmp_file: + tmp_file.write(content) + tmp_file_path = tmp_file.name + + try: + actual_hash = hash_file(tmp_file_path) + assert actual_hash == expected_hash + finally: + os.remove(tmp_file_path) + + def test_get_uuid(): uuid_str = get_uuid() assert isinstance(uuid_str, str) @@ -48,7 +80,11 @@ async def test_chunked_add(): max_batch_size = 50 semaphore = asyncio.Semaphore(1) - with patch("vectorcode.chunking.TreeSitterChunker.chunk") as mock_chunk: + with ( + patch("vectorcode.chunking.TreeSitterChunker.chunk") as mock_chunk, + patch("vectorcode.subcommands.vectorise.hash_file") as mock_hash_file, + ): + mock_hash_file.return_value = "hash1" mock_chunk.return_value = [Chunk("chunk1", Point(1, 0), Point(1, 5)), "chunk2"] await chunked_add( file_path, @@ -72,7 +108,7 @@ async def test_chunked_add_with_existing(): file_path = "test_file.py" collection = AsyncMock() collection.get = AsyncMock() - collection.get.return_value = {"ids": ["id1"]} + collection.get.return_value = {"ids": ["id1"], "metadatas": [{"sha256": "hash1"}]} collection_lock = asyncio.Lock() stats = {"add": 0, "update": 0} stats_lock = asyncio.Lock() @@ -80,7 +116,11 @@ async def test_chunked_add_with_existing(): max_batch_size = 50 semaphore = asyncio.Semaphore(1) - with patch("vectorcode.chunking.TreeSitterChunker.chunk") as mock_chunk: + with ( + patch("vectorcode.chunking.TreeSitterChunker.chunk") as mock_chunk, + patch("vectorcode.subcommands.vectorise.hash_file") as mock_hash_file, + ): + mock_hash_file.return_value = "hash1" mock_chunk.return_value = [Chunk("chunk1", Point(1, 0), Point(1, 5)), "chunk2"] await chunked_add( file_path, @@ -94,11 +134,8 @@ async def test_chunked_add_with_existing(): ) assert stats["add"] == 0 - assert stats["update"] == 1 - collection.add.assert_called() - assert collection.add.call_count == 1 - collection.delete.assert_called - assert collection.delete.call_count == 1 + assert stats["update"] == 0 + collection.add.assert_not_called() @pytest.mark.asyncio @@ -112,7 +149,11 @@ async def test_chunked_add_empty_file(): max_batch_size = 50 semaphore = asyncio.Semaphore(1) - with patch("vectorcode.chunking.TreeSitterChunker.chunk") as mock_chunk: + with ( + patch("vectorcode.chunking.TreeSitterChunker.chunk") as mock_chunk, + patch("vectorcode.subcommands.vectorise.hash_file") as mock_hash_file, + ): + mock_hash_file.return_value = "hash1" mock_chunk.return_value = [] await chunked_add( file_path, @@ -390,7 +431,7 @@ async def test_vectorise_orphaned_files(): "metadatas": [{"path": "test_file.py"}, {"path": "non_existent_file.py"}] } mock_collection.get.side_effect = [ - {"ids": []}, # Return value for chunked_add + {"ids": [], "metadatas": []}, # Return value for chunked_add get_return, # Return value for orphaned files ] mock_collection.delete.return_value = None @@ -428,7 +469,9 @@ def is_file_side_effect(path): "vectorcode.subcommands.vectorise.expand_globs", return_value=["test_file.py"], # Ensure expand_globs returns a valid file ), + patch("vectorcode.subcommands.vectorise.hash_file") as mock_hash_file, ): + mock_hash_file.return_value = "hash1" result = await vectorise(configs) assert result == 0 From f443789343c5401158d5246bb7ded35f52d0b854 Mon Sep 17 00:00:00 2001 From: Zhe Yu Date: Sun, 8 Jun 2025 13:55:00 +0800 Subject: [PATCH 2/2] test(cli): add test for `chunked_add` when updating existing chunks --- tests/subcommands/test_vectorise.py | 35 +++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tests/subcommands/test_vectorise.py b/tests/subcommands/test_vectorise.py index 027f43a8..37b2b109 100644 --- a/tests/subcommands/test_vectorise.py +++ b/tests/subcommands/test_vectorise.py @@ -138,6 +138,41 @@ async def test_chunked_add_with_existing(): collection.add.assert_not_called() +@pytest.mark.asyncio +async def test_chunked_add_update_existing(): + file_path = "test_file.py" + collection = AsyncMock() + collection.get = AsyncMock() + collection.get.return_value = {"ids": ["id1"], "metadatas": [{"sha256": "hash1"}]} + collection_lock = asyncio.Lock() + stats = {"add": 0, "update": 0} + stats_lock = asyncio.Lock() + configs = Config(chunk_size=100, overlap_ratio=0.2, project_root=".") + max_batch_size = 50 + semaphore = asyncio.Semaphore(1) + + with ( + patch("vectorcode.chunking.TreeSitterChunker.chunk") as mock_chunk, + patch("vectorcode.subcommands.vectorise.hash_file") as mock_hash_file, + ): + mock_hash_file.return_value = "hash2" + mock_chunk.return_value = [Chunk("chunk1", Point(1, 0), Point(1, 5)), "chunk2"] + await chunked_add( + file_path, + collection, + collection_lock, + stats, + stats_lock, + configs, + max_batch_size, + semaphore, + ) + + assert stats["add"] == 0 + assert stats["update"] == 1 + collection.add.assert_called() + + @pytest.mark.asyncio async def test_chunked_add_empty_file(): file_path = "test_file.py"