src/vectorcode/subcommands/update.py (2 changes: 1 addition & 1 deletion)
@@ -72,7 +72,7 @@ async def update(configs: Config) -> int:
             for task in asyncio.as_completed(tasks):
                 await task
                 bar.update(1)
-    except asyncio.CancelledError:
+    except asyncio.CancelledError:  # pragma: nocover
         print("Abort.", file=sys.stderr)
         return 1

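The added `# pragma: nocover` excludes the `except asyncio.CancelledError` branch from coverage measurement (coverage.py's default exclusion regex also matches the `nocover` spelling), a common choice when a branch is hard to hit deterministically in tests. A minimal, self-contained sketch of the pattern the branch handles; the `worker` coroutine is hypothetical, not code from this PR:

    import asyncio

    async def worker():
        await asyncio.sleep(10)  # stands in for a long-running update task

    async def main() -> int:
        task = asyncio.create_task(worker())
        await asyncio.sleep(0)  # let the worker start
        task.cancel()  # request cancellation
        try:
            await task
        except asyncio.CancelledError:
            print("Abort.")  # mirrors the branch guarded in update.py
            return 1
        return 0

    print(asyncio.run(main()))  # prints "Abort." then 1
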
src/vectorcode/subcommands/vectorise.py (38 changes: 30 additions & 8 deletions)
@@ -32,6 +32,19 @@ def hash_str(string: str) -> str:
     return hashlib.sha256(string.encode()).hexdigest()
 
 
+def hash_file(path: str) -> str:
+    """return the sha-256 hash of a file."""
+    hasher = hashlib.sha256()
+    with open(path, "rb") as file:
+        while True:
+            chunk = file.read(8192)
+            if chunk:
+                hasher.update(chunk)
+            else:
+                break
+    return hasher.hexdigest()
+
+
 def get_uuid() -> str:
     return uuid.uuid4().hex

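`hash_file` streams the file in 8 KiB chunks, so even large files are hashed without being loaded into memory whole. For comparison, on Python 3.11+ the standard library can run the read loop itself; a hypothetical equivalent using `hashlib.file_digest`, not part of this PR:

    import hashlib

    def hash_file_alt(path: str) -> str:
        """Same result as hash_file, via hashlib.file_digest (Python >= 3.11)."""
        with open(path, "rb") as file:
            return hashlib.file_digest(file, "sha256").hexdigest()
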
@@ -47,15 +60,21 @@ async def chunked_add(
     semaphore: asyncio.Semaphore,
 ):
     full_path_str = str(expand_path(str(file_path), True))
+    orig_sha256 = None
+    new_sha256 = hash_file(full_path_str)
     async with collection_lock:
-        num_existing_chunks = len(
-            (
-                await collection.get(
-                    where={"path": full_path_str},
-                    include=[IncludeEnum.metadatas],
-                )
-            )["ids"]
-        )
+        existing_chunks = await collection.get(
+            where={"path": full_path_str},
+            include=[IncludeEnum.metadatas],
+        )
+        num_existing_chunks = len((existing_chunks)["ids"])
+        if existing_chunks["metadatas"]:
+            orig_sha256 = existing_chunks["metadatas"][0].get("sha256")
+    if orig_sha256 and orig_sha256 == new_sha256:
+        logger.debug(
+            f"Skipping {full_path_str} because it's unchanged since last vectorisation."
+        )
+        return
 
     if num_existing_chunks:
         logger.debug(
@@ -78,7 +97,10 @@
     logger.debug(f"Chunked into {len(chunks)} pieces.")
     metas = []
     for chunk in chunks:
-        meta: dict[str, str | int] = {"path": full_path_str}
+        meta: dict[str, str | int] = {
+            "path": full_path_str,
+            "sha256": new_sha256,
+        }
         if isinstance(chunk, Chunk):
             meta["start"] = chunk.start.row
             meta["end"] = chunk.end.row
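
Taken together, these changes give `chunked_add` an early exit: hash the file up front, read the `sha256` stored in the first existing chunk's metadata, and return before any chunking or embedding work when the two match. A condensed sketch of that control flow, with the ChromaDB collection replaced by a plain dict (illustrative names only, not the project's API):

    import hashlib

    stored_hashes: dict[str, str] = {}  # toy stand-in for per-chunk metadata

    def hash_file(path: str) -> str:
        hasher = hashlib.sha256()
        with open(path, "rb") as file:
            while chunk := file.read(8192):
                hasher.update(chunk)
        return hasher.hexdigest()

    def chunked_add(path: str) -> bool:
        """Return True if the file was (re)vectorised, False if skipped."""
        new_sha256 = hash_file(path)
        orig_sha256 = stored_hashes.get(path)
        if orig_sha256 and orig_sha256 == new_sha256:
            return False  # unchanged since last vectorisation
        # ...delete stale chunks, re-chunk, embed, then store the new hash...
        stored_hashes[path] = new_sha256
        return True
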
tests/subcommands/test_vectorise.py (94 changes: 86 additions & 8 deletions)
@@ -3,6 +3,7 @@
 import json
 import os
 import socket
+import tempfile
 from contextlib import ExitStack
 from unittest.mock import AsyncMock, MagicMock, mock_open, patch

@@ -17,6 +18,7 @@
     chunked_add,
     exclude_paths_by_spec,
     get_uuid,
+    hash_file,
     hash_str,
     include_paths_by_spec,
     load_files_from_include,
@@ -31,6 +33,36 @@ def test_hash_str():
     assert hash_str(test_string) == expected_hash
 
 
+def test_hash_file_basic():
+    content = b"This is a test file for hashing."
+    expected_hash = hashlib.sha256(content).hexdigest()
+
+    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
+        tmp_file.write(content)
+        tmp_file_path = tmp_file.name
+
+    try:
+        actual_hash = hash_file(tmp_file_path)
+        assert actual_hash == expected_hash
+    finally:
+        os.remove(tmp_file_path)
+
+
+def test_hash_file_empty():
+    content = b""
+    expected_hash = hashlib.sha256(content).hexdigest()
+
+    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
+        tmp_file.write(content)
+        tmp_file_path = tmp_file.name
+
+    try:
+        actual_hash = hash_file(tmp_file_path)
+        assert actual_hash == expected_hash
+    finally:
+        os.remove(tmp_file_path)
+
+
 def test_get_uuid():
     uuid_str = get_uuid()
     assert isinstance(uuid_str, str)
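
Both new tests create the temporary file with `delete=False` and remove it manually, presumably so `hash_file` can reopen the path after the handle is closed (reopening a still-open `NamedTemporaryFile` fails on Windows). An alternative sketch with the same coverage, leaning on pytest's built-in `tmp_path` fixture for cleanup; the test name is hypothetical:

    import hashlib

    from vectorcode.subcommands.vectorise import hash_file

    def test_hash_file_tmp_path(tmp_path):
        content = b"This is a test file for hashing."
        file = tmp_path / "sample.bin"  # pytest removes tmp_path afterwards
        file.write_bytes(content)
        assert hash_file(str(file)) == hashlib.sha256(content).hexdigest()
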
@@ -48,7 +80,11 @@ async def test_chunked_add():
     max_batch_size = 50
     semaphore = asyncio.Semaphore(1)
 
-    with patch("vectorcode.chunking.TreeSitterChunker.chunk") as mock_chunk:
+    with (
+        patch("vectorcode.chunking.TreeSitterChunker.chunk") as mock_chunk,
+        patch("vectorcode.subcommands.vectorise.hash_file") as mock_hash_file,
+    ):
+        mock_hash_file.return_value = "hash1"
         mock_chunk.return_value = [Chunk("chunk1", Point(1, 0), Point(1, 5)), "chunk2"]
         await chunked_add(
             file_path,
@@ -72,15 +108,54 @@ async def test_chunked_add_with_existing():
     file_path = "test_file.py"
     collection = AsyncMock()
     collection.get = AsyncMock()
-    collection.get.return_value = {"ids": ["id1"]}
+    collection.get.return_value = {"ids": ["id1"], "metadatas": [{"sha256": "hash1"}]}
     collection_lock = asyncio.Lock()
     stats = {"add": 0, "update": 0}
     stats_lock = asyncio.Lock()
     configs = Config(chunk_size=100, overlap_ratio=0.2, project_root=".")
     max_batch_size = 50
     semaphore = asyncio.Semaphore(1)
 
-    with patch("vectorcode.chunking.TreeSitterChunker.chunk") as mock_chunk:
+    with (
+        patch("vectorcode.chunking.TreeSitterChunker.chunk") as mock_chunk,
+        patch("vectorcode.subcommands.vectorise.hash_file") as mock_hash_file,
+    ):
+        mock_hash_file.return_value = "hash1"
+        mock_chunk.return_value = [Chunk("chunk1", Point(1, 0), Point(1, 5)), "chunk2"]
+        await chunked_add(
+            file_path,
+            collection,
+            collection_lock,
+            stats,
+            stats_lock,
+            configs,
+            max_batch_size,
+            semaphore,
+        )
+
+    assert stats["add"] == 0
+    assert stats["update"] == 0
+    collection.add.assert_not_called()
+
+
+@pytest.mark.asyncio
+async def test_chunked_add_update_existing():
+    file_path = "test_file.py"
+    collection = AsyncMock()
+    collection.get = AsyncMock()
+    collection.get.return_value = {"ids": ["id1"], "metadatas": [{"sha256": "hash1"}]}
+    collection_lock = asyncio.Lock()
+    stats = {"add": 0, "update": 0}
+    stats_lock = asyncio.Lock()
+    configs = Config(chunk_size=100, overlap_ratio=0.2, project_root=".")
+    max_batch_size = 50
+    semaphore = asyncio.Semaphore(1)
+
+    with (
+        patch("vectorcode.chunking.TreeSitterChunker.chunk") as mock_chunk,
+        patch("vectorcode.subcommands.vectorise.hash_file") as mock_hash_file,
+    ):
+        mock_hash_file.return_value = "hash2"
         mock_chunk.return_value = [Chunk("chunk1", Point(1, 0), Point(1, 5)), "chunk2"]
         await chunked_add(
             file_path,
@@ -96,9 +171,6 @@ async def test_chunked_add_with_existing():
     assert stats["add"] == 0
     assert stats["update"] == 1
     collection.add.assert_called()
-    assert collection.add.call_count == 1
-    assert collection.delete.assert_called
-    assert collection.delete.call_count == 1
 
 
 @pytest.mark.asyncio
@@ -112,7 +184,11 @@ async def test_chunked_add_empty_file():
     max_batch_size = 50
     semaphore = asyncio.Semaphore(1)
 
-    with patch("vectorcode.chunking.TreeSitterChunker.chunk") as mock_chunk:
+    with (
+        patch("vectorcode.chunking.TreeSitterChunker.chunk") as mock_chunk,
+        patch("vectorcode.subcommands.vectorise.hash_file") as mock_hash_file,
+    ):
+        mock_hash_file.return_value = "hash1"
         mock_chunk.return_value = []
         await chunked_add(
             file_path,
@@ -390,7 +466,7 @@ async def test_vectorise_orphaned_files():
         "metadatas": [{"path": "test_file.py"}, {"path": "non_existent_file.py"}]
     }
     mock_collection.get.side_effect = [
-        {"ids": []},  # Return value for chunked_add
+        {"ids": [], "metadatas": []},  # Return value for chunked_add
         get_return,  # Return value for orphaned files
     ]
     mock_collection.delete.return_value = None
@@ -428,7 +504,9 @@ def is_file_side_effect(path):
             "vectorcode.subcommands.vectorise.expand_globs",
             return_value=["test_file.py"],  # Ensure expand_globs returns a valid file
         ),
+        patch("vectorcode.subcommands.vectorise.hash_file") as mock_hash_file,
     ):
+        mock_hash_file.return_value = "hash1"
         result = await vectorise(configs)
 
     assert result == 0
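
The `{"ids": [], "metadatas": []}` fix in `test_vectorise_orphaned_files` is needed because `chunked_add` now reads `existing_chunks["metadatas"]` unconditionally, so a mocked `get` that returns only `"ids"` would raise a `KeyError`. The test leans on `side_effect` being a list: each await of the `AsyncMock` consumes the next payload in order, the first going to `chunked_add` and the second to the orphan sweep. A standalone illustration of that behaviour (not the project's test code):

    import asyncio
    from unittest.mock import AsyncMock

    async def main():
        get = AsyncMock()
        get.side_effect = [
            {"ids": [], "metadatas": []},  # first await: consumed by chunked_add
            {"ids": ["id1"]},              # second await: the orphan scan
        ]
        print(await get())  # {'ids': [], 'metadatas': []}
        print(await get())  # {'ids': ['id1']}

    asyncio.run(main())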