Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion doc/VectorCode-cli.txt
Original file line number Diff line number Diff line change
Expand Up @@ -682,7 +682,10 @@ VECTORCODE VECTORISE

The output is in JSON format. It contains a dictionary with the following
fields: - `"add"`number of added documents; - `"update"`number of updated
documents; - `"removed"`number of removed documents;
documents; - `"removed"`number of removed documents; - `"skipped"`number of
skipped documents (because they are empty or their hashes match the metadata
saved in the database); - `"failed"`number of documents that failed to be
vectorised. This is usually due to encoding issues.


VECTORCODE LS
Expand Down
4 changes: 4 additions & 0 deletions docs/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,10 @@ The output is in JSON format. It contains a dictionary with the following fields
- `"add"`: number of added documents;
- `"update"`: number of updated documents;
- `"removed"`: number of removed documents;
- `"skipped"`: number of skipped documents (because they are empty or their
hashes match the metadata saved in the database);
- `"failed"`: number of documents that failed to be vectorised. This is usually
due to encoding issues.

#### `vectorcode ls`
A JSON array of collection information of the following format will be printed:
Expand Down
7 changes: 4 additions & 3 deletions src/vectorcode/lsp_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import shtab

from vectorcode.subcommands.vectorise import (
VectoriseStats,
chunked_add,
exclude_paths_by_spec,
find_exclude_specs,
Expand Down Expand Up @@ -188,7 +189,7 @@ async def execute_command(ls: LanguageServer, args: list[str]):
if os.path.isfile(spec):
logger.info(f"Loading ignore specs from {spec}.")
files = exclude_paths_by_spec((str(i) for i in files), spec)
stats = {"add": 0, "update": 0, "removed": 0}
stats = VectoriseStats()
collection_lock = asyncio.Lock()
stats_lock = asyncio.Lock()
max_batch_size = await client.get_max_batch_size()
Expand Down Expand Up @@ -220,10 +221,10 @@ async def execute_command(ls: LanguageServer, args: list[str]):
ls.progress.end(
progress_token,
types.WorkDoneProgressEnd(
message=f"Vectorised {stats['add'] + stats['update']} files."
message=f"Vectorised {stats.add + stats.update} files."
),
)
return stats
return stats.to_dict()
case _ as c: # pragma: nocover
error_message = f"Unsupported vectorcode subcommand: {str(c)}"
logger.error(
Expand Down
4 changes: 2 additions & 2 deletions src/vectorcode/subcommands/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from vectorcode.cli_utils import Config
from vectorcode.common import get_client, get_collection, verify_ef
from vectorcode.subcommands.vectorise import chunked_add, show_stats
from vectorcode.subcommands.vectorise import VectoriseStats, chunked_add, show_stats

logger = logging.getLogger(name=__name__)

Expand Down Expand Up @@ -43,7 +43,7 @@ async def update(configs: Config) -> int:
else:
orphanes.add(file)

stats = {"add": 0, "update": 0, "removed": len(orphanes)}
stats = VectoriseStats(removed=len(orphanes))
collection_lock = Lock()
stats_lock = Lock()
max_batch_size = await client.get_max_batch_size()
Expand Down
53 changes: 37 additions & 16 deletions src/vectorcode/subcommands/vectorise.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import sys
import uuid
from asyncio import Lock
from dataclasses import dataclass, fields
from typing import Iterable, Optional

import pathspec
Expand All @@ -27,6 +28,31 @@
logger = logging.getLogger(name=__name__)


@dataclass
class VectoriseStats:
    """Counters describing the outcome of a vectorisation run.

    Every field is a number of files and starts at zero. The field names
    double as the keys of the machine-readable (JSON / dict) output, so they
    must stay stable; only the human-readable table uses friendlier titles.
    """

    # Files that had no chunks in the collection before and were inserted.
    add: int = 0
    # Files that already had chunks in the collection and were re-vectorised.
    update: int = 0
    # Files whose chunks were dropped from the collection (orphaned paths).
    removed: int = 0
    # Files left untouched because they were empty or their hash was unchanged.
    skipped: int = 0
    # Files that could not be vectorised (e.g. they failed to decode).
    failed: int = 0

    def to_json(self) -> str:
        """Return the counters serialised as a JSON object string."""
        return json.dumps(self.to_dict())

    def to_dict(self) -> dict[str, int]:
        """Return the counters as a ``{field name: count}`` dictionary.

        Keys appear in field declaration order.
        """
        return {i.name: getattr(self, i.name) for i in fields(self)}

    def to_table(self) -> str:
        """Return the counters rendered as a two-row table (titles, counts).

        Column titles are past-tense so the row reads consistently
        ("Added", "Updated", "Removed", "Skipped", "Failed"). Fields without
        an explicit title fall back to their capitalised name.
        """
        # ``add``/``update`` are kept as field names for the JSON output, but
        # a bare ``capitalize()`` would show them as "Add"/"Update" next to
        # "Removed"/"Skipped"/"Failed".
        titles = {"add": "Added", "update": "Updated"}
        _fields = fields(self)
        return tabulate.tabulate(
            [
                [titles.get(i.name, i.name.capitalize()) for i in _fields],
                [getattr(self, i.name) for i in _fields],
            ],
            headers="firstrow",
        )


def hash_str(string: str) -> str:
    """Compute the hex-encoded SHA-256 digest of ``string``."""
    hasher = hashlib.sha256()
    # ``str.encode()`` with no arguments encodes as UTF-8.
    hasher.update(string.encode())
    return hasher.hexdigest()
Expand All @@ -53,7 +79,7 @@ async def chunked_add(
file_path: str,
collection: AsyncCollection,
collection_lock: Lock,
stats: dict[str, int],
stats: VectoriseStats,
stats_lock: Lock,
configs: Config,
max_batch_size: int,
Expand All @@ -74,6 +100,7 @@ async def chunked_add(
logger.debug(
f"Skipping {full_path_str} because it's unchanged since last vectorisation."
)
stats.skipped += 1
return

if num_existing_chunks:
Expand All @@ -92,6 +119,7 @@ async def chunked_add(
if len(chunks) == 0 or (len(chunks) == 1 and chunks[0] == ""):
# empty file
logger.debug(f"Skipping {full_path_str} because it's empty.")
stats.skipped += 1
return
chunks.append(str(os.path.relpath(full_path_str, configs.project_root)))
logger.debug(f"Chunked into {len(chunks)} pieces.")
Expand All @@ -116,29 +144,22 @@ async def chunked_add(
)
except (UnicodeDecodeError, UnicodeError): # pragma: nocover
logger.warning(f"Failed to decode {full_path_str}.")
stats.failed += 1
return

if num_existing_chunks:
async with stats_lock:
stats["update"] += 1
stats.update += 1
else:
async with stats_lock:
stats["add"] += 1
stats.add += 1


def show_stats(configs: Config, stats):
def show_stats(configs: Config, stats: VectoriseStats):
if configs.pipe:
print(json.dumps(stats))
print(stats.to_json())
else:
print(
tabulate.tabulate(
[
["Added", "Updated", "Removed"],
[stats["add"], stats["update"], stats["removed"]],
],
headers="firstrow",
)
)
print(stats.to_table())


def exclude_paths_by_spec(
Expand Down Expand Up @@ -229,7 +250,7 @@ async def vectorise(configs: Config) -> int:
else: # pragma: nocover
logger.info("Ignoring exclude specs.")

stats = {"add": 0, "update": 0, "removed": 0}
stats = VectoriseStats()
collection_lock = Lock()
stats_lock = Lock()
max_batch_size = await client.get_max_batch_size()
Expand Down Expand Up @@ -270,7 +291,7 @@ async def vectorise(configs: Config) -> int:
if isinstance(path, str) and not os.path.isfile(path):
orphans.add(path)
async with stats_lock:
stats["removed"] = len(orphans)
stats.removed = len(orphans)
if len(orphans):
logger.info(f"Removing {len(orphans)} orphaned files from database.")
await collection.delete(where={"path": {"$in": list(orphans)}})
Expand Down
32 changes: 16 additions & 16 deletions tests/subcommands/test_vectorise.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import asyncio
import hashlib
import json
import os
import socket
import tempfile
Expand All @@ -15,6 +14,7 @@
from vectorcode.chunking import Chunk
from vectorcode.cli_utils import Config
from vectorcode.subcommands.vectorise import (
VectoriseStats,
chunked_add,
exclude_paths_by_spec,
get_uuid,
Expand Down Expand Up @@ -74,7 +74,7 @@ async def test_chunked_add():
file_path = "test_file.py"
collection = AsyncMock()
collection_lock = asyncio.Lock()
stats = {"add": 0, "update": 0}
stats = VectoriseStats()
stats_lock = asyncio.Lock()
configs = Config(chunk_size=100, overlap_ratio=0.2, project_root=".")
max_batch_size = 50
Expand All @@ -97,8 +97,8 @@ async def test_chunked_add():
semaphore,
)

assert stats["add"] == 1
assert stats["update"] == 0
assert stats.add == 1
assert stats.update == 0
collection.add.assert_called()
assert collection.add.call_count == 1

Expand All @@ -110,7 +110,7 @@ async def test_chunked_add_with_existing():
collection.get = AsyncMock()
collection.get.return_value = {"ids": ["id1"], "metadatas": [{"sha256": "hash1"}]}
collection_lock = asyncio.Lock()
stats = {"add": 0, "update": 0}
stats = VectoriseStats()
stats_lock = asyncio.Lock()
configs = Config(chunk_size=100, overlap_ratio=0.2, project_root=".")
max_batch_size = 50
Expand All @@ -133,8 +133,8 @@ async def test_chunked_add_with_existing():
semaphore,
)

assert stats["add"] == 0
assert stats["update"] == 0
assert stats.add == 0
assert stats.update == 0
collection.add.assert_not_called()


Expand All @@ -145,7 +145,7 @@ async def test_chunked_add_update_existing():
collection.get = AsyncMock()
collection.get.return_value = {"ids": ["id1"], "metadatas": [{"sha256": "hash1"}]}
collection_lock = asyncio.Lock()
stats = {"add": 0, "update": 0}
stats = VectoriseStats()
stats_lock = asyncio.Lock()
configs = Config(chunk_size=100, overlap_ratio=0.2, project_root=".")
max_batch_size = 50
Expand All @@ -168,8 +168,8 @@ async def test_chunked_add_update_existing():
semaphore,
)

assert stats["add"] == 0
assert stats["update"] == 1
assert stats.add == 0
assert stats.update == 1
collection.add.assert_called()


Expand All @@ -178,7 +178,7 @@ async def test_chunked_add_empty_file():
file_path = "test_file.py"
collection = AsyncMock()
collection_lock = asyncio.Lock()
stats = {"add": 0, "update": 0}
stats = VectoriseStats(**{"add": 0, "update": 0})
stats_lock = asyncio.Lock()
configs = Config(chunk_size=100, overlap_ratio=0.2, project_root=".")
max_batch_size = 50
Expand All @@ -201,25 +201,25 @@ async def test_chunked_add_empty_file():
semaphore,
)

assert stats["add"] == 0
assert stats["update"] == 0
assert stats.add == 0
assert stats.update == 0
assert collection.add.call_count == 0


@patch("tabulate.tabulate")
def test_show_stats_pipe_false(mock_tabulate, capsys):
configs = Config(pipe=False)
stats = {"add": 1, "update": 2, "removed": 3}
stats = VectoriseStats(**{"add": 1, "update": 2, "removed": 3})
show_stats(configs, stats)
mock_tabulate.assert_called_once()


def test_show_stats_pipe_true(capsys):
configs = Config(pipe=True)
stats = {"add": 1, "update": 2, "removed": 3}
stats = VectoriseStats(**{"add": 1, "update": 2, "removed": 3})
show_stats(configs, stats)
captured = capsys.readouterr()
assert captured.out == json.dumps(stats) + "\n"
assert captured.out.strip() == (stats.to_json())


def test_exclude_paths_by_spec():
Expand Down
5 changes: 4 additions & 1 deletion tests/test_lsp.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,10 @@ async def test_execute_command_vectorise(mock_language_server, mock_config: Conf
result = await execute_command(
mock_language_server, ["vectorise", "/test/project"]
)
assert isinstance(result, dict)
assert isinstance(result, dict) and all(
k in ("add", "update", "removed", "failed", "skipped")
for k in result.keys()
)

# Assertions
mock_language_server.progress.create_async.assert_called_once()
Expand Down