From 4685ae7961c554903e6796e88d9912e8d2d6ef8d Mon Sep 17 00:00:00 2001 From: Zhe Yu Date: Tue, 17 Jun 2025 17:11:33 +0800 Subject: [PATCH 1/3] feat(cli): Return more detailed vectorise stats in CLI and LSP server --- docs/cli.md | 4 ++ src/vectorcode/lsp_main.py | 5 ++- src/vectorcode/subcommands/update.py | 4 +- src/vectorcode/subcommands/vectorise.py | 50 +++++++++++++++++-------- tests/subcommands/test_vectorise.py | 32 ++++++++-------- tests/test_lsp.py | 3 +- 6 files changed, 61 insertions(+), 37 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index 71397191..49e7e1ba 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -616,6 +616,10 @@ The output is in JSON format. It contains a dictionary with the following fields - `"add"`: number of added documents; - `"update"`: number of updated documents; - `"removed"`: number of removed documents; +- `"skipped"`: number of skipped documents (because it's empty or its hash + matches the metadata saved in the database); +- `"failed"`: number of documents that failed to be vectorised. This is usually + due to encoding issues. #### `vectorcode ls` A JSON array of collection information of the following format will be printed: diff --git a/src/vectorcode/lsp_main.py b/src/vectorcode/lsp_main.py index d376e96e..58a966b9 100644 --- a/src/vectorcode/lsp_main.py +++ b/src/vectorcode/lsp_main.py @@ -10,6 +10,7 @@ import shtab from vectorcode.subcommands.vectorise import ( + VectoriseStats, chunked_add, exclude_paths_by_spec, find_exclude_specs, @@ -188,7 +189,7 @@ async def execute_command(ls: LanguageServer, args: list[str]): if os.path.isfile(spec): logger.info(f"Loading ignore specs from {spec}.") files = exclude_paths_by_spec((str(i) for i in files), spec) - stats = {"add": 0, "update": 0, "removed": 0} + stats = VectoriseStats() collection_lock = asyncio.Lock() stats_lock = asyncio.Lock() max_batch_size = await client.get_max_batch_size() @@ -220,7 +221,7 @@ async def execute_command(ls: LanguageServer, args: list[str]): ls.progress.end( progress_token, types.WorkDoneProgressEnd( - message=f"Vectorised {stats['add'] + stats['update']} files." + message=f"Vectorised {stats.add + stats.update} files." 
), ) return stats diff --git a/src/vectorcode/subcommands/update.py b/src/vectorcode/subcommands/update.py index 901c605f..41578eeb 100644 --- a/src/vectorcode/subcommands/update.py +++ b/src/vectorcode/subcommands/update.py @@ -10,7 +10,7 @@ from vectorcode.cli_utils import Config from vectorcode.common import get_client, get_collection, verify_ef -from vectorcode.subcommands.vectorise import chunked_add, show_stats +from vectorcode.subcommands.vectorise import VectoriseStats, chunked_add, show_stats logger = logging.getLogger(name=__name__) @@ -43,7 +43,7 @@ async def update(configs: Config) -> int: else: orphanes.add(file) - stats = {"add": 0, "update": 0, "removed": len(orphanes)} + stats = VectoriseStats(removed=len(orphanes)) collection_lock = Lock() stats_lock = Lock() max_batch_size = await client.get_max_batch_size() diff --git a/src/vectorcode/subcommands/vectorise.py b/src/vectorcode/subcommands/vectorise.py index 8efc91a4..0e4f0469 100644 --- a/src/vectorcode/subcommands/vectorise.py +++ b/src/vectorcode/subcommands/vectorise.py @@ -6,6 +6,7 @@ import sys import uuid from asyncio import Lock +from dataclasses import dataclass, fields from typing import Iterable, Optional import pathspec @@ -27,6 +28,28 @@ logger = logging.getLogger(name=__name__) +@dataclass +class VectoriseStats: + add: int = 0 + update: int = 0 + removed: int = 0 + skipped: int = 0 + failed: int = 0 + + def to_json(self) -> str: + return json.dumps({i.name: getattr(self, i.name) for i in fields(self)}) + + def to_table(self) -> str: + _fields = fields(self) + return tabulate.tabulate( + [ + [i.name.capitalize() for i in _fields], + [getattr(self, i.name) for i in _fields], + ], + headers="firstrow", + ) + + def hash_str(string: str) -> str: """Return the sha-256 hash of a string.""" return hashlib.sha256(string.encode()).hexdigest() @@ -53,7 +76,7 @@ async def chunked_add( file_path: str, collection: AsyncCollection, collection_lock: Lock, - stats: dict[str, int], + stats: VectoriseStats, stats_lock: Lock, configs: Config, max_batch_size: int, @@ -74,6 +97,7 @@ async def chunked_add( logger.debug( f"Skipping {full_path_str} because it's unchanged since last vectorisation." 
) + stats.skipped += 1 return if num_existing_chunks: @@ -92,6 +116,7 @@ async def chunked_add( if len(chunks) == 0 or (len(chunks) == 1 and chunks[0] == ""): # empty file logger.debug(f"Skipping {full_path_str} because it's empty.") + stats.skipped += 1 return chunks.append(str(os.path.relpath(full_path_str, configs.project_root))) logger.debug(f"Chunked into {len(chunks)} pieces.") @@ -116,29 +141,22 @@ async def chunked_add( ) except (UnicodeDecodeError, UnicodeError): # pragma: nocover logger.warning(f"Failed to decode {full_path_str}.") + stats.failed += 1 return if num_existing_chunks: async with stats_lock: - stats["update"] += 1 + stats.update += 1 else: async with stats_lock: - stats["add"] += 1 + stats.add += 1 -def show_stats(configs: Config, stats): +def show_stats(configs: Config, stats: VectoriseStats): if configs.pipe: - print(json.dumps(stats)) + print(stats.to_json()) else: - print( - tabulate.tabulate( - [ - ["Added", "Updated", "Removed"], - [stats["add"], stats["update"], stats["removed"]], - ], - headers="firstrow", - ) - ) + print(stats.to_table()) def exclude_paths_by_spec( @@ -229,7 +247,7 @@ async def vectorise(configs: Config) -> int: else: # pragma: nocover logger.info("Ignoring exclude specs.") - stats = {"add": 0, "update": 0, "removed": 0} + stats = VectoriseStats() collection_lock = Lock() stats_lock = Lock() max_batch_size = await client.get_max_batch_size() @@ -270,7 +288,7 @@ async def vectorise(configs: Config) -> int: if isinstance(path, str) and not os.path.isfile(path): orphans.add(path) async with stats_lock: - stats["removed"] = len(orphans) + stats.removed = len(orphans) if len(orphans): logger.info(f"Removing {len(orphans)} orphaned files from database.") await collection.delete(where={"path": {"$in": list(orphans)}}) diff --git a/tests/subcommands/test_vectorise.py b/tests/subcommands/test_vectorise.py index 37b2b109..2f363a8b 100644 --- a/tests/subcommands/test_vectorise.py +++ b/tests/subcommands/test_vectorise.py @@ -1,6 +1,5 @@ import asyncio import hashlib -import json import os import socket import tempfile @@ -15,6 +14,7 @@ from vectorcode.chunking import Chunk from vectorcode.cli_utils import Config from vectorcode.subcommands.vectorise import ( + VectoriseStats, chunked_add, exclude_paths_by_spec, get_uuid, @@ -74,7 +74,7 @@ async def test_chunked_add(): file_path = "test_file.py" collection = AsyncMock() collection_lock = asyncio.Lock() - stats = {"add": 0, "update": 0} + stats = VectoriseStats() stats_lock = asyncio.Lock() configs = Config(chunk_size=100, overlap_ratio=0.2, project_root=".") max_batch_size = 50 @@ -97,8 +97,8 @@ async def test_chunked_add(): semaphore, ) - assert stats["add"] == 1 - assert stats["update"] == 0 + assert stats.add == 1 + assert stats.update == 0 collection.add.assert_called() assert collection.add.call_count == 1 @@ -110,7 +110,7 @@ async def test_chunked_add_with_existing(): collection.get = AsyncMock() collection.get.return_value = {"ids": ["id1"], "metadatas": [{"sha256": "hash1"}]} collection_lock = asyncio.Lock() - stats = {"add": 0, "update": 0} + stats = VectoriseStats() stats_lock = asyncio.Lock() configs = Config(chunk_size=100, overlap_ratio=0.2, project_root=".") max_batch_size = 50 @@ -133,8 +133,8 @@ async def test_chunked_add_with_existing(): semaphore, ) - assert stats["add"] == 0 - assert stats["update"] == 0 + assert stats.add == 0 + assert stats.update == 0 collection.add.assert_not_called() @@ -145,7 +145,7 @@ async def test_chunked_add_update_existing(): collection.get = AsyncMock() 
collection.get.return_value = {"ids": ["id1"], "metadatas": [{"sha256": "hash1"}]} collection_lock = asyncio.Lock() - stats = {"add": 0, "update": 0} + stats = VectoriseStats() stats_lock = asyncio.Lock() configs = Config(chunk_size=100, overlap_ratio=0.2, project_root=".") max_batch_size = 50 @@ -168,8 +168,8 @@ async def test_chunked_add_update_existing(): semaphore, ) - assert stats["add"] == 0 - assert stats["update"] == 1 + assert stats.add == 0 + assert stats.update == 1 collection.add.assert_called() @@ -178,7 +178,7 @@ async def test_chunked_add_empty_file(): file_path = "test_file.py" collection = AsyncMock() collection_lock = asyncio.Lock() - stats = {"add": 0, "update": 0} + stats = VectoriseStats(**{"add": 0, "update": 0}) stats_lock = asyncio.Lock() configs = Config(chunk_size=100, overlap_ratio=0.2, project_root=".") max_batch_size = 50 @@ -201,25 +201,25 @@ async def test_chunked_add_empty_file(): semaphore, ) - assert stats["add"] == 0 - assert stats["update"] == 0 + assert stats.add == 0 + assert stats.update == 0 assert collection.add.call_count == 0 @patch("tabulate.tabulate") def test_show_stats_pipe_false(mock_tabulate, capsys): configs = Config(pipe=False) - stats = {"add": 1, "update": 2, "removed": 3} + stats = VectoriseStats(**{"add": 1, "update": 2, "removed": 3}) show_stats(configs, stats) mock_tabulate.assert_called_once() def test_show_stats_pipe_true(capsys): configs = Config(pipe=True) - stats = {"add": 1, "update": 2, "removed": 3} + stats = VectoriseStats(**{"add": 1, "update": 2, "removed": 3}) show_stats(configs, stats) captured = capsys.readouterr() - assert captured.out == json.dumps(stats) + "\n" + assert captured.out.strip() == (stats.to_json()) def test_exclude_paths_by_spec(): diff --git a/tests/test_lsp.py b/tests/test_lsp.py index d5036f99..ad50572b 100644 --- a/tests/test_lsp.py +++ b/tests/test_lsp.py @@ -11,6 +11,7 @@ lsp_start, make_caches, ) +from vectorcode.subcommands.vectorise import VectoriseStats @pytest.fixture @@ -294,7 +295,7 @@ async def test_execute_command_vectorise(mock_language_server, mock_config: Conf result = await execute_command( mock_language_server, ["vectorise", "/test/project"] ) - assert isinstance(result, dict) + assert isinstance(result, VectoriseStats) # Assertions mock_language_server.progress.create_async.assert_called_once() From e928d02ed1934696689226c14a955ef8111e1ddb Mon Sep 17 00:00:00 2001 From: Davidyz Date: Tue, 17 Jun 2025 09:12:32 +0000 Subject: [PATCH 2/3] Auto generate docs --- doc/VectorCode-cli.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/VectorCode-cli.txt b/doc/VectorCode-cli.txt index 5947fca0..c56228db 100644 --- a/doc/VectorCode-cli.txt +++ b/doc/VectorCode-cli.txt @@ -682,7 +682,10 @@ VECTORCODE VECTORISE The output is in JSON format. It contains a dictionary with the following fields: - `"add"`number of added documents; - `"update"`number of updated -documents; - `"removed"`number of removed documents; +documents; - `"removed"`number of removed documents; - `"skipped"`number of +skipped documents (because it’s empty or its hash matches the metadata saved +in the database); - `"failed"`number of documents that failed to be vectorised. +This is usually due to encoding issues. 
VECTORCODE LS From e517972fbba411cc714b72dd3e98354f92f0efe1 Mon Sep 17 00:00:00 2001 From: Zhe Yu Date: Tue, 17 Jun 2025 17:24:14 +0800 Subject: [PATCH 3/3] refactor(cli): Return dict instead of VectoriseStats in LSP server --- src/vectorcode/lsp_main.py | 2 +- src/vectorcode/subcommands/vectorise.py | 5 ++++- tests/test_lsp.py | 6 ++++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/vectorcode/lsp_main.py b/src/vectorcode/lsp_main.py index 58a966b9..45891d00 100644 --- a/src/vectorcode/lsp_main.py +++ b/src/vectorcode/lsp_main.py @@ -224,7 +224,7 @@ async def execute_command(ls: LanguageServer, args: list[str]): message=f"Vectorised {stats.add + stats.update} files." ), ) - return stats + return stats.to_dict() case _ as c: # pragma: nocover error_message = f"Unsupported vectorcode subcommand: {str(c)}" logger.error( diff --git a/src/vectorcode/subcommands/vectorise.py b/src/vectorcode/subcommands/vectorise.py index 0e4f0469..ebaa2f6c 100644 --- a/src/vectorcode/subcommands/vectorise.py +++ b/src/vectorcode/subcommands/vectorise.py @@ -37,7 +37,10 @@ class VectoriseStats: failed: int = 0 def to_json(self) -> str: - return json.dumps({i.name: getattr(self, i.name) for i in fields(self)}) + return json.dumps(self.to_dict()) + + def to_dict(self) -> dict[str, int]: + return {i.name: getattr(self, i.name) for i in fields(self)} def to_table(self) -> str: _fields = fields(self) diff --git a/tests/test_lsp.py b/tests/test_lsp.py index ad50572b..530c2316 100644 --- a/tests/test_lsp.py +++ b/tests/test_lsp.py @@ -11,7 +11,6 @@ lsp_start, make_caches, ) -from vectorcode.subcommands.vectorise import VectoriseStats @pytest.fixture @@ -295,7 +294,10 @@ async def test_execute_command_vectorise(mock_language_server, mock_config: Conf result = await execute_command( mock_language_server, ["vectorise", "/test/project"] ) - assert isinstance(result, VectoriseStats) + assert isinstance(result, dict) and all( + k in ("add", "update", "removed", "failed", "skipped") + for k in result.keys() + ) # Assertions mock_language_server.progress.create_async.assert_called_once()
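
Note (editorial illustration, not part of the patch series): the sketch below shows the shape of the stats payload that `vectorcode vectorise --pipe` prints after this change. The dataclass mirrors the `VectoriseStats` added in vectorise.py (including the `to_dict()` from patch 3); the field values are made up for the example.

# minimal standalone sketch, assuming Python 3.9+
import json
from dataclasses import dataclass, fields


@dataclass
class VectoriseStats:
    add: int = 0
    update: int = 0
    removed: int = 0
    skipped: int = 0
    failed: int = 0

    def to_dict(self) -> dict[str, int]:
        # field order follows the declaration order above
        return {f.name: getattr(self, f.name) for f in fields(self)}

    def to_json(self) -> str:
        return json.dumps(self.to_dict())


# hypothetical run: 3 new files, 1 re-vectorised, 2 skipped as unchanged/empty
stats = VectoriseStats(add=3, update=1, skipped=2)
print(stats.to_json())
# {"add": 3, "update": 1, "removed": 0, "skipped": 2, "failed": 0}

Because the JSON keys are derived from the dataclass fields, any future counter added to `VectoriseStats` will show up in both the `--pipe` output and the tabulated summary without further changes to `show_stats`.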