From 4685ae7961c554903e6796e88d9912e8d2d6ef8d Mon Sep 17 00:00:00 2001 From: Zhe Yu Date: Tue, 17 Jun 2025 17:11:33 +0800 Subject: [PATCH 1/3] feat(cli): Return more detailed vectorise stats in CLI and LSP server --- docs/cli.md | 4 ++ src/vectorcode/lsp_main.py | 5 ++- src/vectorcode/subcommands/update.py | 4 +- src/vectorcode/subcommands/vectorise.py | 50 +++++++++++++++++-------- tests/subcommands/test_vectorise.py | 32 ++++++++-------- tests/test_lsp.py | 3 +- 6 files changed, 61 insertions(+), 37 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index 71397191..49e7e1ba 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -616,6 +616,10 @@ The output is in JSON format. It contains a dictionary with the following fields - `"add"`: number of added documents; - `"update"`: number of updated documents; - `"removed"`: number of removed documents; +- `"skipped"`: number of skipped documents (because it's empty or its hash + matches the metadata saved in the database); +- `"failed"`: number of documents that failed to be vectorised. This is usually + due to encoding issues. #### `vectorcode ls` A JSON array of collection information of the following format will be printed: diff --git a/src/vectorcode/lsp_main.py b/src/vectorcode/lsp_main.py index d376e96e..58a966b9 100644 --- a/src/vectorcode/lsp_main.py +++ b/src/vectorcode/lsp_main.py @@ -10,6 +10,7 @@ import shtab from vectorcode.subcommands.vectorise import ( + VectoriseStats, chunked_add, exclude_paths_by_spec, find_exclude_specs, @@ -188,7 +189,7 @@ async def execute_command(ls: LanguageServer, args: list[str]): if os.path.isfile(spec): logger.info(f"Loading ignore specs from {spec}.") files = exclude_paths_by_spec((str(i) for i in files), spec) - stats = {"add": 0, "update": 0, "removed": 0} + stats = VectoriseStats() collection_lock = asyncio.Lock() stats_lock = asyncio.Lock() max_batch_size = await client.get_max_batch_size() @@ -220,7 +221,7 @@ async def execute_command(ls: LanguageServer, args: list[str]): ls.progress.end( progress_token, types.WorkDoneProgressEnd( - message=f"Vectorised {stats['add'] + stats['update']} files." + message=f"Vectorised {stats.add + stats.update} files." 
), ) return stats diff --git a/src/vectorcode/subcommands/update.py b/src/vectorcode/subcommands/update.py index 901c605f..41578eeb 100644 --- a/src/vectorcode/subcommands/update.py +++ b/src/vectorcode/subcommands/update.py @@ -10,7 +10,7 @@ from vectorcode.cli_utils import Config from vectorcode.common import get_client, get_collection, verify_ef -from vectorcode.subcommands.vectorise import chunked_add, show_stats +from vectorcode.subcommands.vectorise import VectoriseStats, chunked_add, show_stats logger = logging.getLogger(name=__name__) @@ -43,7 +43,7 @@ async def update(configs: Config) -> int: else: orphanes.add(file) - stats = {"add": 0, "update": 0, "removed": len(orphanes)} + stats = VectoriseStats(removed=len(orphanes)) collection_lock = Lock() stats_lock = Lock() max_batch_size = await client.get_max_batch_size() diff --git a/src/vectorcode/subcommands/vectorise.py b/src/vectorcode/subcommands/vectorise.py index 8efc91a4..0e4f0469 100644 --- a/src/vectorcode/subcommands/vectorise.py +++ b/src/vectorcode/subcommands/vectorise.py @@ -6,6 +6,7 @@ import sys import uuid from asyncio import Lock +from dataclasses import dataclass, fields from typing import Iterable, Optional import pathspec @@ -27,6 +28,28 @@ logger = logging.getLogger(name=__name__) +@dataclass +class VectoriseStats: + add: int = 0 + update: int = 0 + removed: int = 0 + skipped: int = 0 + failed: int = 0 + + def to_json(self) -> str: + return json.dumps({i.name: getattr(self, i.name) for i in fields(self)}) + + def to_table(self) -> str: + _fields = fields(self) + return tabulate.tabulate( + [ + [i.name.capitalize() for i in _fields], + [getattr(self, i.name) for i in _fields], + ], + headers="firstrow", + ) + + def hash_str(string: str) -> str: """Return the sha-256 hash of a string.""" return hashlib.sha256(string.encode()).hexdigest() @@ -53,7 +76,7 @@ async def chunked_add( file_path: str, collection: AsyncCollection, collection_lock: Lock, - stats: dict[str, int], + stats: VectoriseStats, stats_lock: Lock, configs: Config, max_batch_size: int, @@ -74,6 +97,7 @@ async def chunked_add( logger.debug( f"Skipping {full_path_str} because it's unchanged since last vectorisation." 
) + stats.skipped += 1 return if num_existing_chunks: @@ -92,6 +116,7 @@ async def chunked_add( if len(chunks) == 0 or (len(chunks) == 1 and chunks[0] == ""): # empty file logger.debug(f"Skipping {full_path_str} because it's empty.") + stats.skipped += 1 return chunks.append(str(os.path.relpath(full_path_str, configs.project_root))) logger.debug(f"Chunked into {len(chunks)} pieces.") @@ -116,29 +141,22 @@ async def chunked_add( ) except (UnicodeDecodeError, UnicodeError): # pragma: nocover logger.warning(f"Failed to decode {full_path_str}.") + stats.failed += 1 return if num_existing_chunks: async with stats_lock: - stats["update"] += 1 + stats.update += 1 else: async with stats_lock: - stats["add"] += 1 + stats.add += 1 -def show_stats(configs: Config, stats): +def show_stats(configs: Config, stats: VectoriseStats): if configs.pipe: - print(json.dumps(stats)) + print(stats.to_json()) else: - print( - tabulate.tabulate( - [ - ["Added", "Updated", "Removed"], - [stats["add"], stats["update"], stats["removed"]], - ], - headers="firstrow", - ) - ) + print(stats.to_table()) def exclude_paths_by_spec( @@ -229,7 +247,7 @@ async def vectorise(configs: Config) -> int: else: # pragma: nocover logger.info("Ignoring exclude specs.") - stats = {"add": 0, "update": 0, "removed": 0} + stats = VectoriseStats() collection_lock = Lock() stats_lock = Lock() max_batch_size = await client.get_max_batch_size() @@ -270,7 +288,7 @@ async def vectorise(configs: Config) -> int: if isinstance(path, str) and not os.path.isfile(path): orphans.add(path) async with stats_lock: - stats["removed"] = len(orphans) + stats.removed = len(orphans) if len(orphans): logger.info(f"Removing {len(orphans)} orphaned files from database.") await collection.delete(where={"path": {"$in": list(orphans)}}) diff --git a/tests/subcommands/test_vectorise.py b/tests/subcommands/test_vectorise.py index 37b2b109..2f363a8b 100644 --- a/tests/subcommands/test_vectorise.py +++ b/tests/subcommands/test_vectorise.py @@ -1,6 +1,5 @@ import asyncio import hashlib -import json import os import socket import tempfile @@ -15,6 +14,7 @@ from vectorcode.chunking import Chunk from vectorcode.cli_utils import Config from vectorcode.subcommands.vectorise import ( + VectoriseStats, chunked_add, exclude_paths_by_spec, get_uuid, @@ -74,7 +74,7 @@ async def test_chunked_add(): file_path = "test_file.py" collection = AsyncMock() collection_lock = asyncio.Lock() - stats = {"add": 0, "update": 0} + stats = VectoriseStats() stats_lock = asyncio.Lock() configs = Config(chunk_size=100, overlap_ratio=0.2, project_root=".") max_batch_size = 50 @@ -97,8 +97,8 @@ async def test_chunked_add(): semaphore, ) - assert stats["add"] == 1 - assert stats["update"] == 0 + assert stats.add == 1 + assert stats.update == 0 collection.add.assert_called() assert collection.add.call_count == 1 @@ -110,7 +110,7 @@ async def test_chunked_add_with_existing(): collection.get = AsyncMock() collection.get.return_value = {"ids": ["id1"], "metadatas": [{"sha256": "hash1"}]} collection_lock = asyncio.Lock() - stats = {"add": 0, "update": 0} + stats = VectoriseStats() stats_lock = asyncio.Lock() configs = Config(chunk_size=100, overlap_ratio=0.2, project_root=".") max_batch_size = 50 @@ -133,8 +133,8 @@ async def test_chunked_add_with_existing(): semaphore, ) - assert stats["add"] == 0 - assert stats["update"] == 0 + assert stats.add == 0 + assert stats.update == 0 collection.add.assert_not_called() @@ -145,7 +145,7 @@ async def test_chunked_add_update_existing(): collection.get = AsyncMock() 
collection.get.return_value = {"ids": ["id1"], "metadatas": [{"sha256": "hash1"}]} collection_lock = asyncio.Lock() - stats = {"add": 0, "update": 0} + stats = VectoriseStats() stats_lock = asyncio.Lock() configs = Config(chunk_size=100, overlap_ratio=0.2, project_root=".") max_batch_size = 50 @@ -168,8 +168,8 @@ async def test_chunked_add_update_existing(): semaphore, ) - assert stats["add"] == 0 - assert stats["update"] == 1 + assert stats.add == 0 + assert stats.update == 1 collection.add.assert_called() @@ -178,7 +178,7 @@ async def test_chunked_add_empty_file(): file_path = "test_file.py" collection = AsyncMock() collection_lock = asyncio.Lock() - stats = {"add": 0, "update": 0} + stats = VectoriseStats(**{"add": 0, "update": 0}) stats_lock = asyncio.Lock() configs = Config(chunk_size=100, overlap_ratio=0.2, project_root=".") max_batch_size = 50 @@ -201,25 +201,25 @@ async def test_chunked_add_empty_file(): semaphore, ) - assert stats["add"] == 0 - assert stats["update"] == 0 + assert stats.add == 0 + assert stats.update == 0 assert collection.add.call_count == 0 @patch("tabulate.tabulate") def test_show_stats_pipe_false(mock_tabulate, capsys): configs = Config(pipe=False) - stats = {"add": 1, "update": 2, "removed": 3} + stats = VectoriseStats(**{"add": 1, "update": 2, "removed": 3}) show_stats(configs, stats) mock_tabulate.assert_called_once() def test_show_stats_pipe_true(capsys): configs = Config(pipe=True) - stats = {"add": 1, "update": 2, "removed": 3} + stats = VectoriseStats(**{"add": 1, "update": 2, "removed": 3}) show_stats(configs, stats) captured = capsys.readouterr() - assert captured.out == json.dumps(stats) + "\n" + assert captured.out.strip() == (stats.to_json()) def test_exclude_paths_by_spec(): diff --git a/tests/test_lsp.py b/tests/test_lsp.py index d5036f99..ad50572b 100644 --- a/tests/test_lsp.py +++ b/tests/test_lsp.py @@ -11,6 +11,7 @@ lsp_start, make_caches, ) +from vectorcode.subcommands.vectorise import VectoriseStats @pytest.fixture @@ -294,7 +295,7 @@ async def test_execute_command_vectorise(mock_language_server, mock_config: Conf result = await execute_command( mock_language_server, ["vectorise", "/test/project"] ) - assert isinstance(result, dict) + assert isinstance(result, VectoriseStats) # Assertions mock_language_server.progress.create_async.assert_called_once() From e928d02ed1934696689226c14a955ef8111e1ddb Mon Sep 17 00:00:00 2001 From: Davidyz Date: Tue, 17 Jun 2025 09:12:32 +0000 Subject: [PATCH 2/3] Auto generate docs --- doc/VectorCode-cli.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/VectorCode-cli.txt b/doc/VectorCode-cli.txt index 5947fca0..c56228db 100644 --- a/doc/VectorCode-cli.txt +++ b/doc/VectorCode-cli.txt @@ -682,7 +682,10 @@ VECTORCODE VECTORISE The output is in JSON format. It contains a dictionary with the following fields: - `"add"`number of added documents; - `"update"`number of updated -documents; - `"removed"`number of removed documents; +documents; - `"removed"`number of removed documents; - `"skipped"`number of +skipped documents (because it’s empty or its hash matches the metadata saved +in the database); - `"failed"`number of documents that failed to be vectorised. +This is usually due to encoding issues. 
VECTORCODE LS From e517972fbba411cc714b72dd3e98354f92f0efe1 Mon Sep 17 00:00:00 2001 From: Zhe Yu Date: Tue, 17 Jun 2025 17:24:14 +0800 Subject: [PATCH 3/3] refactor(cli): Return dict instead of VectoriseStats in LSP server --- src/vectorcode/lsp_main.py | 2 +- src/vectorcode/subcommands/vectorise.py | 5 ++++- tests/test_lsp.py | 6 ++++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/vectorcode/lsp_main.py b/src/vectorcode/lsp_main.py index 58a966b9..45891d00 100644 --- a/src/vectorcode/lsp_main.py +++ b/src/vectorcode/lsp_main.py @@ -224,7 +224,7 @@ async def execute_command(ls: LanguageServer, args: list[str]): message=f"Vectorised {stats.add + stats.update} files." ), ) - return stats + return stats.to_dict() case _ as c: # pragma: nocover error_message = f"Unsupported vectorcode subcommand: {str(c)}" logger.error( diff --git a/src/vectorcode/subcommands/vectorise.py b/src/vectorcode/subcommands/vectorise.py index 0e4f0469..ebaa2f6c 100644 --- a/src/vectorcode/subcommands/vectorise.py +++ b/src/vectorcode/subcommands/vectorise.py @@ -37,7 +37,10 @@ class VectoriseStats: failed: int = 0 def to_json(self) -> str: - return json.dumps({i.name: getattr(self, i.name) for i in fields(self)}) + return json.dumps(self.to_dict()) + + def to_dict(self) -> dict[str, int]: + return {i.name: getattr(self, i.name) for i in fields(self)} def to_table(self) -> str: _fields = fields(self) diff --git a/tests/test_lsp.py b/tests/test_lsp.py index ad50572b..530c2316 100644 --- a/tests/test_lsp.py +++ b/tests/test_lsp.py @@ -11,7 +11,6 @@ lsp_start, make_caches, ) -from vectorcode.subcommands.vectorise import VectoriseStats @pytest.fixture @@ -295,7 +294,10 @@ async def test_execute_command_vectorise(mock_language_server, mock_config: Conf result = await execute_command( mock_language_server, ["vectorise", "/test/project"] ) - assert isinstance(result, VectoriseStats) + assert isinstance(result, dict) and all( + k in ("add", "update", "removed", "failed", "skipped") + for k in result.keys() + ) # Assertions mock_language_server.progress.create_async.assert_called_once()
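
Note (editorial illustration, not part of the patch series): the sketch below shows the shape of the stats payload that `vectorcode vectorise --pipe` prints after this change. The dataclass mirrors the `VectoriseStats` added in vectorise.py (including the `to_dict()` from patch 3); the field values are made up for the example.

# minimal standalone sketch, assuming Python 3.9+
import json
from dataclasses import dataclass, fields


@dataclass
class VectoriseStats:
    add: int = 0
    update: int = 0
    removed: int = 0
    skipped: int = 0
    failed: int = 0

    def to_dict(self) -> dict[str, int]:
        # field order follows the declaration order above
        return {f.name: getattr(self, f.name) for f in fields(self)}

    def to_json(self) -> str:
        return json.dumps(self.to_dict())


# hypothetical run: 3 new files, 1 re-vectorised, 2 skipped as unchanged/empty
stats = VectoriseStats(add=3, update=1, skipped=2)
print(stats.to_json())
# {"add": 3, "update": 1, "removed": 0, "skipped": 2, "failed": 0}

Because the JSON keys are derived from the dataclass fields, any future counter added to `VectoriseStats` will show up in both the `--pipe` output and the tabulated summary without further changes to `show_stats`.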