diff --git a/src/vectorcode/common.py b/src/vectorcode/common.py
index f4fff1a6..65ce495e 100644
--- a/src/vectorcode/common.py
+++ b/src/vectorcode/common.py
@@ -12,6 +12,7 @@
 import httpx
 from chromadb.api import AsyncClientAPI
 from chromadb.api.models.AsyncCollection import AsyncCollection
+from chromadb.api.types import IncludeEnum
 from chromadb.config import APIVersion, Settings
 from chromadb.utils import embedding_functions
 
@@ -248,3 +249,15 @@ def verify_ef(collection: AsyncCollection, configs: Config):
             f"The collection was embedded with a different set of configurations: {collection_ep}. The result may be inaccurate.",
         )
     return True
+
+
+async def list_collection_files(collection: AsyncCollection) -> list[str]:
+    return list(
+        set(
+            str(c.get("path", None))
+            for c in (await collection.get(include=[IncludeEnum.metadatas])).get(
+                "metadatas"
+            )
+            or []
+        )
+    )
diff --git a/src/vectorcode/lsp_main.py b/src/vectorcode/lsp_main.py
index f6fae6d1..e26bd7b8 100644
--- a/src/vectorcode/lsp_main.py
+++ b/src/vectorcode/lsp_main.py
@@ -15,6 +15,7 @@
     exclude_paths_by_spec,
     find_exclude_specs,
     load_files_from_include,
+    remove_orphanes,
 )
 
 try:  # pragma: nocover
@@ -220,6 +221,9 @@ async def execute_command(ls: LanguageServer, args: list[str]):
                         percentage=int(100 * i / len(tasks)),
                     ),
                 )
+
+            await remove_orphanes(collection, collection_lock, stats, stats_lock)
+
             ls.progress.end(
                 progress_token,
                 types.WorkDoneProgressEnd(
diff --git a/src/vectorcode/mcp_main.py b/src/vectorcode/mcp_main.py
index e9a9812c..86d9989c 100644
--- a/src/vectorcode/mcp_main.py
+++ b/src/vectorcode/mcp_main.py
@@ -17,6 +17,7 @@
     chunked_add,
     exclude_paths_by_spec,
     find_exclude_specs,
+    remove_orphanes,
 )
 
 try:  # pragma: nocover
@@ -157,6 +158,9 @@ async def vectorise_files(paths: list[str], project_root: str) -> dict[str, int]
         ]
         for i, task in enumerate(asyncio.as_completed(tasks), start=1):
             await task
+
+        await remove_orphanes(collection, collection_lock, stats, stats_lock)
+
 
     return stats.to_dict()
 
diff --git a/src/vectorcode/subcommands/vectorise.py b/src/vectorcode/subcommands/vectorise.py
index e1c61dbb..40ef1619 100644
--- a/src/vectorcode/subcommands/vectorise.py
+++ b/src/vectorcode/subcommands/vectorise.py
@@ -23,7 +23,12 @@
     expand_globs,
     expand_path,
 )
-from vectorcode.common import get_client, get_collection, verify_ef
+from vectorcode.common import (
+    get_client,
+    get_collection,
+    list_collection_files,
+    verify_ef,
+)
 
 logger = logging.getLogger(name=__name__)
 
@@ -155,6 +160,25 @@ async def chunked_add(
             stats.add += 1
 
 
+async def remove_orphanes(
+    collection: AsyncCollection,
+    collection_lock: Lock,
+    stats: VectoriseStats,
+    stats_lock: Lock,
+):
+    async with collection_lock:
+        paths = await list_collection_files(collection)
+        orphans = set()
+        for path in paths:
+            if isinstance(path, str) and not os.path.isfile(path):
+                orphans.add(path)
+        async with stats_lock:
+            stats.removed = len(orphans)
+        if len(orphans):
+            logger.info(f"Removing {len(orphans)} orphaned files from database.")
+            await collection.delete(where={"path": {"$in": list(orphans)}})
+
+
 def show_stats(configs: Config, stats: VectoriseStats):
     if configs.pipe:
         print(stats.to_json())
@@ -284,19 +308,7 @@ async def vectorise(configs: Config) -> int:
             print("Abort.", file=sys.stderr)
             return 1
 
-    async with collection_lock:
-        all_results = await collection.get(include=[IncludeEnum.metadatas])
-        if all_results is not None and all_results.get("metadatas"):
-            paths = (meta["path"] for meta in (all_results["metadatas"] or []))
-            orphans = set()
-            for path in paths:
-                if isinstance(path, str) and not os.path.isfile(path):
-                    orphans.add(path)
-            async with stats_lock:
-                stats.removed = len(orphans)
-            if len(orphans):
-                logger.info(f"Removing {len(orphans)} orphaned files from database.")
-                await collection.delete(where={"path": {"$in": list(orphans)}})
+    await remove_orphanes(collection, collection_lock, stats, stats_lock)
 
     show_stats(configs=configs, stats=stats)
     return 0
diff --git a/tests/test_lsp.py b/tests/test_lsp.py
index 530c2316..46bcf7eb 100644
--- a/tests/test_lsp.py
+++ b/tests/test_lsp.py
@@ -265,6 +265,9 @@ async def test_execute_command_vectorise(mock_language_server, mock_config: Conf
         patch(
             "vectorcode.lsp_main.make_caches", new_callable=AsyncMock
         ),  # Mock make_caches to avoid actual file system ops
+        patch(
+            "vectorcode.lsp_main.remove_orphanes", new_callable=AsyncMock
+        ) as mock_remove_orphanes,
     ):
         from unittest.mock import ANY
 
@@ -279,7 +282,7 @@ async def test_execute_command_vectorise(mock_language_server, mock_config: Conf
         mock_parse_cli_args.return_value = mock_config
         mock_client = AsyncMock()
         mock_get_client.return_value = mock_client
-        mock_collection = MagicMock()
+        mock_collection = AsyncMock()
         mock_get_collection.return_value = mock_collection
         mock_client.get_max_batch_size.return_value = 100  # Mock batch size
 
@@ -337,6 +340,7 @@ async def test_execute_command_vectorise(mock_language_server, mock_config: Conf
         assert mock_language_server.progress.report.call_count == len(
             dummy_expanded_files
         )
+        mock_remove_orphanes.assert_called_once()
        mock_language_server.progress.end.assert_called_once()
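
A quick way to sanity-check the extracted remove_orphanes helper outside the LSP/MCP entry points is to drive it against a mocked collection. The snippet below is a minimal sketch, not part of this patch: the mocked metadata payload, the MagicMock standing in for VectoriseStats, and the /tmp path are illustrative assumptions; remove_orphanes and list_collection_files are the helpers introduced above.

import asyncio
from unittest.mock import AsyncMock, MagicMock

from vectorcode.subcommands.vectorise import remove_orphanes


async def main():
    # Mocked AsyncCollection: pretend it holds metadata for one file that no
    # longer exists on disk (illustrative path, not from the patch).
    collection = AsyncMock()
    collection.get.return_value = {"metadatas": [{"path": "/tmp/deleted_file.py"}]}
    stats = MagicMock()  # stands in for VectoriseStats; only `removed` is set here

    await remove_orphanes(collection, asyncio.Lock(), stats, asyncio.Lock())

    collection.delete.assert_awaited_once()  # the orphaned path was pruned
    print("orphans removed:", stats.removed)


asyncio.run(main())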