Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/vectorcode/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import httpx
from chromadb.api import AsyncClientAPI
from chromadb.api.models.AsyncCollection import AsyncCollection
from chromadb.api.types import IncludeEnum
from chromadb.config import APIVersion, Settings
from chromadb.utils import embedding_functions

Expand Down Expand Up @@ -248,3 +249,15 @@ def verify_ef(collection: AsyncCollection, configs: Config):
f"The collection was embedded with a different set of configurations: {collection_ep}. The result may be inaccurate.",
)
return True


async def list_collection_files(collection: AsyncCollection) -> list[str]:
    """Return the distinct ``path`` metadata values stored in ``collection``.

    Only the metadata of the stored entries is requested. Entries that have
    no metadata, or whose metadata carries no ``path`` value, are skipped so
    that the result never contains the placeholder string ``"None"``.

    Args:
        collection: The collection whose entry metadata is read.

    Returns:
        The de-duplicated paths as strings, in no particular order. An empty
        list is returned when the collection yields no metadata.
    """
    results = await collection.get(include=[IncludeEnum.metadatas])
    if results is None:
        return []
    # ``metadatas`` may be absent or ``None``; treat both as "no entries".
    metadatas = results.get("metadatas") or []
    unique_paths = {
        str(meta["path"])
        for meta in metadatas
        # Skip entries without a usable path instead of stringifying ``None``.
        if meta and meta.get("path") is not None
    }
    return list(unique_paths)
4 changes: 4 additions & 0 deletions src/vectorcode/lsp_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
exclude_paths_by_spec,
find_exclude_specs,
load_files_from_include,
remove_orphanes,
)

try: # pragma: nocover
Expand Down Expand Up @@ -220,6 +221,9 @@ async def execute_command(ls: LanguageServer, args: list[str]):
percentage=int(100 * i / len(tasks)),
),
)

await remove_orphanes(collection, collection_lock, stats, stats_lock)

ls.progress.end(
progress_token,
types.WorkDoneProgressEnd(
Expand Down
4 changes: 4 additions & 0 deletions src/vectorcode/mcp_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
chunked_add,
exclude_paths_by_spec,
find_exclude_specs,
remove_orphanes,
)

try: # pragma: nocover
Expand Down Expand Up @@ -157,6 +158,9 @@ async def vectorise_files(paths: list[str], project_root: str) -> dict[str, int]
]
for i, task in enumerate(asyncio.as_completed(tasks), start=1):
await task

await remove_orphanes(collection, collection_lock, stats, stats_lock)

return stats.to_dict()


Expand Down
40 changes: 26 additions & 14 deletions src/vectorcode/subcommands/vectorise.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,12 @@
expand_globs,
expand_path,
)
from vectorcode.common import get_client, get_collection, verify_ef
from vectorcode.common import (
get_client,
get_collection,
list_collection_files,
verify_ef,
)

logger = logging.getLogger(name=__name__)

Expand Down Expand Up @@ -155,6 +160,25 @@ async def chunked_add(
stats.add += 1


async def remove_orphanes(
    collection: AsyncCollection,
    collection_lock: Lock,
    stats: VectoriseStats,
    stats_lock: Lock,
):
    """Delete collection entries whose recorded file is no longer on disk.

    The paths known to the collection are listed, and every path that is not
    an existing regular file is treated as an orphan. ``stats.removed`` is
    overwritten with the number of orphans found, and the matching entries
    are deleted from the collection in a single call.

    Args:
        collection: The collection to inspect and prune.
        collection_lock: Held for the whole listing and deletion.
        stats: Statistics object whose ``removed`` field is set.
        stats_lock: Held only while ``stats`` is written.
    """
    async with collection_lock:
        orphans = {
            file_path
            for file_path in await list_collection_files(collection)
            if isinstance(file_path, str) and not os.path.isfile(file_path)
        }
        async with stats_lock:
            stats.removed = len(orphans)
        if not orphans:
            # Nothing to prune; avoid issuing a delete with an empty filter.
            return
        logger.info(f"Removing {len(orphans)} orphaned files from database.")
        await collection.delete(where={"path": {"$in": list(orphans)}})


def show_stats(configs: Config, stats: VectoriseStats):
if configs.pipe:
print(stats.to_json())
Expand Down Expand Up @@ -284,19 +308,7 @@ async def vectorise(configs: Config) -> int:
print("Abort.", file=sys.stderr)
return 1

async with collection_lock:
all_results = await collection.get(include=[IncludeEnum.metadatas])
if all_results is not None and all_results.get("metadatas"):
paths = (meta["path"] for meta in (all_results["metadatas"] or []))
orphans = set()
for path in paths:
if isinstance(path, str) and not os.path.isfile(path):
orphans.add(path)
async with stats_lock:
stats.removed = len(orphans)
if len(orphans):
logger.info(f"Removing {len(orphans)} orphaned files from database.")
await collection.delete(where={"path": {"$in": list(orphans)}})
await remove_orphanes(collection, collection_lock, stats, stats_lock)

show_stats(configs=configs, stats=stats)
return 0
6 changes: 5 additions & 1 deletion tests/test_lsp.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,9 @@ async def test_execute_command_vectorise(mock_language_server, mock_config: Conf
patch(
"vectorcode.lsp_main.make_caches", new_callable=AsyncMock
), # Mock make_caches to avoid actual file system ops
patch(
"vectorcode.lsp_main.remove_orphanes", new_callable=AsyncMock
) as mock_remove_orphanes,
):
from unittest.mock import ANY

Expand All @@ -279,7 +282,7 @@ async def test_execute_command_vectorise(mock_language_server, mock_config: Conf
mock_parse_cli_args.return_value = mock_config
mock_client = AsyncMock()
mock_get_client.return_value = mock_client
mock_collection = MagicMock()
mock_collection = AsyncMock()
mock_get_collection.return_value = mock_collection
mock_client.get_max_batch_size.return_value = 100 # Mock batch size

Expand Down Expand Up @@ -337,6 +340,7 @@ async def test_execute_command_vectorise(mock_language_server, mock_config: Conf
assert mock_language_server.progress.report.call_count == len(
dummy_expanded_files
)
mock_remove_orphanes.assert_called_once()
mock_language_server.progress.end.assert_called_once()


Expand Down