From 37eebaea58ccd84d0640a9b2c81630cdf9dd8a30 Mon Sep 17 00:00:00 2001 From: Zhe Yu Date: Thu, 24 Jul 2025 14:15:08 +0800 Subject: [PATCH 1/5] refactor(cli): Separate embedding function from collection --- src/vectorcode/cli_utils.py | 3 +++ src/vectorcode/common.py | 15 ++++++++------- src/vectorcode/subcommands/query/__init__.py | 3 ++- src/vectorcode/subcommands/vectorise.py | 5 +++++ 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/src/vectorcode/cli_utils.py b/src/vectorcode/cli_utils.py index 6c839f1b..ae09f746 100644 --- a/src/vectorcode/cli_utils.py +++ b/src/vectorcode/cli_utils.py @@ -114,6 +114,9 @@ class Config: files_action: Optional[FilesAction] = None rm_paths: list[str] = field(default_factory=list) + def __hash__(self) -> int: + return hash(self.__repr__()) + @classmethod async def import_from(cls, config_dict: dict[str, Any]) -> "Config": """ diff --git a/src/vectorcode/common.py b/src/vectorcode/common.py index c5f7cee4..f6cca17f 100644 --- a/src/vectorcode/common.py +++ b/src/vectorcode/common.py @@ -8,6 +8,7 @@ import sys from asyncio.subprocess import Process from dataclasses import dataclass +from functools import cache from typing import Any, AsyncGenerator, Optional from urllib.parse import urlparse @@ -127,11 +128,15 @@ def get_collection_name(full_path: str) -> str: return collection_id -def get_embedding_function(configs: Config) -> chromadb.EmbeddingFunction | None: +@cache +def get_embedding_function(configs: Config) -> chromadb.EmbeddingFunction: try: - return getattr(embedding_functions, configs.embedding_function)( + ef = getattr(embedding_functions, configs.embedding_function)( **configs.embedding_params ) + if ef is None: + raise AttributeError() + return ef except AttributeError: logger.warning( f"Failed to use {configs.embedding_function}. Falling back to Sentence Transformer.", @@ -161,7 +166,6 @@ async def get_collection( full_path = str(expand_path(str(configs.project_root), absolute=True)) if __COLLECTION_CACHE.get(full_path) is None: collection_name = get_collection_name(full_path) - embedding_function = get_embedding_function(configs) collection_meta: dict[str, str | int] = { "path": full_path, @@ -183,14 +187,11 @@ async def get_collection( f"Getting/Creating collection with the following metadata: {collection_meta}" ) if not make_if_missing: - __COLLECTION_CACHE[full_path] = await client.get_collection( - collection_name, embedding_function - ) + __COLLECTION_CACHE[full_path] = await client.get_collection(collection_name) else: collection = await client.get_or_create_collection( collection_name, metadata=collection_meta, - embedding_function=embedding_function, ) if ( not collection.metadata.get("hostname") == socket.gethostname() diff --git a/src/vectorcode/subcommands/query/__init__.py b/src/vectorcode/subcommands/query/__init__.py index e1c0fc2f..3828146d 100644 --- a/src/vectorcode/subcommands/query/__init__.py +++ b/src/vectorcode/subcommands/query/__init__.py @@ -19,6 +19,7 @@ from vectorcode.common import ( ClientManager, get_collection, + get_embedding_function, verify_ef, ) from vectorcode.subcommands.query.reranker import ( @@ -67,7 +68,7 @@ async def get_query_result_files( ) logger.info(f"Querying {num_query} chunks for reranking.") results = await collection.query( - query_texts=query_chunks, + query_embeddings=get_embedding_function(configs)(query_chunks), n_results=num_query, include=[ IncludeEnum.metadatas, diff --git a/src/vectorcode/subcommands/vectorise.py b/src/vectorcode/subcommands/vectorise.py index 31e6420e..14902cee 100644 --- a/src/vectorcode/subcommands/vectorise.py +++ b/src/vectorcode/subcommands/vectorise.py @@ -28,6 +28,7 @@ from vectorcode.common import ( ClientManager, get_collection, + get_embedding_function, list_collection_files, verify_ef, ) @@ -92,6 +93,7 @@ async def chunked_add( max_batch_size: int, semaphore: asyncio.Semaphore, ): + embedding_function = get_embedding_function(configs) full_path_str = str(expand_path(str(file_path), True)) orig_sha256 = None new_sha256 = hash_file(full_path_str) @@ -147,6 +149,9 @@ async def chunked_add( await collection.add( ids=[get_uuid() for _ in inserted_chunks], documents=[str(i) for i in inserted_chunks], + embeddings=embedding_function( + list(str(c) for c in inserted_chunks) + ), metadatas=metas, ) except (UnicodeDecodeError, UnicodeError): # pragma: nocover From e6bcea6e2f27b5d6b163468db58c332b8df9a9eb Mon Sep 17 00:00:00 2001 From: Zhe Yu Date: Thu, 24 Jul 2025 15:56:54 +0800 Subject: [PATCH 2/5] tests(cli): Mock embedding function in tests --- src/vectorcode/common.py | 2 +- tests/subcommands/query/test_query.py | 30 +++++++++++++++++++-------- tests/subcommands/test_vectorise.py | 6 ++++++ tests/test_mcp.py | 6 ++++++ 4 files changed, 34 insertions(+), 10 deletions(-) diff --git a/src/vectorcode/common.py b/src/vectorcode/common.py index f6cca17f..94443f4f 100644 --- a/src/vectorcode/common.py +++ b/src/vectorcode/common.py @@ -134,7 +134,7 @@ def get_embedding_function(configs: Config) -> chromadb.EmbeddingFunction: ef = getattr(embedding_functions, configs.embedding_function)( **configs.embedding_params ) - if ef is None: + if ef is None: # pragma: nocover raise AttributeError() return ef except AttributeError: diff --git a/tests/subcommands/query/test_query.py b/tests/subcommands/query/test_query.py index 9f8e4078..8ce5a7ed 100644 --- a/tests/subcommands/query/test_query.py +++ b/tests/subcommands/query/test_query.py @@ -64,8 +64,14 @@ def mock_config(): @pytest.mark.asyncio async def test_get_query_result_files(mock_collection, mock_config): - # Mock the reranker - with patch("vectorcode.subcommands.query.get_reranker") as mock_get_reranker: + mock_embedding_function = MagicMock() + with ( + patch("vectorcode.subcommands.query.get_reranker") as mock_get_reranker, + patch( + "vectorcode.subcommands.query.get_embedding_function", + return_value=mock_embedding_function, + ), + ): mock_reranker_instance = MagicMock() mock_reranker_instance.rerank = AsyncMock( return_value=[ @@ -82,9 +88,7 @@ async def test_get_query_result_files(mock_collection, mock_config): # Check that query was called with the right parameters mock_collection.query.assert_called_once() args, kwargs = mock_collection.query.call_args - assert kwargs["query_texts"] == [ - "test query" - ] # Assuming chunking produces this + mock_embedding_function.assert_called_once_with(["test query"]) assert kwargs["n_results"] == 6 # n_result(3) * query_multiplier(2) assert IncludeEnum.metadatas in kwargs["include"] assert IncludeEnum.distances in kwargs["include"] @@ -285,10 +289,14 @@ async def test_get_query_result_files_chunking(mock_collection, mock_config): mock_config.query = [ "this is a longer query that should be chunked into multiple parts" ] - + mock_embedding_function = MagicMock() with ( patch("vectorcode.subcommands.query.StringChunker") as MockChunker, patch("vectorcode.subcommands.query.reranker.NaiveReranker") as MockReranker, + patch( + "vectorcode.subcommands.query.get_embedding_function", + return_value=mock_embedding_function, + ), ): # Set up MockChunker to chunk the query mock_chunker_instance = MagicMock() @@ -309,7 +317,7 @@ async def test_get_query_result_files_chunking(mock_collection, mock_config): # Check query was called with chunked query mock_collection.query.assert_called_once() _, kwargs = mock_collection.query.call_args - assert kwargs["query_texts"] == ["chunk1", "chunk2", "chunk3"] + mock_embedding_function.assert_called_once_with(["chunk1", "chunk2", "chunk3"]) # Check the result assert result == ["file1.py", "file2.py"] @@ -319,10 +327,14 @@ async def test_get_query_result_files_chunking(mock_collection, mock_config): async def test_get_query_result_files_multiple_queries(mock_collection, mock_config): # Set multiple query terms mock_config.query = ["term1", "term2", "term3"] - + mock_embedding_function = MagicMock() with ( patch("vectorcode.subcommands.query.StringChunker") as MockChunker, patch("vectorcode.subcommands.query.reranker.NaiveReranker") as MockReranker, + patch( + "vectorcode.subcommands.query.get_embedding_function", + return_value=mock_embedding_function, + ), ): # Set up MockChunker to return the query terms as is mock_chunker_instance = MagicMock() @@ -342,7 +354,7 @@ async def test_get_query_result_files_multiple_queries(mock_collection, mock_con # Check query was called with all query terms mock_collection.query.assert_called_once() _, kwargs = mock_collection.query.call_args - assert set(kwargs["query_texts"]) == set(["term1", "term2", "term3"]) + mock_embedding_function.assert_called_once_with(["term1", "term2", "term3"]) # Check the result assert result == ["file1.py", "file2.py"] diff --git a/tests/subcommands/test_vectorise.py b/tests/subcommands/test_vectorise.py index ca64e1a5..d82069bc 100644 --- a/tests/subcommands/test_vectorise.py +++ b/tests/subcommands/test_vectorise.py @@ -6,6 +6,7 @@ from contextlib import ExitStack from unittest.mock import AsyncMock, MagicMock, mock_open, patch +import numpy import pytest from chromadb.api.models.AsyncCollection import AsyncCollection from tree_sitter import Point @@ -511,6 +512,7 @@ def is_file_side_effect(path): else: return True + mock_embedding_function = MagicMock(return_value=numpy.random.random((100,))) with ( patch("os.path.isfile", side_effect=is_file_side_effect), patch( @@ -522,6 +524,10 @@ def is_file_side_effect(path): "vectorcode.subcommands.vectorise.get_collection", return_value=mock_collection, ), + patch( + "vectorcode.subcommands.vectorise.get_embedding_function", + return_value=mock_embedding_function, + ), patch("vectorcode.subcommands.vectorise.verify_ef", return_value=True), patch( "vectorcode.subcommands.vectorise.expand_globs", diff --git a/tests/test_mcp.py b/tests/test_mcp.py index f48e684b..be5427c1 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -3,6 +3,7 @@ from argparse import ArgumentParser from unittest.mock import AsyncMock, MagicMock, mock_open, patch +import numpy import pytest from mcp import McpError @@ -202,6 +203,7 @@ async def test_vectorise_files_success(): f.write("def func(): pass") mock_client = AsyncMock() + mock_embedding_function = AsyncMock(return_value=numpy.random.random((100,))) with ( patch("os.path.isdir", return_value=True), patch("vectorcode.mcp_main.get_project_config") as mock_get_project_config, @@ -210,6 +212,10 @@ async def test_vectorise_files_success(): "vectorcode.mcp_main.ClientManager._create_client", return_value=mock_client, ), + patch( + "vectorcode.subcommands.vectorise.get_embedding_function", + return_value=mock_embedding_function, + ), patch("vectorcode.subcommands.vectorise.chunked_add"), patch( "vectorcode.subcommands.vectorise.hash_file", return_value="test_hash" From 243e8b8673fdeea481c7a80d16245cf0ce99a148 Mon Sep 17 00:00:00 2001 From: Zhe Yu Date: Thu, 24 Jul 2025 17:31:50 +0800 Subject: [PATCH 3/5] fix: Remove unnecessary mocks for query tool test --- tests/test_mcp.py | 82 +++++++++++++++++++++-------------------------- 1 file changed, 36 insertions(+), 46 deletions(-) diff --git a/tests/test_mcp.py b/tests/test_mcp.py index be5427c1..3ee589bb 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -89,29 +89,8 @@ async def test_query_tool_invalid_project_root(): @pytest.mark.asyncio async def test_query_tool_success(): mock_client = AsyncMock() - with ( - tempfile.TemporaryDirectory() as temp_dir, - patch("os.path.isdir", return_value=True), - patch("vectorcode.mcp_main.get_project_config") as mock_get_project_config, - patch("vectorcode.mcp_main.get_collection") as mock_get_collection, - patch( - "vectorcode.mcp_main.ClientManager._create_client", return_value=mock_client - ), - patch( - "vectorcode.subcommands.query.get_query_result_files" - ) as mock_get_query_result_files, - patch("vectorcode.common.try_server", return_value=True), - patch("builtins.open", create=True) as mock_open, - patch("os.path.isfile", return_value=True), - patch("os.path.relpath", return_value="rel/path.py"), - patch("vectorcode.cli_utils.load_config_file") as mock_load_config_file, - ): - mock_config = Config( - chunk_size=100, overlap_ratio=0.1, reranker=None, project_root=temp_dir - ) - mock_load_config_file.return_value = mock_config - mock_get_project_config.return_value = mock_config + with tempfile.TemporaryDirectory() as temp_dir: # Mock the collection's query method to return a valid QueryResult mock_collection = AsyncMock() mock_collection.query.return_value = { @@ -123,19 +102,39 @@ async def test_query_tool_success(): "data": None, "distances": [[0.1, 0.2]], # Valid distances } - mock_get_collection.return_value = mock_collection + for i in range(1, 3): + with open(f"file{i}.py", "w") as fin: + fin.writelines([f"doc{i}"]) + with ( + patch("vectorcode.mcp_main.get_project_config") as mock_get_project_config, + patch("vectorcode.mcp_main.get_collection") as mock_get_collection, + patch( + "vectorcode.mcp_main.ClientManager._create_client", + return_value=mock_client, + ), + patch( + "vectorcode.subcommands.query.get_query_result_files" + ) as mock_get_query_result_files, + patch("vectorcode.common.try_server", return_value=True), + patch("vectorcode.cli_utils.load_config_file") as mock_load_config_file, + ): + mock_config = Config( + chunk_size=100, overlap_ratio=0.1, reranker=None, project_root=temp_dir + ) + mock_load_config_file.return_value = mock_config + mock_get_project_config.return_value = mock_config - mock_get_query_result_files.return_value = ["file1.py", "file2.py"] - mock_file_handle = MagicMock() - mock_file_handle.__enter__.return_value.read.return_value = "file content" - mock_open.return_value = mock_file_handle + mock_get_collection.return_value = mock_collection - result = await query_tool( - n_query=2, query_messages=["keyword1"], project_root=temp_dir - ) + mock_get_query_result_files.return_value = ["file1.py", "file2.py"] + mock_file_handle = MagicMock() + mock_file_handle.__enter__.return_value.read.return_value = "file content" + + result = await query_tool( + n_query=2, query_messages=["keyword1"], project_root=temp_dir + ) - assert len(result) == 2 - assert "rel/path.py\nfile content" in result + assert len(result) == 2 @pytest.mark.asyncio @@ -271,6 +270,8 @@ async def test_vectorise_files_with_exclude_spec(): f.write("content1") with open(excluded_file, "w") as f: f.write("content_excluded") + with open(exclude_spec_file, "w") as fin: + fin.writelines(["excluded.py"]) # Create mock file handles for specific file contents mock_exclude_file_handle = mock_open(read_data="excluded.py").return_value @@ -283,24 +284,16 @@ def mock_open_side_effect(filename, *args, **kwargs): mock_client = AsyncMock() with ( - patch("os.path.isdir", return_value=True), patch("vectorcode.mcp_main.get_project_config") as mock_get_project_config, patch("vectorcode.mcp_main.get_collection") as mock_get_collection, patch( "vectorcode.mcp_main.ClientManager._create_client", return_value=mock_client, ), - patch("vectorcode.subcommands.vectorise.chunked_add") as mock_chunked_add, + patch("vectorcode.mcp_main.chunked_add") as mock_chunked_add, patch( "vectorcode.subcommands.vectorise.hash_file", return_value="test_hash" ), - # Patch builtins.open with the custom side effect - patch("builtins.open", side_effect=mock_open_side_effect), - # Patch os.path.isfile to control which files "exist" - patch( - "os.path.isfile", - side_effect=lambda x: x in [file1, excluded_file, exclude_spec_file], - ), patch("vectorcode.common.try_server", return_value=True), ): mock_config = Config(project_root=temp_dir) @@ -311,12 +304,9 @@ def mock_open_side_effect(filename, *args, **kwargs): mock_get_collection.return_value = mock_collection mock_client.get_max_batch_size.return_value = 100 - result = await vectorise_files( - paths=[file1, excluded_file], project_root=temp_dir - ) + await vectorise_files(paths=[file1, excluded_file], project_root=temp_dir) - assert result["add"] == 0 - assert mock_chunked_add.call_count == 0 + assert mock_chunked_add.call_count == 1 call_args = [call[0][0] for call in mock_chunked_add.call_args_list] assert excluded_file not in call_args From 6e787afd7b1f126f8371090ffc53ddbd97fc8215 Mon Sep 17 00:00:00 2001 From: Zhe Yu Date: Thu, 24 Jul 2025 17:40:44 +0800 Subject: [PATCH 4/5] tests(cli): Fix file path issues in query tool test --- tests/test_mcp.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/test_mcp.py b/tests/test_mcp.py index 3ee589bb..0d1b8bd8 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -91,6 +91,7 @@ async def test_query_tool_success(): mock_client = AsyncMock() with tempfile.TemporaryDirectory() as temp_dir: + os.chdir(temp_dir) # Mock the collection's query method to return a valid QueryResult mock_collection = AsyncMock() mock_collection.query.return_value = { @@ -103,7 +104,7 @@ async def test_query_tool_success(): "distances": [[0.1, 0.2]], # Valid distances } for i in range(1, 3): - with open(f"file{i}.py", "w") as fin: + with open(os.path.join(temp_dir, f"file{i}.py"), "w") as fin: fin.writelines([f"doc{i}"]) with ( patch("vectorcode.mcp_main.get_project_config") as mock_get_project_config, @@ -126,9 +127,9 @@ async def test_query_tool_success(): mock_get_collection.return_value = mock_collection - mock_get_query_result_files.return_value = ["file1.py", "file2.py"] - mock_file_handle = MagicMock() - mock_file_handle.__enter__.return_value.read.return_value = "file content" + mock_get_query_result_files.return_value = [ + os.path.join(temp_dir, i) for i in ("file1.py", "file2.py") + ] result = await query_tool( n_query=2, query_messages=["keyword1"], project_root=temp_dir @@ -202,7 +203,7 @@ async def test_vectorise_files_success(): f.write("def func(): pass") mock_client = AsyncMock() - mock_embedding_function = AsyncMock(return_value=numpy.random.random((100,))) + mock_embedding_function = MagicMock(return_value=numpy.random.random((100,))) with ( patch("os.path.isdir", return_value=True), patch("vectorcode.mcp_main.get_project_config") as mock_get_project_config, From 807bf7cafd9d45228d6c50179ce549cebc953f83 Mon Sep 17 00:00:00 2001 From: Davidyz Date: Fri, 25 Jul 2025 04:55:31 +0000 Subject: [PATCH 5/5] Auto generate docs --- doc/VectorCode-cli.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/VectorCode-cli.txt b/doc/VectorCode-cli.txt index a2465d5e..a82db3ba 100644 --- a/doc/VectorCode-cli.txt +++ b/doc/VectorCode-cli.txt @@ -81,7 +81,7 @@ If you need to install multiple dependency group (for |VectorCode-cli-lsp| or |VectorCode-cli-mcp|), you can use the following syntax: >bash - uv tool install vectorcode[lsp,mcp] + uv tool install 'vectorcode[lsp,mcp]' < @@ -136,7 +136,7 @@ LEGACY ENVIRONMENTS ~ If your environment doesn’t support `numpy` version 2.0+, the default, unconstrained numpy may not work for you. In this case, you can try installing -the package by `uv tool install vectorcode[legacy]`, which enforces numpy +the package by `uv tool install 'vectorcode[legacy]'`, which enforces numpy `v1.x`. If this doesn’t help, please open an issue with your OS, CPU architecture, python version and the vectorcode virtual environment (`uv tool run --from=vectorcode python -m ensurepip && uv tool run --from=vectorcode @@ -626,9 +626,9 @@ following options in the JSON config file: For Intel users, sentence transformer supports OpenVINO -backend for supported GPU. Run `uv install vectorcode[intel]` which will bundle -the relevant libraries when you install VectorCode. After that, you will need -to configure `SentenceTransformer` to use `openvino` backend. In your +backend for supported GPU. Run `uv install 'vectorcode[intel]'` which will +bundle the relevant libraries when you install VectorCode. After that, you will +need to configure `SentenceTransformer` to use `openvino` backend. In your `config.json`, set `backend` key in `embedding_params` to `"openvino"` >json @@ -760,11 +760,11 @@ The experimental language server can be installed via the `lsp` dependency group: >bash - pipx install vectorcode[lsp] + pipx install 'vectorcode[lsp]' ## or if you have an existing `vectorcode` install: - pipx inject vectorcode vectorcode[lsp] --force + pipx inject vectorcode 'vectorcode[lsp]' --force < The LSP request for the `workspace/executeCommand` is defined as follows: