Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions doc/VectorCode-cli.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ If you need to install multiple dependency groups (for |VectorCode-cli-lsp| or
|VectorCode-cli-mcp|), you can use the following syntax:

>bash
uv tool install vectorcode[lsp,mcp]
uv tool install 'vectorcode[lsp,mcp]'
<


Expand Down Expand Up @@ -136,7 +136,7 @@ LEGACY ENVIRONMENTS ~

If your environment doesn’t support `numpy` version 2.0+, the default,
unconstrained numpy may not work for you. In this case, you can try installing
the package by `uv tool install vectorcode[legacy]`, which enforces numpy
the package by `uv tool install 'vectorcode[legacy]'`, which enforces numpy
`v1.x`. If this doesn’t help, please open an issue with your OS, CPU
architecture, python version and the vectorcode virtual environment (`uv tool
run --from=vectorcode python -m ensurepip && uv tool run --from=vectorcode
Expand Down Expand Up @@ -626,9 +626,9 @@ following options in the JSON config file:
For Intel users, sentence transformer <https://www.sbert.net/index.html>
supports OpenVINO
<https://www.intel.com/content/www/us/en/developer/tools/openvino-toolkit/overview.html>
backend for supported GPU. Run `uv install vectorcode[intel]` which will bundle
the relevant libraries when you install VectorCode. After that, you will need
to configure `SentenceTransformer` to use `openvino` backend. In your
backend for supported GPUs. Run `uv tool install 'vectorcode[intel]'`, which
will bundle the relevant libraries when you install VectorCode. After that, you
will need to configure `SentenceTransformer` to use the `openvino` backend. In your
`config.json`, set `backend` key in `embedding_params` to `"openvino"`

>json
Expand Down Expand Up @@ -760,11 +760,11 @@ The experimental language server can be installed via the `lsp` dependency
group:

>bash
pipx install vectorcode[lsp]
pipx install 'vectorcode[lsp]'

## or if you have an existing `vectorcode` install:

pipx inject vectorcode vectorcode[lsp] --force
pipx inject vectorcode 'vectorcode[lsp]' --force
<

The LSP request for the `workspace/executeCommand` is defined as follows:
Expand Down
3 changes: 3 additions & 0 deletions src/vectorcode/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,9 @@ class Config:
files_action: Optional[FilesAction] = None
rm_paths: list[str] = field(default_factory=list)

def __hash__(self) -> int:
    """Return a hash computed from this object's ``repr()`` string.

    Two instances whose ``repr()`` output is identical hash to the same
    value. NOTE(review): presumably added so instances can be passed to
    memoised functions (e.g. ``functools.cache``), which require hashable
    arguments -- confirm against callers.
    """
    text_form = repr(self)
    return hash(text_form)

@classmethod
async def import_from(cls, config_dict: dict[str, Any]) -> "Config":
"""
Expand Down
15 changes: 8 additions & 7 deletions src/vectorcode/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import sys
from asyncio.subprocess import Process
from dataclasses import dataclass
from functools import cache
from typing import Any, AsyncGenerator, Optional
from urllib.parse import urlparse

Expand Down Expand Up @@ -127,11 +128,15 @@ def get_collection_name(full_path: str) -> str:
return collection_id


def get_embedding_function(configs: Config) -> chromadb.EmbeddingFunction | None:
@cache
def get_embedding_function(configs: Config) -> chromadb.EmbeddingFunction:
try:
return getattr(embedding_functions, configs.embedding_function)(
ef = getattr(embedding_functions, configs.embedding_function)(
**configs.embedding_params
)
if ef is None: # pragma: nocover
raise AttributeError()
return ef
except AttributeError:
logger.warning(
f"Failed to use {configs.embedding_function}. Falling back to Sentence Transformer.",
Expand Down Expand Up @@ -161,7 +166,6 @@ async def get_collection(
full_path = str(expand_path(str(configs.project_root), absolute=True))
if __COLLECTION_CACHE.get(full_path) is None:
collection_name = get_collection_name(full_path)
embedding_function = get_embedding_function(configs)

collection_meta: dict[str, str | int] = {
"path": full_path,
Expand All @@ -183,14 +187,11 @@ async def get_collection(
f"Getting/Creating collection with the following metadata: {collection_meta}"
)
if not make_if_missing:
__COLLECTION_CACHE[full_path] = await client.get_collection(
collection_name, embedding_function
)
__COLLECTION_CACHE[full_path] = await client.get_collection(collection_name)
else:
collection = await client.get_or_create_collection(
collection_name,
metadata=collection_meta,
embedding_function=embedding_function,
)
if (
not collection.metadata.get("hostname") == socket.gethostname()
Expand Down
3 changes: 2 additions & 1 deletion src/vectorcode/subcommands/query/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from vectorcode.common import (
ClientManager,
get_collection,
get_embedding_function,
verify_ef,
)
from vectorcode.subcommands.query.reranker import (
Expand Down Expand Up @@ -67,7 +68,7 @@ async def get_query_result_files(
)
logger.info(f"Querying {num_query} chunks for reranking.")
results = await collection.query(
query_texts=query_chunks,
query_embeddings=get_embedding_function(configs)(query_chunks),
n_results=num_query,
include=[
IncludeEnum.metadatas,
Expand Down
5 changes: 5 additions & 0 deletions src/vectorcode/subcommands/vectorise.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from vectorcode.common import (
ClientManager,
get_collection,
get_embedding_function,
list_collection_files,
verify_ef,
)
Expand Down Expand Up @@ -92,6 +93,7 @@ async def chunked_add(
max_batch_size: int,
semaphore: asyncio.Semaphore,
):
embedding_function = get_embedding_function(configs)
full_path_str = str(expand_path(str(file_path), True))
orig_sha256 = None
new_sha256 = hash_file(full_path_str)
Expand Down Expand Up @@ -147,6 +149,9 @@ async def chunked_add(
await collection.add(
ids=[get_uuid() for _ in inserted_chunks],
documents=[str(i) for i in inserted_chunks],
embeddings=embedding_function(
list(str(c) for c in inserted_chunks)
),
metadatas=metas,
)
except (UnicodeDecodeError, UnicodeError): # pragma: nocover
Expand Down
30 changes: 21 additions & 9 deletions tests/subcommands/query/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,14 @@ def mock_config():

@pytest.mark.asyncio
async def test_get_query_result_files(mock_collection, mock_config):
# Mock the reranker
with patch("vectorcode.subcommands.query.get_reranker") as mock_get_reranker:
mock_embedding_function = MagicMock()
with (
patch("vectorcode.subcommands.query.get_reranker") as mock_get_reranker,
patch(
"vectorcode.subcommands.query.get_embedding_function",
return_value=mock_embedding_function,
),
):
mock_reranker_instance = MagicMock()
mock_reranker_instance.rerank = AsyncMock(
return_value=[
Expand All @@ -82,9 +88,7 @@ async def test_get_query_result_files(mock_collection, mock_config):
# Check that query was called with the right parameters
mock_collection.query.assert_called_once()
args, kwargs = mock_collection.query.call_args
assert kwargs["query_texts"] == [
"test query"
] # Assuming chunking produces this
mock_embedding_function.assert_called_once_with(["test query"])
assert kwargs["n_results"] == 6 # n_result(3) * query_multiplier(2)
assert IncludeEnum.metadatas in kwargs["include"]
assert IncludeEnum.distances in kwargs["include"]
Expand Down Expand Up @@ -285,10 +289,14 @@ async def test_get_query_result_files_chunking(mock_collection, mock_config):
mock_config.query = [
"this is a longer query that should be chunked into multiple parts"
]

mock_embedding_function = MagicMock()
with (
patch("vectorcode.subcommands.query.StringChunker") as MockChunker,
patch("vectorcode.subcommands.query.reranker.NaiveReranker") as MockReranker,
patch(
"vectorcode.subcommands.query.get_embedding_function",
return_value=mock_embedding_function,
),
):
# Set up MockChunker to chunk the query
mock_chunker_instance = MagicMock()
Expand All @@ -309,7 +317,7 @@ async def test_get_query_result_files_chunking(mock_collection, mock_config):
# Check query was called with chunked query
mock_collection.query.assert_called_once()
_, kwargs = mock_collection.query.call_args
assert kwargs["query_texts"] == ["chunk1", "chunk2", "chunk3"]
mock_embedding_function.assert_called_once_with(["chunk1", "chunk2", "chunk3"])

# Check the result
assert result == ["file1.py", "file2.py"]
Expand All @@ -319,10 +327,14 @@ async def test_get_query_result_files_chunking(mock_collection, mock_config):
async def test_get_query_result_files_multiple_queries(mock_collection, mock_config):
# Set multiple query terms
mock_config.query = ["term1", "term2", "term3"]

mock_embedding_function = MagicMock()
with (
patch("vectorcode.subcommands.query.StringChunker") as MockChunker,
patch("vectorcode.subcommands.query.reranker.NaiveReranker") as MockReranker,
patch(
"vectorcode.subcommands.query.get_embedding_function",
return_value=mock_embedding_function,
),
):
# Set up MockChunker to return the query terms as is
mock_chunker_instance = MagicMock()
Expand All @@ -342,7 +354,7 @@ async def test_get_query_result_files_multiple_queries(mock_collection, mock_con
# Check query was called with all query terms
mock_collection.query.assert_called_once()
_, kwargs = mock_collection.query.call_args
assert set(kwargs["query_texts"]) == set(["term1", "term2", "term3"])
mock_embedding_function.assert_called_once_with(["term1", "term2", "term3"])

# Check the result
assert result == ["file1.py", "file2.py"]
Expand Down
6 changes: 6 additions & 0 deletions tests/subcommands/test_vectorise.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from contextlib import ExitStack
from unittest.mock import AsyncMock, MagicMock, mock_open, patch

import numpy
import pytest
from chromadb.api.models.AsyncCollection import AsyncCollection
from tree_sitter import Point
Expand Down Expand Up @@ -511,6 +512,7 @@ def is_file_side_effect(path):
else:
return True

mock_embedding_function = MagicMock(return_value=numpy.random.random((100,)))
with (
patch("os.path.isfile", side_effect=is_file_side_effect),
patch(
Expand All @@ -522,6 +524,10 @@ def is_file_side_effect(path):
"vectorcode.subcommands.vectorise.get_collection",
return_value=mock_collection,
),
patch(
"vectorcode.subcommands.vectorise.get_embedding_function",
return_value=mock_embedding_function,
),
patch("vectorcode.subcommands.vectorise.verify_ef", return_value=True),
patch(
"vectorcode.subcommands.vectorise.expand_globs",
Expand Down
Loading