diff --git a/doc/VectorCode-cli.txt b/doc/VectorCode-cli.txt
index a82db3ba..8266845e 100644
--- a/doc/VectorCode-cli.txt
+++ b/doc/VectorCode-cli.txt
@@ -322,17 +322,23 @@ embedding function takes. For `OllamaEmbeddingFunction`, if you set
 "model_name": "nomic-embed-text" }`
 Then the embedding function object will be initialised as
 `OllamaEmbeddingFunction(url="http://127.0.0.1:11434/api/embeddings",
-model_name="nomic-embed-text")`. Default: `{}`; - `db_url`string, the url that
-points to the Chromadb server. VectorCode will start an HTTP server for
-Chromadb at a randomly picked free port on `localhost` if your configured
-`http://host:port` is not accessible. Default: `http://127.0.0.1:8000`; -
-`db_path`string, Path to local persistent database. If you didn’t set up a
-standalone Chromadb server, this is where the files for your database will be
-stored. Default: `~/.local/share/vectorcode/chromadb/`; - `db_log_path`string,
-path to the _directory_ where the built-in chromadb server will write the log
-to. Default: `~/.local/share/vectorcode/`; - `chunk_size`integer, the maximum
-number of characters per chunk. A larger value reduces the number of items in
-the database, and hence accelerates the search, but at the cost of potentially
+model_name="nomic-embed-text")`. Default: `{}`; - `embedding_dims`integer or
+`null`, the number of dimensions to truncate the embeddings to. _Make sure your
+model supports Matryoshka Representation Learning (MRL) before using this._
+Learn more about MRL here
+<https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings>.
+When set to `null` (or unset), the embeddings won’t be truncated; -
+`db_url`string, the url that points to the Chromadb server. VectorCode will
+start an HTTP server for Chromadb at a randomly picked free port on `localhost`
+if your configured `http://host:port` is not accessible. Default:
+`http://127.0.0.1:8000`; - `db_path`string, Path to local persistent database.
+If you didn’t set up a standalone Chromadb server, this is where the files
+for your database will be stored. Default:
+`~/.local/share/vectorcode/chromadb/`; - `db_log_path`string, path to the
+_directory_ where the built-in chromadb server will write the log to. Default:
+`~/.local/share/vectorcode/`; - `chunk_size`integer, the maximum number of
+characters per chunk. A larger value reduces the number of items in the
+database, and hence accelerates the search, but at the cost of potentially
 truncated data and lost information. Default: `2500`. To disable chunking, set
 it to a negative number; - `overlap_ratio`float between 0 and 1, the ratio of
 overlapping/shared content between 2 adjacent chunks. A larger ratio improves
diff --git a/docs/cli.md b/docs/cli.md
index 7934ee84..5036ac4e 100644
--- a/docs/cli.md
+++ b/docs/cli.md
@@ -275,6 +275,10 @@ The JSON configuration file may hold the following values:
   Then the embedding function object will be initialised as
   `OllamaEmbeddingFunction(url="http://127.0.0.1:11434/api/embeddings",
   model_name="nomic-embed-text")`. Default: `{}`;
+- `embedding_dims`: integer or `null`, the number of dimensions to truncate the embeddings
+  to. _Make sure your model supports Matryoshka Representation Learning (MRL)
+  before using this._ Learn more about MRL [here](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings).
+  When set to `null` (or unset), the embeddings won't be truncated;
 - `db_url`: string, the url that points to the Chromadb server.
   VectorCode will start an HTTP server for Chromadb at a randomly picked free port
   on `localhost` if your configured `http://host:port` is not accessible. Default: `http://127.0.0.1:8000`;
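For illustration only (not part of this patch): with the keys documented above, a `.vectorcode/config.json` that opts into truncation might look like the following. The value `256` is an arbitrary example, and the Ollama parameters are copied from the docs' own example.

```json
{
    "embedding_function": "OllamaEmbeddingFunction",
    "embedding_params": {
        "url": "http://127.0.0.1:11434/api/embeddings",
        "model_name": "nomic-embed-text"
    },
    "embedding_dims": 256
}
```

Because the vectorise and query paths below apply the same truncation, stored embeddings and query embeddings stay dimensionally consistent, provided `embedding_dims` is not changed between a `vectorise` run and later queries.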
diff --git a/src/vectorcode/cli_utils.py b/src/vectorcode/cli_utils.py
index ae09f746..83e36793 100644
--- a/src/vectorcode/cli_utils.py
+++ b/src/vectorcode/cli_utils.py
@@ -89,6 +89,7 @@ class Config:
     db_url: str = "http://127.0.0.1:8000"
     embedding_function: str = "SentenceTransformerEmbeddingFunction"  # This should fallback to whatever the default is.
     embedding_params: dict[str, Any] = field(default_factory=(lambda: {}))
+    embedding_dims: Optional[int] = None
     n_result: int = 1
     force: bool = False
     db_path: Optional[str] = "~/.local/share/vectorcode/chromadb/"
@@ -139,6 +140,9 @@ async def import_from(cls, config_dict: dict[str, Any]) -> "Config":
                 "embedding_params": config_dict.get(
                     "embedding_params", default_config.embedding_params
                 ),
+                "embedding_dims": config_dict.get(
+                    "embedding_dims", default_config.embedding_dims
+                ),
                 "db_url": config_dict.get("db_url", default_config.db_url),
                 "db_path": db_path,
                 "db_log_path": os.path.expanduser(
diff --git a/src/vectorcode/subcommands/query/__init__.py b/src/vectorcode/subcommands/query/__init__.py
index 3828146d..8dea28b3 100644
--- a/src/vectorcode/subcommands/query/__init__.py
+++ b/src/vectorcode/subcommands/query/__init__.py
@@ -67,8 +67,11 @@ async def get_query_result_files(
         await collection.count(),
     )
     logger.info(f"Querying {num_query} chunks for reranking.")
+    query_embeddings = get_embedding_function(configs)(query_chunks)
+    if isinstance(configs.embedding_dims, int) and configs.embedding_dims > 0:
+        query_embeddings = [e[: configs.embedding_dims] for e in query_embeddings]
     results = await collection.query(
-        query_embeddings=get_embedding_function(configs)(query_chunks),
+        query_embeddings=query_embeddings,
         n_results=num_query,
         include=[
             IncludeEnum.metadatas,
diff --git a/src/vectorcode/subcommands/vectorise.py b/src/vectorcode/subcommands/vectorise.py
index 14902cee..1cf51569 100644
--- a/src/vectorcode/subcommands/vectorise.py
+++ b/src/vectorcode/subcommands/vectorise.py
@@ -146,12 +146,21 @@ async def chunked_add(
         async with collection_lock:
             for idx in range(0, len(chunks), max_batch_size):
                 inserted_chunks = chunks[idx : idx + max_batch_size]
+                embeddings = embedding_function(
+                    list(str(c) for c in inserted_chunks)
+                )
+                if (
+                    isinstance(configs.embedding_dims, int)
+                    and configs.embedding_dims > 0
+                ):
+                    logger.debug(
+                        f"Truncating embeddings to {configs.embedding_dims} dimensions."
+                    )
+                    embeddings = [e[: configs.embedding_dims] for e in embeddings]
                 await collection.add(
                     ids=[get_uuid() for _ in inserted_chunks],
                     documents=[str(i) for i in inserted_chunks],
-                    embeddings=embedding_function(
-                        list(str(c) for c in inserted_chunks)
-                    ),
+                    embeddings=embeddings,
                     metadatas=metas,
                 )
     except (UnicodeDecodeError, UnicodeError):  # pragma: nocover
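Both call sites above share the same guard-and-slice rule. Below is a minimal standalone sketch of just that rule; the helper name `truncate_embeddings` is hypothetical and does not exist in this patch.

```python
from typing import Optional


def truncate_embeddings(
    embeddings: list[list[float]], embedding_dims: Optional[int]
) -> list[list[float]]:
    # Same guard as the patch: truncate only for a positive integer;
    # None (unset) or non-positive values leave the embeddings untouched.
    if isinstance(embedding_dims, int) and embedding_dims > 0:
        return [e[:embedding_dims] for e in embeddings]
    return embeddings


print(truncate_embeddings([[0.1, 0.2, 0.3, 0.4]], 2))     # [[0.1, 0.2]]
print(truncate_embeddings([[0.1, 0.2, 0.3, 0.4]], None))  # unchanged
```

Keeping only the leading dimensions is exactly the property MRL-trained models are designed for, which is why the docs warn against enabling this for non-MRL models.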
diff --git a/tests/subcommands/query/test_query.py b/tests/subcommands/query/test_query.py
index 8ce5a7ed..43392526 100644
--- a/tests/subcommands/query/test_query.py
+++ b/tests/subcommands/query/test_query.py
@@ -327,14 +327,11 @@ async def test_get_query_result_files_chunking(mock_collection, mock_config):
 async def test_get_query_result_files_multiple_queries(mock_collection, mock_config):
     # Set multiple query terms
     mock_config.query = ["term1", "term2", "term3"]
-    mock_embedding_function = MagicMock()
+    mock_config.embedding_dims = 10
+
     with (
         patch("vectorcode.subcommands.query.StringChunker") as MockChunker,
         patch("vectorcode.subcommands.query.reranker.NaiveReranker") as MockReranker,
-        patch(
-            "vectorcode.subcommands.query.get_embedding_function",
-            return_value=mock_embedding_function,
-        ),
     ):
         # Set up MockChunker to return the query terms as is
         mock_chunker_instance = MagicMock()
@@ -354,7 +351,7 @@ async def test_get_query_result_files_multiple_queries(mock_collection, mock_con
     # Check query was called with all query terms
     mock_collection.query.assert_called_once()
     _, kwargs = mock_collection.query.call_args
-    mock_embedding_function.assert_called_once_with(["term1", "term2", "term3"])
+    assert all(len(i) == 10 for i in kwargs["query_embeddings"])
 
     # Check the result
     assert result == ["file1.py", "file2.py"]
diff --git a/tests/subcommands/test_vectorise.py b/tests/subcommands/test_vectorise.py
index d82069bc..3ce5683b 100644
--- a/tests/subcommands/test_vectorise.py
+++ b/tests/subcommands/test_vectorise.py
@@ -103,6 +103,44 @@ async def test_chunked_add():
     assert collection.add.call_count == 1
 
 
+@pytest.mark.asyncio
+async def test_chunked_add_truncated():
+    file_path = "test_file.py"
+    collection = AsyncMock()
+    collection_lock = asyncio.Lock()
+    stats = VectoriseStats()
+    stats_lock = asyncio.Lock()
+    configs = Config(
+        chunk_size=100, overlap_ratio=0.2, project_root=".", embedding_dims=10
+    )
+    max_batch_size = 50
+    semaphore = asyncio.Semaphore(1)
+
+    with (
+        patch("vectorcode.chunking.TreeSitterChunker.chunk") as mock_chunk,
+        patch("vectorcode.subcommands.vectorise.hash_file") as mock_hash_file,
+    ):
+        mock_hash_file.return_value = "hash1"
+        mock_chunk.return_value = [Chunk("chunk1", Point(1, 0), Point(1, 5)), "chunk2"]
+        await chunked_add(
+            file_path,
+            collection,
+            collection_lock,
+            stats,
+            stats_lock,
+            configs,
+            max_batch_size,
+            semaphore,
+        )
+
+    assert stats.add == 1
+    assert stats.update == 0
+    collection.add.assert_called()
+    assert collection.add.call_count == 1
+
+    assert all(len(i) == 10 for i in collection.add.call_args.kwargs["embeddings"])
+
+
 @pytest.mark.asyncio
 async def test_chunked_add_with_existing():
     file_path = "test_file.py"
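The new assertions in both test files rely on the same mock-inspection idiom: an `AsyncMock` records the keyword arguments of every call, so the truncated embeddings can be examined after the fact. A self-contained sketch of that idiom, independent of VectorCode:

```python
import asyncio
from unittest.mock import AsyncMock


async def main() -> None:
    collection = AsyncMock()
    # The code under test would call this with the truncated embeddings.
    await collection.add(ids=["id1"], embeddings=[[0.1] * 10, [0.2] * 10])
    # call_args.kwargs exposes the recorded keyword arguments (Python 3.8+).
    assert all(len(e) == 10 for e in collection.add.call_args.kwargs["embeddings"])


asyncio.run(main())
```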