Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 17 additions & 11 deletions doc/VectorCode-cli.txt
Original file line number Diff line number Diff line change
Expand Up @@ -322,17 +322,23 @@ embedding function takes. For `OllamaEmbeddingFunction`, if you set
"model_name": "nomic-embed-text" }` Then the embedding function object will be
initialised as
`OllamaEmbeddingFunction(url="http://127.0.0.1:11434/api/embeddings",
model_name="nomic-embed-text")`. Default: `{}`; - `db_url`string, the url that
points to the Chromadb server. VectorCode will start an HTTP server for
Chromadb at a randomly picked free port on `localhost` if your configured
`http://host:port` is not accessible. Default: `http://127.0.0.1:8000`; -
`db_path`string, Path to local persistent database. If you didn’t set up a
standalone Chromadb server, this is where the files for your database will be
stored. Default: `~/.local/share/vectorcode/chromadb/`; - `db_log_path`string,
path to the _directory_ where the built-in chromadb server will write the log
to. Default: `~/.local/share/vectorcode/`; - `chunk_size`integer, the maximum
number of characters per chunk. A larger value reduces the number of items in
the database, and hence accelerates the search, but at the cost of potentially
model_name="nomic-embed-text")`. Default: `{}`; - `embedding_dims`integer or
`null`, the number of dimensions to truncate the embeddings to. _Make sure your
model supports Matryoshka Representation Learning (MRL) before using this._
Learn more about MRL here
<https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings>.
When set to `null` (or unset), the embeddings won’t be truncated; -
`db_url`string, the url that points to the Chromadb server. VectorCode will
start an HTTP server for Chromadb at a randomly picked free port on `localhost`
if your configured `http://host:port` is not accessible. Default:
`http://127.0.0.1:8000`; - `db_path`string, Path to local persistent database.
If you didn’t set up a standalone Chromadb server, this is where the files
for your database will be stored. Default:
`~/.local/share/vectorcode/chromadb/`; - `db_log_path`string, path to the
_directory_ where the built-in chromadb server will write the log to. Default:
`~/.local/share/vectorcode/`; - `chunk_size`integer, the maximum number of
characters per chunk. A larger value reduces the number of items in the
database, and hence accelerates the search, but at the cost of potentially
truncated data and lost information. Default: `2500`. To disable chunking, set
it to a negative number; - `overlap_ratio`float between 0 and 1, the ratio of
overlapping/shared content between 2 adjacent chunks. A larger ratio improves
Expand Down
4 changes: 4 additions & 0 deletions docs/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,10 @@ The JSON configuration file may hold the following values:
Then the embedding function object will be initialised as
`OllamaEmbeddingFunction(url="http://127.0.0.1:11434/api/embeddings",
model_name="nomic-embed-text")`. Default: `{}`;
- `embedding_dims`: integer or `null`, the number of dimensions to truncate the embeddings
to. _Make sure your model supports Matryoshka Representation Learning (MRL)
before using this._ Learn more about MRL [here](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings).
When set to `null` (or unset), the embeddings won't be truncated;
- `db_url`: string, the url that points to the Chromadb server. VectorCode will start an
HTTP server for Chromadb at a randomly picked free port on `localhost` if your
configured `http://host:port` is not accessible. Default: `http://127.0.0.1:8000`;
Expand Down
4 changes: 4 additions & 0 deletions src/vectorcode/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ class Config:
db_url: str = "http://127.0.0.1:8000"
embedding_function: str = "SentenceTransformerEmbeddingFunction" # This should fallback to whatever the default is.
embedding_params: dict[str, Any] = field(default_factory=(lambda: {}))
embedding_dims: Optional[int] = None
n_result: int = 1
force: bool = False
db_path: Optional[str] = "~/.local/share/vectorcode/chromadb/"
Expand Down Expand Up @@ -139,6 +140,9 @@ async def import_from(cls, config_dict: dict[str, Any]) -> "Config":
"embedding_params": config_dict.get(
"embedding_params", default_config.embedding_params
),
"embedding_dims": config_dict.get(
"embedding_dims", default_config.embedding_dims
),
"db_url": config_dict.get("db_url", default_config.db_url),
"db_path": db_path,
"db_log_path": os.path.expanduser(
Expand Down
5 changes: 4 additions & 1 deletion src/vectorcode/subcommands/query/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,11 @@ async def get_query_result_files(
await collection.count(),
)
logger.info(f"Querying {num_query} chunks for reranking.")
query_embeddings = get_embedding_function(configs)(query_chunks)
if isinstance(configs.embedding_dims, int) and configs.embedding_dims > 0:
query_embeddings = [e[: configs.embedding_dims] for e in query_embeddings]
results = await collection.query(
query_embeddings=get_embedding_function(configs)(query_chunks),
query_embeddings=query_embeddings,
n_results=num_query,
include=[
IncludeEnum.metadatas,
Expand Down
15 changes: 12 additions & 3 deletions src/vectorcode/subcommands/vectorise.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,12 +146,21 @@ async def chunked_add(
async with collection_lock:
for idx in range(0, len(chunks), max_batch_size):
inserted_chunks = chunks[idx : idx + max_batch_size]
embeddings = embedding_function(
list(str(c) for c in inserted_chunks)
)
if (
isinstance(configs.embedding_dims, int)
and configs.embedding_dims > 0
):
logger.debug(
f"Truncating embeddings to {configs.embedding_dims} dimensions."
)
embeddings = [e[: configs.embedding_dims] for e in embeddings]
await collection.add(
ids=[get_uuid() for _ in inserted_chunks],
documents=[str(i) for i in inserted_chunks],
embeddings=embedding_function(
list(str(c) for c in inserted_chunks)
),
embeddings=embeddings,
metadatas=metas,
)
except (UnicodeDecodeError, UnicodeError): # pragma: nocover
Expand Down
9 changes: 3 additions & 6 deletions tests/subcommands/query/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,14 +327,11 @@ async def test_get_query_result_files_chunking(mock_collection, mock_config):
async def test_get_query_result_files_multiple_queries(mock_collection, mock_config):
# Set multiple query terms
mock_config.query = ["term1", "term2", "term3"]
mock_embedding_function = MagicMock()
mock_config.embedding_dims = 10

with (
patch("vectorcode.subcommands.query.StringChunker") as MockChunker,
patch("vectorcode.subcommands.query.reranker.NaiveReranker") as MockReranker,
patch(
"vectorcode.subcommands.query.get_embedding_function",
return_value=mock_embedding_function,
),
):
# Set up MockChunker to return the query terms as is
mock_chunker_instance = MagicMock()
Expand All @@ -354,7 +351,7 @@ async def test_get_query_result_files_multiple_queries(mock_collection, mock_con
# Check query was called with all query terms
mock_collection.query.assert_called_once()
_, kwargs = mock_collection.query.call_args
mock_embedding_function.assert_called_once_with(["term1", "term2", "term3"])
assert all(len(i) == 10 for i in kwargs["query_embeddings"])

# Check the result
assert result == ["file1.py", "file2.py"]
Expand Down
38 changes: 38 additions & 0 deletions tests/subcommands/test_vectorise.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,44 @@ async def test_chunked_add():
assert collection.add.call_count == 1


@pytest.mark.asyncio
async def test_chunked_add_truncated():
    """chunked_add must clip every embedding it stores to `embedding_dims` entries."""
    # Config requesting 10-dimensional (MRL-truncated) embeddings.
    cfg = Config(
        chunk_size=100, overlap_ratio=0.2, project_root=".", embedding_dims=10
    )
    mock_collection = AsyncMock()
    add_lock = asyncio.Lock()
    run_stats = VectoriseStats()
    run_stats_lock = asyncio.Lock()
    batch_limit = 50
    gate = asyncio.Semaphore(1)

    chunk_patch = patch("vectorcode.chunking.TreeSitterChunker.chunk")
    hash_patch = patch("vectorcode.subcommands.vectorise.hash_file")
    with chunk_patch as mock_chunk, hash_patch as mock_hash_file:
        mock_hash_file.return_value = "hash1"
        # One structured chunk and one bare string: both paths must be vectorised.
        mock_chunk.return_value = [Chunk("chunk1", Point(1, 0), Point(1, 5)), "chunk2"]
        await chunked_add(
            "test_file.py",
            mock_collection,
            add_lock,
            run_stats,
            run_stats_lock,
            cfg,
            batch_limit,
            gate,
        )

        # A brand-new file counts as one addition and no updates.
        assert run_stats.add == 1
        assert run_stats.update == 0
        mock_collection.add.assert_called()
        assert mock_collection.add.call_count == 1

        # Every embedding handed to the collection was truncated to 10 dims.
        for embedding in mock_collection.add.call_args.kwargs["embeddings"]:
            assert len(embedding) == 10


@pytest.mark.asyncio
async def test_chunked_add_with_existing():
file_path = "test_file.py"
Expand Down