From deea19c587d4e94fe4724fccbda6f46d2e1314d5 Mon Sep 17 00:00:00 2001 From: Zhe Yu Date: Mon, 25 Aug 2025 15:46:46 +0800 Subject: [PATCH 1/2] revert(cli): default to `NaiveReranker` --- docs/cli.md | 13 ++++--------- src/vectorcode/cli_utils.py | 2 +- tests/test_cli_utils.py | 2 +- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index 1ab6d914..d1376d64 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -311,16 +311,11 @@ The JSON configuration file may hold the following values: guarantees the return of `n` documents, but with the risk of including too many less-relevant chunks that may affect the document selection. Default: `-1` (any negative value means selecting documents based on all indexed chunks); -- `reranker`: string, the reranking method to use. Currently supports - `CrossEncoderReranker` (default, using +- `reranker`: string, the reranking method to use. Currently supports `NaiveReranker` + (default, sort chunks by the "distance" between the embedding vectors) and + `CrossEncoderReranker` (using [sentence-transformers cross-encoder](https://sbert.net/docs/package_reference/cross_encoder/cross_encoder.html) - ) and `NaiveReranker` (sort chunks by the "distance" between the embedding - vectors). - Note: If you're using a good embedding model (eg. a hosted service from OpenAI, or - a LLM-based embedding model like - [Qwen3-Embedding-0.6B](https://huggingface.co/Qwen/Qwen3-Embedding-0.6B)), you - may get better results if you use `NaiveReranker` here because a good embedding - model may understand texts better than a mediocre reranking model. + ). - `reranker_params`: dictionary, similar to `embedding_params`. The options passed to the reranker class constructor. 
For `CrossEncoderReranker`, these are the options passed to the diff --git a/src/vectorcode/cli_utils.py b/src/vectorcode/cli_utils.py index 7c49def2..0131a5e2 100644 --- a/src/vectorcode/cli_utils.py +++ b/src/vectorcode/cli_utils.py @@ -100,7 +100,7 @@ class Config: overlap_ratio: float = 0.2 query_multiplier: int = -1 query_exclude: list[Union[str, os.PathLike]] = field(default_factory=list) - reranker: Optional[str] = "CrossEncoderReranker" + reranker: Optional[str] = "NaiveReranker" reranker_params: dict[str, Any] = field(default_factory=lambda: {}) check_item: Optional[str] = None use_absolute_path: bool = False diff --git a/tests/test_cli_utils.py b/tests/test_cli_utils.py index e8e79f2b..bd10efc5 100644 --- a/tests/test_cli_utils.py +++ b/tests/test_cli_utils.py @@ -113,7 +113,7 @@ async def test_config_import_from_missing_keys(): assert config.chunk_size == 2500 assert config.overlap_ratio == 0.2 assert config.query_multiplier == -1 - assert config.reranker == "CrossEncoderReranker" + assert config.reranker == "NaiveReranker" assert config.reranker_params == {} assert config.db_settings is None From e5cb57fc236af7a514077a6389b522fe746bd4a7 Mon Sep 17 00:00:00 2001 From: Davidyz <30951234+Davidyz@users.noreply.github.com> Date: Mon, 25 Aug 2025 07:47:45 +0000 Subject: [PATCH 2/2] Auto generate docs --- doc/VectorCode-cli.txt | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/doc/VectorCode-cli.txt b/doc/VectorCode-cli.txt index 66e2f0c3..f20ec83c 100644 --- a/doc/VectorCode-cli.txt +++ b/doc/VectorCode-cli.txt @@ -358,18 +358,13 @@ most `n` documents. A larger value of `query_multiplier` guarantees the return of `n` documents, but with the risk of including too many less-relevant chunks that may affect the document selection. Default: `-1` (any negative value means selecting documents based on all indexed chunks); - `reranker`string, the -reranking method to use. 
Currently supports `CrossEncoderReranker` (default, -using sentence-transformers cross-encoder - ) -and `NaiveReranker` (sort chunks by the "distance" between the embedding -vectors). Note: If you’re using a good embedding model (eg. a hosted service -from OpenAI, or a LLM-based embedding model like Qwen3-Embedding-0.6B - ), you may get better results -if you use `NaiveReranker` here because a good embedding model may understand -texts better than a mediocre reranking model. - `reranker_params`dictionary, -similar to `embedding_params`. The options passed to the reranker class -constructor. For `CrossEncoderReranker`, these are the options passed to the -`CrossEncoder` +reranking method to use. Currently supports `NaiveReranker` (default, sort chunks by the +"distance" between the embedding vectors) and `CrossEncoderReranker` (using +sentence-transformers cross-encoder + ). +- `reranker_params`dictionary, similar to `embedding_params`. The options +passed to the reranker class constructor. For `CrossEncoderReranker`, these are +the options passed to the `CrossEncoder` class. For example, if you want to use a non-default model, you can use the following: `json { "reranker_params": { "model_name_or_path": "your_model_here"