From a6f8bc12f247ff3119a1c3d94123e764a13bf928 Mon Sep 17 00:00:00 2001 From: Aaron Spring Date: Mon, 20 Oct 2025 20:42:31 +0200 Subject: [PATCH 1/3] Add Snowflake Arctic Embed L v2.0 model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for Snowflake/snowflake-arctic-embed-l-v2.0, a multilingual embedding model with the following features: - 1024 dimensions - 74 languages support - 8192 token context length - Based on XLM-RoBERTa architecture - Supports Matryoshka learning for dimension truncation - Apache 2.0 license Changes: - Added model configuration to supported_onnx_models list - Added canonical test vectors for validation 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- fastembed/text/onnx_embedding.py | 14 ++++++++++++++ tests/test_text_onnx_embeddings.py | 3 +++ 2 files changed, 17 insertions(+) diff --git a/fastembed/text/onnx_embedding.py b/fastembed/text/onnx_embedding.py index 4cc892f5..96321db7 100644 --- a/fastembed/text/onnx_embedding.py +++ b/fastembed/text/onnx_embedding.py @@ -168,6 +168,20 @@ sources=ModelSource(hf="snowflake/snowflake-arctic-embed-l"), model_file="onnx/model.onnx", ), + DenseModelDescription( + model="Snowflake/snowflake-arctic-embed-l-v2.0", + dim=1024, + description=( + "Text embeddings, Unimodal (text), Multilingual (74 languages), 8192 input tokens truncation, " + "Based on XLM-RoBERTa, supports Matryoshka learning for dimension truncation, " + "Prefixes for queries: recommended (query: ), 2024 year." + ), + license="apache-2.0", + size_in_GB=2.27, + sources=ModelSource(hf="Snowflake/snowflake-arctic-embed-l-v2.0"), + model_file="onnx/model.onnx", + additional_files=["onnx/model.onnx_data"], + ), DenseModelDescription( model="jinaai/jina-clip-v1", dim=768, diff --git a/tests/test_text_onnx_embeddings.py b/tests/test_text_onnx_embeddings.py index 6b25d900..e4b99492 100644 --- a/tests/test_text_onnx_embeddings.py +++ b/tests/test_text_onnx_embeddings.py @@ -64,6 +64,9 @@ [0.0080, -0.0266, -0.0335, 0.0282, 0.0143] ), "snowflake/snowflake-arctic-embed-l": np.array([0.0189, -0.0673, 0.0183, 0.0124, 0.0146]), + "Snowflake/snowflake-arctic-embed-l-v2.0": np.array( + [-0.0266, 0.0167, -0.0478, -0.0039, -0.0128] + ), "Qdrant/clip-ViT-B-32-text": np.array([0.0083, 0.0103, -0.0138, 0.0199, -0.0069]), "thenlper/gte-base": np.array([0.0038, 0.0355, 0.0181, 0.0092, 0.0654]), "jinaai/jina-clip-v1": np.array([-0.0862, -0.0101, -0.0056, 0.0375, -0.0472]), From 730634af47e12f484b95416baa9279eae951c39a Mon Sep 17 00:00:00 2001 From: Aaron Spring Date: Mon, 20 Oct 2025 20:48:44 +0200 Subject: [PATCH 2/3] Add query/passage prefix support for Snowflake Arctic Embed L v2.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement automatic prefix handling for models with task-specific prefixes: - Added tasks field to model configuration with query_prefix and passage_prefix - Implemented query_embed() method to automatically prepend "query: " prefix - Implemented passage_embed() method (no prefix for this model) - Both methods check for tasks configuration and apply prefixes dynamically This enables optimal retrieval performance as recommended in the model documentation. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- fastembed/text/onnx_embedding.py | 60 ++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/fastembed/text/onnx_embedding.py b/fastembed/text/onnx_embedding.py index 96321db7..c4b8a4f4 100644 --- a/fastembed/text/onnx_embedding.py +++ b/fastembed/text/onnx_embedding.py @@ -171,6 +171,12 @@ DenseModelDescription( model="Snowflake/snowflake-arctic-embed-l-v2.0", dim=1024, + tasks={ + "embedding": { + "query_prefix": "query: ", + "passage_prefix": "", + } + }, description=( "Text embeddings, Unimodal (text), Multilingual (74 languages), 8192 input tokens truncation, " "Based on XLM-RoBERTa, supports Matryoshka learning for dimension truncation, " @@ -308,6 +314,60 @@ def embed( **kwargs, ) + def query_embed( + self, query: Union[str, Iterable[str]], **kwargs: Any + ) -> Iterable[NumpyArray]: + """ + Embeds queries using the model-specific query prefix if configured. + + Args: + query: The query or queries to embed + **kwargs: Additional arguments to pass to embed + + Yields: + Iterable[NumpyArray]: Query embeddings + """ + # Check if model has task-specific prefixes configured + if hasattr(self.model_description, "tasks") and self.model_description.tasks: + embedding_task = self.model_description.tasks.get("embedding", {}) + query_prefix = embedding_task.get("query_prefix", "") + + if query_prefix: + # Add prefix to queries + if isinstance(query, str): + query = f"{query_prefix}{query}" + else: + query = [f"{query_prefix}{q}" for q in query] + + # Use parent implementation + if isinstance(query, str): + yield from self.embed([query], **kwargs) + else: + yield from self.embed(query, **kwargs) + + def passage_embed(self, texts: Iterable[str], **kwargs: Any) -> Iterable[NumpyArray]: + """ + Embeds passages using the model-specific passage prefix if configured. + + Args: + texts: The passages to embed + **kwargs: Additional arguments to pass to embed + + Yields: + Iterable[NumpyArray]: Passage embeddings + """ + # Check if model has task-specific prefixes configured + if hasattr(self.model_description, "tasks") and self.model_description.tasks: + embedding_task = self.model_description.tasks.get("embedding", {}) + passage_prefix = embedding_task.get("passage_prefix", "") + + if passage_prefix: + # Add prefix to passages + texts = [f"{passage_prefix}{t}" for t in texts] + + # Use parent implementation + yield from self.embed(texts, **kwargs) + @classmethod def _get_worker_class(cls) -> Type["TextEmbeddingWorker[NumpyArray]"]: return OnnxTextEmbeddingWorker From 7f41fba5284a48306cb7a373e4b2a015561ae269 Mon Sep 17 00:00:00 2001 From: Aaron Spring Date: Mon, 20 Oct 2025 20:56:47 +0200 Subject: [PATCH 3/3] Address PR feedback: fix casing, simplify code, add tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes based on CodeRabbit review: 1. Remove redundant hasattr checks - tasks field has default factory 2. Fix model identifier casing from Snowflake/ to snowflake/ for consistency 3. Add comprehensive tests for prefix functionality: - test_query_passage_prefix: Verifies query prefix is applied correctly - test_prefix_backward_compatibility: Ensures models without prefix config work All tests passing. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- fastembed/text/onnx_embedding.py | 6 +-- tests/test_text_onnx_embeddings.py | 74 +++++++++++++++++++++++++++++- 2 files changed, 76 insertions(+), 4 deletions(-) diff --git a/fastembed/text/onnx_embedding.py b/fastembed/text/onnx_embedding.py index c4b8a4f4..74659ba4 100644 --- a/fastembed/text/onnx_embedding.py +++ b/fastembed/text/onnx_embedding.py @@ -169,7 +169,7 @@ model_file="onnx/model.onnx", ), DenseModelDescription( - model="Snowflake/snowflake-arctic-embed-l-v2.0", + model="snowflake/snowflake-arctic-embed-l-v2.0", dim=1024, tasks={ "embedding": { @@ -328,7 +328,7 @@ def query_embed( Iterable[NumpyArray]: Query embeddings """ # Check if model has task-specific prefixes configured - if hasattr(self.model_description, "tasks") and self.model_description.tasks: + if self.model_description.tasks: embedding_task = self.model_description.tasks.get("embedding", {}) query_prefix = embedding_task.get("query_prefix", "") @@ -357,7 +357,7 @@ def passage_embed(self, texts: Iterable[str], **kwargs: Any) -> Iterable[NumpyAr Iterable[NumpyArray]: Passage embeddings """ # Check if model has task-specific prefixes configured - if hasattr(self.model_description, "tasks") and self.model_description.tasks: + if self.model_description.tasks: embedding_task = self.model_description.tasks.get("embedding", {}) passage_prefix = embedding_task.get("passage_prefix", "") diff --git a/tests/test_text_onnx_embeddings.py b/tests/test_text_onnx_embeddings.py index e4b99492..ad91c87b 100644 --- a/tests/test_text_onnx_embeddings.py +++ b/tests/test_text_onnx_embeddings.py @@ -64,7 +64,7 @@ [0.0080, -0.0266, -0.0335, 0.0282, 0.0143] ), "snowflake/snowflake-arctic-embed-l": np.array([0.0189, -0.0673, 0.0183, 0.0124, 0.0146]), - "Snowflake/snowflake-arctic-embed-l-v2.0": np.array( + "snowflake/snowflake-arctic-embed-l-v2.0": np.array( [-0.0266, 0.0167, -0.0478, -0.0039, -0.0128] ), "Qdrant/clip-ViT-B-32-text": np.array([0.0083, 0.0103, -0.0138, 0.0199, -0.0069]), @@ -178,3 +178,75 @@ def test_embedding_size() -> None: if is_ci: delete_model_cache(model.model._model_dir) + + +def test_query_passage_prefix() -> None: + """Test that query/passage prefixes are applied correctly for models with prefix configuration.""" + is_ci = os.getenv("CI") + + # Test with Snowflake Arctic Embed L v2.0 which has query_prefix configured + model_name = "snowflake/snowflake-arctic-embed-l-v2.0" + model = TextEmbedding(model_name=model_name) + + test_text = "what is fastembed?" + + # Test query_embed (should apply "query: " prefix) + query_embedding = list(model.query_embed(test_text)) + query_embedding_array = np.array(query_embedding) + + # Test regular embed (should not apply prefix) + regular_embedding = list(model.embed([test_text])) + regular_embedding_array = np.array(regular_embedding) + + # Query embeddings with prefix should differ from regular embeddings without prefix + assert not np.allclose(query_embedding_array, regular_embedding_array), ( + "Query embeddings with prefix should differ from regular embeddings" + ) + + # Test passage_embed (should not apply prefix for this model) + passage_embedding = list(model.passage_embed([test_text])) + passage_embedding_array = np.array(passage_embedding) + + # Passage embeddings should match regular embeddings (both without prefix) + assert np.allclose(passage_embedding_array, regular_embedding_array, atol=1e-5), ( + "Passage embeddings should match regular embeddings when no passage prefix configured" + ) + + # Test with multiple queries + queries = ["query one", "query two"] + query_embeddings = list(model.query_embed(queries)) + assert len(query_embeddings) == 2 + assert query_embeddings[0].shape == (1024,) + + if is_ci: + delete_model_cache(model.model._model_dir) + + +def test_prefix_backward_compatibility() -> None: + """Test that models without prefix configuration still work correctly.""" + is_ci = os.getenv("CI") + + # Test with a model that doesn't have prefix configuration + model_name = "BAAI/bge-small-en-v1.5" + model = TextEmbedding(model_name=model_name) + + test_text = "hello world" + + # All three methods should produce the same embeddings for models without prefix config + query_embedding = list(model.query_embed(test_text)) + passage_embedding = list(model.passage_embed([test_text])) + regular_embedding = list(model.embed([test_text])) + + query_array = np.array(query_embedding) + passage_array = np.array(passage_embedding) + regular_array = np.array(regular_embedding) + + assert np.allclose(query_array, regular_array, atol=1e-5), ( + "Query embed should match regular embed for models without prefix config" + ) + assert np.allclose(passage_array, regular_array, atol=1e-5), ( + "Passage embed should match regular embed for models without prefix config" + ) + + if is_ci: + delete_model_cache(model.model._model_dir)