From 07a5c454e2fcad8ff93f5ed97a5fe1124b8c246b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Thu, 4 Dec 2025 11:25:31 +0100 Subject: [PATCH 01/24] Add ColModernVBERT to LateInteractionMultimodalEmbedding registry --- .../colmodernvbert.py | 71 +++++++++++++++++++ .../late_interaction_multimodal_embedding.py | 3 +- 2 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 fastembed/late_interaction_multimodal/colmodernvbert.py diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py new file mode 100644 index 00000000..0c1b0e6c --- /dev/null +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -0,0 +1,71 @@ +from typing import Any, Iterable, Type, Union, Optional + +from fastembed.common import ImageInput +from fastembed.common.model_description import DenseModelDescription, ModelSource +from fastembed.common.onnx_model import OnnxOutputContext, T +from fastembed.common.types import NumpyArray +from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import ( + LateInteractionMultimodalEmbeddingBase, +) +from fastembed.late_interaction_multimodal.onnx_multimodal_model import OnnxMultimodalModel, TextEmbeddingWorker, \ + ImageEmbeddingWorker + +supported_colmodernvbert_models: list[DenseModelDescription] = [ + DenseModelDescription( + model="Qdrant/colmodernvbert", + dim=128, + description="The late-interaction version of ModernVBERT, CPU friendly, English, 2025.", + license="mit", + size_in_GB=1.0, + # TODO: change the url to hf repo link! + sources=ModelSource(url="file:///home/kacper/Projects/Qdrant/colpali-model-migration-to-onnx/outputs/colmodernvbert"), + additional_files=["model.onnx_data"], + model_file="model.onnx", + ), +] + +class ColModernVBERT(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel[NumpyArray]): + """ + The ModernVBERT/colmodernvbert model implementation. This model uses + bidirectional attention, which proves to work better for retrieval. + + See: https://huggingface.co/ModernVBERT/colmodernvbert + """ + + # TODO: reproduce ColPali methods only + + @classmethod + def _list_supported_models(cls) -> list[DenseModelDescription]: + """Lists the supported models. + + Returns: + list[DenseModelDescription]: A list of DenseModelDescription objects containing the model information. 
+ """ + return supported_colmodernvbert_models + + @classmethod + def _get_text_worker_class(cls) -> Type[TextEmbeddingWorker[NumpyArray]]: + return ColModernVBERTTextEmbeddingWorker + + @classmethod + def _get_image_worker_class(cls) -> Type[ImageEmbeddingWorker[NumpyArray]]: + return ColModernVBERTmageEmbeddingWorker + +class ColModernVBERTTextEmbeddingWorker(TextEmbeddingWorker[NumpyArray]): + def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColPali: + return ColModernVBERT( + model_name=model_name, + cache_dir=cache_dir, + threads=1, + **kwargs, + ) + + +class ColModernVBERTmageEmbeddingWorker(ImageEmbeddingWorker[NumpyArray]): + def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColPali: + return ColModernVBERT( + model_name=model_name, + cache_dir=cache_dir, + threads=1, + **kwargs, + ) diff --git a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py index afe839d4..f123dc63 100644 --- a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py +++ b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py @@ -4,6 +4,7 @@ from fastembed.common import OnnxProvider, ImageInput from fastembed.common.types import NumpyArray, Device from fastembed.late_interaction_multimodal.colpali import ColPali +from fastembed.late_interaction_multimodal.colmodernvbert import ColModernVBERT from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import ( LateInteractionMultimodalEmbeddingBase, @@ -12,7 +13,7 @@ class LateInteractionMultimodalEmbedding(LateInteractionMultimodalEmbeddingBase): - EMBEDDINGS_REGISTRY: list[Type[LateInteractionMultimodalEmbeddingBase]] = [ColPali] + EMBEDDINGS_REGISTRY: list[Type[LateInteractionMultimodalEmbeddingBase]] = [ColPali, ColModernVBERT] @classmethod def list_supported_models(cls) -> list[dict[str, Any]]: From 203ca315775aa7bf03e1c6b8d4599f90209b44b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Fri, 5 Dec 2025 17:31:05 +0100 Subject: [PATCH 02/24] Implement image processing based on Idefics3ImageProcessor logic --- fastembed/common/onnx_model.py | 1 + fastembed/image/transform/functional.py | 74 ++++++ fastembed/image/transform/operators.py | 224 +++++++++++++++++- .../onnx_multimodal_model.py | 51 +++- 4 files changed, 342 insertions(+), 8 deletions(-) diff --git a/fastembed/common/onnx_model.py b/fastembed/common/onnx_model.py index d465e870..d357f2c1 100644 --- a/fastembed/common/onnx_model.py +++ b/fastembed/common/onnx_model.py @@ -21,6 +21,7 @@ class OnnxOutputContext: model_output: NumpyArray attention_mask: NDArray[np.int64] | None = None input_ids: NDArray[np.int64] | None = None + metadata: dict[str, Any] | None = None class OnnxModel(Generic[T]): diff --git a/fastembed/image/transform/functional.py b/fastembed/image/transform/functional.py index b06ef46c..86dfbe08 100644 --- a/fastembed/image/transform/functional.py +++ b/fastembed/image/transform/functional.py @@ -145,3 +145,77 @@ def pad2square( new_image = Image.new(mode="RGB", size=(size, size), color=fill_color) new_image.paste(image.crop((left, top, right, bottom)) if crop_required else image) return new_image + + +def resize_longest_edge( + image: Image.Image, + max_size: int, + resample: Union[int, Image.Resampling] = Image.Resampling.LANCZOS, +) -> Image.Image: + height, width = image.height, image.width + aspect_ratio = width / height + + 
if width >= height: + # Width is longer + new_width = max_size + new_height = int(new_width / aspect_ratio) + else: + # Height is longer + new_height = max_size + new_width = int(new_height * aspect_ratio) + + # Ensure even dimensions + if new_height % 2 != 0: + new_height += 1 + if new_width % 2 != 0: + new_width += 1 + + return image.resize((new_width, new_height), resample) + + +def crop_ndarray( + image: NumpyArray, + x1: int, + y1: int, + x2: int, + y2: int, + channel_first: bool = True, +) -> NumpyArray: + if channel_first: + # (C, H, W) format + return image[:, y1:y2, x1:x2] + else: + # (H, W, C) format + return image[y1:y2, x1:x2, :] + + +def resize_ndarray( + image: NumpyArray, + size: tuple[int, int], + resample: Union[int, Image.Resampling] = Image.Resampling.LANCZOS, + channel_first: bool = True, +) -> NumpyArray: + # Convert to PIL-friendly format (H, W, C) + if channel_first: + img_hwc = image.transpose((1, 2, 0)) + else: + img_hwc = image + + # Handle different dtypes + if img_hwc.dtype == np.float32 or img_hwc.dtype == np.float64: + # Assume normalized, scale to 0-255 for PIL + img_hwc_scaled = (img_hwc * 255).astype(np.uint8) + pil_img = Image.fromarray(img_hwc_scaled, mode="RGB") + resized = pil_img.resize(size, resample) + result = np.array(resized).astype(np.float32) / 255.0 + else: + # uint8 or similar + pil_img = Image.fromarray(img_hwc.astype(np.uint8), mode="RGB") + resized = pil_img.resize(size, resample) + result = np.array(resized) + + # Convert back to original format + if channel_first: + result = result.transpose((2, 0, 1)) + + return result diff --git a/fastembed/image/transform/operators.py b/fastembed/image/transform/operators.py index 857b1999..b6d3814d 100644 --- a/fastembed/image/transform/operators.py +++ b/fastembed/image/transform/operators.py @@ -1,4 +1,5 @@ from typing import Any +import math from PIL import Image @@ -6,10 +7,13 @@ from fastembed.image.transform.functional import ( center_crop, convert_to_rgb, + crop_ndarray, normalize, pil2ndarray, rescale, resize, + resize_longest_edge, + resize_ndarray, pad2square, ) @@ -37,8 +41,13 @@ def __init__(self, mean: float | list[float], std: float | list[float]): self.mean = mean self.std = std - def __call__(self, images: list[NumpyArray]) -> list[NumpyArray]: - return [normalize(image, mean=self.mean, std=self.std) for image in images] + def __call__(self, images: Union[list[NumpyArray], list[list[NumpyArray]]]) -> Union[list[NumpyArray], list[list[NumpyArray]]]: + if images and isinstance(images[0], list): + # Nested structure from ImageSplitter + return [[normalize(image, mean=self.mean, std=self.std) for image in img_patches] for img_patches in images] + else: + # Flat structure (backward compatibility) + return [normalize(image, mean=self.mean, std=self.std) for image in images] class Resize(Transform): @@ -58,8 +67,13 @@ class Rescale(Transform): def __init__(self, scale: float = 1 / 255): self.scale = scale - def __call__(self, images: list[NumpyArray]) -> list[NumpyArray]: - return [rescale(image, scale=self.scale) for image in images] + def __call__(self, images: Union[list[NumpyArray], list[list[NumpyArray]]]) -> Union[list[NumpyArray], list[list[NumpyArray]]]: + if images and isinstance(images[0], list): + # Nested structure from ImageSplitter + return [[rescale(image, scale=self.scale) for image in img_patches] for img_patches in images] + else: + # Flat structure (backward compatibility) + return [rescale(image, scale=self.scale) for image in images] class PILtoNDarray(Transform): @@ 
-82,6 +96,163 @@ def __call__(self, images: list[Image.Image]) -> list[Image.Image]: ] +class ResizeLongestEdge(Transform): + """Resize images so the longest edge equals target size, preserving aspect ratio.""" + + def __init__( + self, + size: int, + resample: Image.Resampling = Image.Resampling.LANCZOS, + ): + self.size = size + self.resample = resample + + def __call__(self, images: list[Image.Image]) -> list[Image.Image]: + return [resize_longest_edge(image, self.size, self.resample) for image in images] + + +class ResizeForVisionEncoder(Transform): + """ + Resize both dimensions to be multiples of vision_encoder_max_size. + Preserves aspect ratio approximately. + Works on numpy arrays in (C, H, W) format. + """ + + def __init__( + self, + max_size: int, + resample: Image.Resampling = Image.Resampling.LANCZOS, + ): + self.max_size = max_size + self.resample = resample + + def __call__(self, images: list[NumpyArray]) -> list[NumpyArray]: + result = [] + for image in images: + # Assume (C, H, W) format + _, height, width = image.shape + + aspect_ratio = width / height + + if width >= height: + # Calculate new width as multiple of max_size + new_width = math.ceil(width / self.max_size) * self.max_size + new_height = int(new_width / aspect_ratio) + new_height = math.ceil(new_height / self.max_size) * self.max_size + else: + # Calculate new height as multiple of max_size + new_height = math.ceil(height / self.max_size) * self.max_size + new_width = int(new_height * aspect_ratio) + new_width = math.ceil(new_width / self.max_size) * self.max_size + + # Resize using the ndarray resize function + resized = resize_ndarray( + image, + size=(new_width, new_height), # PIL expects (width, height) + resample=self.resample, + channel_first=True, + ) + result.append(resized) + + return result + + +class ImageSplitter(Transform): + """ + Split images into grid of patches plus a global view. + + If image dimensions exceed max_size: + - Divide into ceil(H/max_size) x ceil(W/max_size) patches + - Each patch is cropped from the image + - Add a global view (original resized to max_size x max_size) + + If image is smaller than max_size: + - Return single image unchanged + + Works on numpy arrays in (C, H, W) format. 
+ """ + + def __init__( + self, + max_size: int, + resample: Image.Resampling = Image.Resampling.LANCZOS, + ): + self.max_size = max_size + self.resample = resample + + def __call__(self, images: list[NumpyArray]) -> list[list[NumpyArray]]: + result = [] + + for image in images: + # Assume (C, H, W) format + _, height, width = image.shape + max_height = max_width = self.max_size + + frames = [] + + if height > max_height or width > max_width: + # Calculate the number of splits needed + num_splits_h = math.ceil(height / max_height) + num_splits_w = math.ceil(width / max_width) + + # Calculate optimal patch dimensions + optimal_height = math.ceil(height / num_splits_h) + optimal_width = math.ceil(width / num_splits_w) + + # Generate patches in grid order (row by row) + for r in range(num_splits_h): + for c in range(num_splits_w): + # Calculate crop coordinates + start_x = c * optimal_width + start_y = r * optimal_height + end_x = min(start_x + optimal_width, width) + end_y = min(start_y + optimal_height, height) + + # Crop the patch + cropped = crop_ndarray( + image, x1=start_x, y1=start_y, x2=end_x, y2=end_y, channel_first=True + ) + frames.append(cropped) + + # Add global view (resized to max_size x max_size) + global_view = resize_ndarray( + image, + size=(max_width, max_height), # PIL expects (width, height) + resample=self.resample, + channel_first=True, + ) + frames.append(global_view) + else: + # Image is small enough, no splitting needed + frames.append(image) + + # Append (not extend) to preserve per-image grouping + result.append(frames) + + return result + + +class SquareResize(Transform): + """ + Resize images to square dimensions (max_size x max_size). + Works on numpy arrays in (C, H, W) format. + """ + + def __init__( + self, + size: int, + resample: Image.Resampling = Image.Resampling.LANCZOS, + ): + self.size = size + self.resample = resample + + def __call__(self, images: list[NumpyArray]) -> list[list[NumpyArray]]: + return [ + [resize_ndarray(image, size=(self.size, self.size), resample=self.resample, channel_first=True)] + for image in images + ] + + class Compose: def __init__(self, transforms: list[Transform]): self.transforms = transforms @@ -118,6 +289,7 @@ def from_config(cls, config: dict[str, Any]) -> "Compose": Valid size keys (nested): - {"height", "width"} - {"shortest_edge"} + - {"longest_edge"} Returns: Compose: Image processor. 
@@ -128,6 +300,7 @@ def from_config(cls, config: dict[str, Any]) -> "Compose": cls._get_pad2square(transforms, config) cls._get_center_crop(transforms, config) cls._get_pil2ndarray(transforms, config) + cls._get_image_splitting(transforms, config) cls._get_rescale(transforms, config) cls._get_normalize(transforms, config) return cls(transforms=transforms) @@ -196,6 +369,25 @@ def _get_resize(cls, transforms: list[Transform], config: dict[str, Any]) -> Non resample=resample, ) ) + elif mode == "Idefics3ImageProcessor": + if config.get("do_resize", False): + size = config.get("size", {}) + if "longest_edge" not in size: + raise ValueError( + "Size dictionary must contain 'longest_edge' key for Idefics3ImageProcessor" + ) + + # Handle resample parameter - can be int enum or PIL.Image.Resampling + resample = config.get("resample", Image.Resampling.LANCZOS) + if isinstance(resample, int): + resample = Image.Resampling(resample) + + transforms.append( + ResizeLongestEdge( + size=size["longest_edge"], + resample=resample, + ) + ) else: raise ValueError(f"Preprocessor {mode} is not supported") @@ -217,6 +409,8 @@ def _get_center_crop(transforms: list[Transform], config: dict[str, Any]) -> Non pass elif mode == "JinaCLIPImageProcessor": pass + elif mode == "Idefics3ImageProcessor": + pass else: raise ValueError(f"Preprocessor {mode} is not supported") @@ -224,6 +418,28 @@ def _get_center_crop(transforms: list[Transform], config: dict[str, Any]) -> Non def _get_pil2ndarray(transforms: list[Transform], config: dict[str, Any]) -> None: transforms.append(PILtoNDarray()) + @classmethod + def _get_image_splitting(cls, transforms: list[Transform], config: dict[str, Any]) -> None: + """ + Add image splitting transforms for Idefics3. + Handles conditional logic: splitting vs square resize. + Must be called AFTER PILtoNDarray. + """ + mode = config.get("image_processor_type", "CLIPImageProcessor") + + if mode == "Idefics3ImageProcessor": + do_splitting = config.get("do_image_splitting", False) + max_size = config.get("max_image_size", {}).get("longest_edge", 512) + resample = config.get("resample", Image.Resampling.LANCZOS) + if isinstance(resample, int): + resample = Image.Resampling(resample) + + if do_splitting: + transforms.append(ResizeForVisionEncoder(max_size, resample)) + transforms.append(ImageSplitter(max_size, resample)) + else: + transforms.append(SquareResize(max_size, resample)) + @staticmethod def _get_rescale(transforms: list[Transform], config: dict[str, Any]) -> None: if config.get("do_rescale", True): diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py index 18b36338..75e7ee92 100644 --- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py +++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py @@ -176,12 +176,55 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu for image in images ] assert self.processor is not None, "Processor is not initialized" - encoded = np.array(self.processor(image_files)) - onnx_input = {"pixel_values": encoded} + processed = self.processor(image_files) + + # Handle nested structure (with image splitting) + if isinstance(processed[0], list): + # processed = [[img1_patches], [img2_patches], ...] 
+ # Need shape: (batch_size, max_patches, C, H, W) + + patch_counts = [len(patches) for patches in processed] + max_patches = max(patch_counts) + + # Get dimensions from first patch + C, H, W = processed[0][0].shape + + # Create padded array + batch_size = len(processed) + encoded = np.zeros((batch_size, max_patches, C, H, W), dtype=processed[0][0].dtype) + + # Create attention mask (1 for real patches, 0 for padding) + attention_mask = np.zeros((batch_size, max_patches), dtype=np.int64) + + # Fill in patches and attention mask + for i, patches in enumerate(processed): + for j, patch in enumerate(patches): + encoded[i, j] = patch + attention_mask[i, j] = 1 + + # Track actual patch counts for later use + metadata = {"patch_counts": patch_counts} + else: + # Flat structure (no splitting) - still need batch dimension + # Shape: (batch_size, 1, C, H, W) + encoded = np.array(processed) + if len(encoded.shape) == 4: # (batch_size, C, H, W) + encoded = encoded[:, np.newaxis, ...] # Add num_patches=1 dimension + + # All patches are real (no padding) + # TODO: attention_mask should be built + attention_mask = np.ones((len(images), encoded.shape[1]), dtype=np.int64) + metadata = {"patch_counts": [encoded.shape[1]] * len(images)} + + onnx_input = {"pixel_values": encoded, "attention_mask": attention_mask} onnx_input = self._preprocess_onnx_image_input(onnx_input, **kwargs) model_output = self.model.run(None, onnx_input) # type: ignore[union-attr] - embeddings = model_output[0].reshape(len(images), -1) - return OnnxOutputContext(model_output=embeddings) + + return OnnxOutputContext( + model_output=model_output[0], + attention_mask=attention_mask, + metadata=metadata, + ) def _embed_images( self, From 8c45088e9f605211388e3bd7cba7a49965a922b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Fri, 5 Dec 2025 17:45:32 +0100 Subject: [PATCH 03/24] Fix padding support --- fastembed/common/preprocessor_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fastembed/common/preprocessor_utils.py b/fastembed/common/preprocessor_utils.py index efbc3e25..3b702f79 100644 --- a/fastembed/common/preprocessor_utils.py +++ b/fastembed/common/preprocessor_utils.py @@ -50,9 +50,10 @@ def load_tokenizer(model_dir: Path) -> tuple[Tokenizer, dict[str, int]]: tokenizer = Tokenizer.from_file(str(tokenizer_path)) tokenizer.enable_truncation(max_length=max_context) - tokenizer.enable_padding( - pad_id=config.get("pad_token_id", 0), pad_token=tokenizer_config["pad_token"] - ) + if not tokenizer.padding: + tokenizer.enable_padding( + pad_id=config.get("pad_token_id", 0), pad_token=tokenizer_config["pad_token"] + ) for token in tokens_map.values(): if isinstance(token, str): From 5b56a77d2e5d61c4e2e677df1c5daba8d2cac08a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Fri, 5 Dec 2025 17:46:28 +0100 Subject: [PATCH 04/24] Implement ColModernVBERT logic --- .../colmodernvbert.py | 376 +++++++++++++++++- 1 file changed, 370 insertions(+), 6 deletions(-) diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index 0c1b0e6c..8058e6c7 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -1,9 +1,14 @@ -from typing import Any, Iterable, Type, Union, Optional +from typing import Any, Iterable, Type, Union, Optional, Sequence +import json + +import numpy as np +from tokenizers import Encoding from fastembed.common 
import ImageInput from fastembed.common.model_description import DenseModelDescription, ModelSource from fastembed.common.onnx_model import OnnxOutputContext, T -from fastembed.common.types import NumpyArray +from fastembed.common.types import NumpyArray, OnnxProvider +from fastembed.common.utils import define_cache_dir from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import ( LateInteractionMultimodalEmbeddingBase, ) @@ -17,8 +22,7 @@ description="The late-interaction version of ModernVBERT, CPU friendly, English, 2025.", license="mit", size_in_GB=1.0, - # TODO: change the url to hf repo link! - sources=ModelSource(url="file:///home/kacper/Projects/Qdrant/colpali-model-migration-to-onnx/outputs/colmodernvbert"), + sources=ModelSource(hf="Qdrant/colmodernvbert"), additional_files=["model.onnx_data"], model_file="model.onnx", ), @@ -32,6 +36,78 @@ class ColModernVBERT(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel See: https://huggingface.co/ModernVBERT/colmodernvbert """ + VISUAL_PROMPT_PREFIX = "<|begin_of_text|>User:Describe the image.\nAssistant:" + + def __init__( + self, + model_name: str, + cache_dir: Optional[str] = None, + threads: Optional[int] = None, + providers: Optional[Sequence[OnnxProvider]] = None, + cuda: bool = False, + device_ids: Optional[list[int]] = None, + lazy_load: bool = False, + device_id: Optional[int] = None, + specific_model_path: Optional[str] = None, + **kwargs: Any, + ): + """ + Args: + model_name (str): The name of the model to use. + cache_dir (str, optional): The path to the cache directory. + Can be set using the `FASTEMBED_CACHE_PATH` env variable. + Defaults to `fastembed_cache` in the system's temp directory. + threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None. + providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use. + Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None. + cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers` + Defaults to False. + device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in + workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None. + lazy_load (bool, optional): Whether to load the model during class initialization or on demand. + Should be set to True when using multiple-gpu and parallel encoding. Defaults to False. + device_id (Optional[int], optional): The device id to use for loading the model in the worker process. + + Raises: + ValueError: If the model_name is not in the format / e.g. BAAI/bge-base-en. 
+ """ + + # TODO: consider unifying ColPali and ColModernVBERT __init__ methods + + super().__init__(model_name, cache_dir, threads, **kwargs) + self.providers = providers + self.lazy_load = lazy_load + self._extra_session_options = self._select_exposed_session_options(kwargs) + + # List of device ids, that can be used for data parallel processing in workers + self.device_ids = device_ids + self.cuda = cuda + + # This device_id will be used if we need to load model in current process + self.device_id: Optional[int] = None + if device_id is not None: + self.device_id = device_id + elif self.device_ids is not None: + self.device_id = self.device_ids[0] + + self.model_description = self._get_model_description(model_name) + self.cache_dir = str(define_cache_dir(cache_dir)) + + self._specific_model_path = specific_model_path + self._model_dir = self.download_model( + self.model_description, + self.cache_dir, + local_files_only=self._local_files_only, + specific_model_path=self._specific_model_path, + ) + self.mask_token_id = None + self.pad_token_id = None + self.image_seq_len: Optional[int] = None + self.max_image_size: Optional[int] = None + + if not self.lazy_load: + self.load_onnx_model() + # TODO: reproduce ColPali methods only @classmethod @@ -43,6 +119,294 @@ def _list_supported_models(cls) -> list[DenseModelDescription]: """ return supported_colmodernvbert_models + def load_onnx_model(self) -> None: + self._load_onnx_model( + model_dir=self._model_dir, + model_file=self.model_description.model_file, + threads=self.threads, + providers=self.providers, + cuda=self.cuda, + device_id=self.device_id, + extra_session_options=self._extra_session_options, + ) + + # Load image processing configuration + processor_config_path = self._model_dir / "processor_config.json" + with open(processor_config_path) as f: + processor_config = json.load(f) + self.image_seq_len = processor_config.get("image_seq_len", 64) + + preprocessor_config_path = self._model_dir / "preprocessor_config.json" + with open(preprocessor_config_path) as f: + preprocessor_config = json.load(f) + self.max_image_size = preprocessor_config.get("max_image_size", {}).get("longest_edge", 512) + + def _preprocess_onnx_text_input( + self, onnx_input: dict[str, NumpyArray], **kwargs: Any + ) -> dict[str, NumpyArray]: + """ + Post-process the ONNX model output to convert it into a usable format. + + Args: + output (OnnxOutputContext): The raw output from the ONNX model. + + Returns: + Iterable[NumpyArray]: Post-processed output as NumPy arrays. + """ + batch_size, seq_length = onnx_input["input_ids"].shape + # TODO: use .json config, not 3, 512, 512 + empty_image_placeholder: NumpyArray = np.zeros( + (batch_size, seq_length, 3, 512, 512), dtype=np.float32 + ) + onnx_input["pixel_values"] = empty_image_placeholder + return onnx_input + + def _post_process_onnx_text_output( + self, + output: OnnxOutputContext, + ) -> Iterable[NumpyArray]: + """ + Post-process the ONNX model output to convert it into a usable format. + + Args: + output (OnnxOutputContext): The raw output from the ONNX model. + + Returns: + Iterable[NumpyArray]: Post-processed output as NumPy arrays. 
+ """ + return output.model_output + + def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]: + encoded = self.tokenizer.encode_batch(documents) # type: ignore[union-attr] + return encoded + + def _preprocess_onnx_image_input( + self, onnx_input: dict[str, np.ndarray], **kwargs: Any + ) -> dict[str, NumpyArray]: + """ + Add text input placeholders for image data, following Idefics3 processing logic. + + Constructs input_ids dynamically based on the actual number of image patches, + using the same token expansion logic as Idefics3Processor. + + Args: + onnx_input: Dict with 'pixel_values' (batch, num_patches, C, H, W) + and 'attention_mask' (batch, num_patches) indicating real patches + **kwargs: Additional arguments + + Returns: + Updated onnx_input with 'input_ids' and updated 'attention_mask' for token sequence + """ + # The attention_mask in onnx_input has a shape of (batch_size, num_patches), + # and should be used to create an attention mask matching the input_ids shape. + patch_attention_mask = onnx_input["attention_mask"] + pixel_values = onnx_input["pixel_values"] + + batch_size = pixel_values.shape[0] + batch_input_ids = [] + + # Build input_ids for each image based on its actual patch count + for i in range(batch_size): + # Count real patches (non-padded) from attention mask + patch_count = int(np.sum(patch_attention_mask[i])) + + # Compute rows/cols from patch count + rows, cols = self._compute_rows_cols_from_patches(patch_count) + + # Build input_ids for this image + input_ids = self._build_input_ids_for_image(rows, cols) + batch_input_ids.append(input_ids) + + # Pad sequences to max length in batch + max_len = max(len(ids) for ids in batch_input_ids) + + # Get padding config from tokenizer + padding_direction = self.tokenizer.padding["direction"] # type: ignore[index,union-attr] + pad_token_id = self.tokenizer.padding["pad_id"] # type: ignore[index,union-attr] + + # Initialize with pad token + padded_input_ids = np.full((batch_size, max_len), pad_token_id, dtype=np.int64) + attention_mask = np.zeros((batch_size, max_len), dtype=np.int64) + + for i, input_ids in enumerate(batch_input_ids): + seq_len = len(input_ids) + if padding_direction == "left": + # Left padding: place tokens at the END of the array + start_idx = max_len - seq_len + padded_input_ids[i, start_idx:] = input_ids + attention_mask[i, start_idx:] = 1 + else: + # Right padding: place tokens at the START of the array + padded_input_ids[i, :seq_len] = input_ids + attention_mask[i, :seq_len] = 1 + + onnx_input["input_ids"] = padded_input_ids + # Update attention_mask with token-level data + onnx_input["attention_mask"] = attention_mask + return onnx_input + + def _compute_rows_cols_from_patches(self, patch_count: int) -> tuple[int, int]: + if patch_count <= 1: + return 0, 0 + + # Subtract 1 for the global image + grid_patches = patch_count - 1 + + # Find rows and cols (assume square or near-square grid) + rows = int(grid_patches ** 0.5) + cols = grid_patches // rows + + # Verify the calculation + if rows * cols + 1 != patch_count: + # Handle non-square grids + for r in range(1, grid_patches + 1): + if grid_patches % r == 0: + c = grid_patches // r + if r * c + 1 == patch_count: + return r, c + # Fallback: treat as unsplit + return 0, 0 + + return rows, cols + + def _create_single_image_prompt_string(self) -> str: + return ( + "" + + "" + + "" * self.image_seq_len + + "" + ) + + def _create_split_image_prompt_string(self, rows: int, cols: int) -> str: + text_split_images = "" + + # Add tokens for 
each patch in the grid + for n_h in range(rows): + for n_w in range(cols): + text_split_images += ( + f"" + + f"" + + "" * self.image_seq_len + ) + text_split_images += "\n" + + # Add global image at the end + text_split_images += ( + f"\n" + + "" + + "" * self.image_seq_len + + "" + ) + + return text_split_images + + def _build_input_ids_for_image(self, rows: int, cols: int) -> np.ndarray: + # Create the appropriate image prompt string + if rows == 0 and cols == 0: + image_prompt_tokens = self._create_single_image_prompt_string() + else: + image_prompt_tokens = self._create_split_image_prompt_string(rows, cols) + + # Replace in visual prompt with expanded tokens + # The visual prompt is: "<|begin_of_text|>User:Describe the image.\nAssistant:" + expanded_prompt = self.VISUAL_PROMPT_PREFIX.replace("", image_prompt_tokens) + + # Tokenize the complete prompt + encoded = self.tokenizer.encode(expanded_prompt) # type: ignore[union-attr] + + # Convert to numpy array + return np.array(encoded.ids, dtype=np.int64) + + def _post_process_onnx_image_output( + self, + output: OnnxOutputContext, + ) -> Iterable[NumpyArray]: + """ + Post-process the ONNX model output to convert it into a usable format. + + Args: + output (OnnxOutputContext): The raw output from the ONNX model. + + Returns: + Iterable[NumpyArray]: Post-processed output as NumPy arrays. + """ + assert self.model_description.dim is not None, "Model dim is not defined" + return output.model_output.reshape( + output.model_output.shape[0], -1, self.model_description.dim + ) + + def embed_text( + self, + documents: Union[str, Iterable[str]], + batch_size: int = 256, + parallel: Optional[int] = None, + **kwargs: Any, + ) -> Iterable[NumpyArray]: + """ + Encode a list of documents into list of embeddings. + + Args: + documents: Iterator of documents or single document to embed + batch_size: Batch size for encoding -- higher values will use more memory, but be faster + parallel: + If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. + If 0, use all available cores. + If None, don't use data-parallel processing, use default onnxruntime threading instead. + + Returns: + List of embeddings, one per document + """ + yield from self._embed_documents( + model_name=self.model_name, + cache_dir=str(self.cache_dir), + documents=documents, + batch_size=batch_size, + parallel=parallel, + providers=self.providers, + cuda=self.cuda, + device_ids=self.device_ids, + local_files_only=self._local_files_only, + specific_model_path=self._specific_model_path, + extra_session_options=self._extra_session_options, + **kwargs, + ) + + def embed_image( + self, + images: Union[ImageInput, Iterable[ImageInput]], + batch_size: int = 16, + parallel: Optional[int] = None, + **kwargs: Any, + ) -> Iterable[NumpyArray]: + """ + Encode a list of images into list of embeddings. + + Args: + images: Iterator of image paths or single image path to embed + batch_size: Batch size for encoding -- higher values will use more memory, but be faster + parallel: + If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. + If 0, use all available cores. + If None, don't use data-parallel processing, use default onnxruntime threading instead. 
+ + Returns: + List of embeddings, one per document + """ + yield from self._embed_images( + model_name=self.model_name, + cache_dir=str(self.cache_dir), + images=images, + batch_size=batch_size, + parallel=parallel, + providers=self.providers, + cuda=self.cuda, + device_ids=self.device_ids, + local_files_only=self._local_files_only, + specific_model_path=self._specific_model_path, + extra_session_options=self._extra_session_options, + **kwargs, + ) + @classmethod def _get_text_worker_class(cls) -> Type[TextEmbeddingWorker[NumpyArray]]: return ColModernVBERTTextEmbeddingWorker @@ -52,7 +416,7 @@ def _get_image_worker_class(cls) -> Type[ImageEmbeddingWorker[NumpyArray]]: return ColModernVBERTmageEmbeddingWorker class ColModernVBERTTextEmbeddingWorker(TextEmbeddingWorker[NumpyArray]): - def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColPali: + def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColModernVBERT: return ColModernVBERT( model_name=model_name, cache_dir=cache_dir, @@ -62,7 +426,7 @@ def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColP class ColModernVBERTmageEmbeddingWorker(ImageEmbeddingWorker[NumpyArray]): - def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColPali: + def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColModernVBERT: return ColModernVBERT( model_name=model_name, cache_dir=cache_dir, From e637a7fe3aef02cf23855f9671ff0d9a121251cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Fri, 5 Dec 2025 17:47:28 +0100 Subject: [PATCH 05/24] Remove TODOs --- fastembed/late_interaction_multimodal/colmodernvbert.py | 5 ----- .../late_interaction_multimodal/onnx_multimodal_model.py | 1 - 2 files changed, 6 deletions(-) diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index 8058e6c7..c55d099b 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -71,9 +71,6 @@ def __init__( Raises: ValueError: If the model_name is not in the format / e.g. BAAI/bge-base-en. """ - - # TODO: consider unifying ColPali and ColModernVBERT __init__ methods - super().__init__(model_name, cache_dir, threads, **kwargs) self.providers = providers self.lazy_load = lazy_load @@ -108,8 +105,6 @@ def __init__( if not self.lazy_load: self.load_onnx_model() - # TODO: reproduce ColPali methods only - @classmethod def _list_supported_models(cls) -> list[DenseModelDescription]: """Lists the supported models. diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py index 75e7ee92..cbfc09e8 100644 --- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py +++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py @@ -212,7 +212,6 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu encoded = encoded[:, np.newaxis, ...] 
# Add num_patches=1 dimension # All patches are real (no padding) - # TODO: attention_mask should be built attention_mask = np.ones((len(images), encoded.shape[1]), dtype=np.int64) metadata = {"patch_counts": [encoded.shape[1]] * len(images)} From 74f5c3e47462ac133e155618f2cd3cea5473bcd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Fri, 5 Dec 2025 18:10:27 +0100 Subject: [PATCH 06/24] Handle empty pixel values with proper image_size --- .../late_interaction_multimodal/colmodernvbert.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index c55d099b..a3357517 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -101,6 +101,7 @@ def __init__( self.pad_token_id = None self.image_seq_len: Optional[int] = None self.max_image_size: Optional[int] = None + self.image_size: Optional[int] = None if not self.lazy_load: self.load_onnx_model() @@ -136,6 +137,13 @@ def load_onnx_model(self) -> None: preprocessor_config = json.load(f) self.max_image_size = preprocessor_config.get("max_image_size", {}).get("longest_edge", 512) + # Load model configuration + config_path = self._model_dir / "config.json" + with open(config_path) as f: + model_config = json.load(f) + vision_config = model_config.get("vision_config", {}) + self.image_size = vision_config.get("image_size", 512) + def _preprocess_onnx_text_input( self, onnx_input: dict[str, NumpyArray], **kwargs: Any ) -> dict[str, NumpyArray]: @@ -149,9 +157,8 @@ def _preprocess_onnx_text_input( Iterable[NumpyArray]: Post-processed output as NumPy arrays. """ batch_size, seq_length = onnx_input["input_ids"].shape - # TODO: use .json config, not 3, 512, 512 empty_image_placeholder: NumpyArray = np.zeros( - (batch_size, seq_length, 3, 512, 512), dtype=np.float32 + (batch_size, seq_length, 3, self.image_size, self.image_size), dtype=np.float32 ) onnx_input["pixel_values"] = empty_image_placeholder return onnx_input From 9e2929ec2b05fb7070ef29d9a620ecca3a1ea605 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Fri, 5 Dec 2025 18:19:05 +0100 Subject: [PATCH 07/24] Add ColModernVBERT tests --- tests/test_late_interaction_multimodal.py | 29 +++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py index 8a102ace..8aa3a44e 100644 --- a/tests/test_late_interaction_multimodal.py +++ b/tests/test_late_interaction_multimodal.py @@ -21,6 +21,17 @@ [-0.1299, -0.0691, 0.1097, 0.0728, 0.0123, 0.0519, 0.0122], ] ), + "Qdrant/colmodernvbert": np.array( + [ + [0.2256, -0.0503, 0.0254, -0.011, -0.0786, 0.2152, -0.0961], + [-0.0028, -0.0484, -0.0724, -0.0724, -0.0977, 0.0308, -0.0236], + [0.0035, -0.1075, -0.0877, -0.0207, -0.0828, -0.0294, -0.0253], + [0.0021, -0.0797, -0.0605, -0.0008, -0.0837, 0.0015, -0.0846], + [-0.0473, -0.0594, -0.0553, -0.0014, -0.0712, 0.0158, -0.0546], + [-0.1009, -0.082, -0.0684, -0.1385, -0.0469, -0.0606, -0.0323], + [-0.0624, 0.006, -0.0498, -0.0127, -0.1115, 0.0076, -0.0888], + ] + ), } CANONICAL_QUERY_VALUES = { @@ -35,6 +46,17 @@ [-0.0165, -0.0106, 0.1672, -0.0768, 0.0389, -0.0038, 0.1137], ] ), + "Qdrant/colmodernvbert": np.array( + [ + [0.05, 0.0656, 0.0403, 0.1498, 0.1842, 0.0263, -0.1871], + [-0.0566, -0.1403, 0.0065, -0.0285, 0.0903, -0.0149, 0.1069], + [-0.1015, -0.0072, 0.0908, 
-0.0824, -0.0185, -0.0097, -0.0046], + [-0.1233, -0.1081, -0.0234, -0.0033, 0.0598, 0.0993, 0.0985], + [-0.0705, -0.1312, -0.0649, 0.0151, 0.0746, 0.0765, 0.1482], + [0.0053, -0.1384, -0.0584, -0.0272, 0.1301, 0.0508, 0.1796], + [0.0092, -0.1438, -0.0306, -0.0369, 0.1172, 0.037, 0.1334], + ] + ), } queries = ["hello world", "flag embedding"] @@ -90,6 +112,9 @@ def test_get_embedding_size(): model_name = "Qdrant/ColPali-v1.3-fp16" assert LateInteractionMultimodalEmbedding.get_embedding_size(model_name) == 128 + model_name = "Qdrant/colmodernvbert" + assert LateInteractionMultimodalEmbedding.get_embedding_size(model_name) == 128 + def test_embedding_size(): if os.getenv("CI"): @@ -102,6 +127,10 @@ def test_embedding_size(): model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True) assert model.embedding_size == 128 + model_name = "Qdrant/colmodernvbert" + model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True) + assert model.embedding_size == 128 + def test_token_count() -> None: if os.getenv("CI"): From aa93a528679c6a25fa15116628206f4c85d404d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Mon, 8 Dec 2025 13:17:38 +0100 Subject: [PATCH 08/24] Run pre-commit --- fastembed/image/transform/operators.py | 24 ++++++++++++++---- .../colmodernvbert.py | 25 +++++++++++++------ .../late_interaction_multimodal_embedding.py | 5 +++- 3 files changed, 40 insertions(+), 14 deletions(-) diff --git a/fastembed/image/transform/operators.py b/fastembed/image/transform/operators.py index b6d3814d..cabff63d 100644 --- a/fastembed/image/transform/operators.py +++ b/fastembed/image/transform/operators.py @@ -41,10 +41,15 @@ def __init__(self, mean: float | list[float], std: float | list[float]): self.mean = mean self.std = std - def __call__(self, images: Union[list[NumpyArray], list[list[NumpyArray]]]) -> Union[list[NumpyArray], list[list[NumpyArray]]]: + def __call__( + self, images: Union[list[NumpyArray], list[list[NumpyArray]]] + ) -> Union[list[NumpyArray], list[list[NumpyArray]]]: if images and isinstance(images[0], list): # Nested structure from ImageSplitter - return [[normalize(image, mean=self.mean, std=self.std) for image in img_patches] for img_patches in images] + return [ + [normalize(image, mean=self.mean, std=self.std) for image in img_patches] + for img_patches in images + ] else: # Flat structure (backward compatibility) return [normalize(image, mean=self.mean, std=self.std) for image in images] @@ -67,10 +72,15 @@ class Rescale(Transform): def __init__(self, scale: float = 1 / 255): self.scale = scale - def __call__(self, images: Union[list[NumpyArray], list[list[NumpyArray]]]) -> Union[list[NumpyArray], list[list[NumpyArray]]]: + def __call__( + self, images: Union[list[NumpyArray], list[list[NumpyArray]]] + ) -> Union[list[NumpyArray], list[list[NumpyArray]]]: if images and isinstance(images[0], list): # Nested structure from ImageSplitter - return [[rescale(image, scale=self.scale) for image in img_patches] for img_patches in images] + return [ + [rescale(image, scale=self.scale) for image in img_patches] + for img_patches in images + ] else: # Flat structure (backward compatibility) return [rescale(image, scale=self.scale) for image in images] @@ -248,7 +258,11 @@ def __init__( def __call__(self, images: list[NumpyArray]) -> list[list[NumpyArray]]: return [ - [resize_ndarray(image, size=(self.size, self.size), resample=self.resample, channel_first=True)] + [ + resize_ndarray( + image, size=(self.size, self.size), 
resample=self.resample, channel_first=True + ) + ] for image in images ] diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index a3357517..d975f510 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -6,14 +6,17 @@ from fastembed.common import ImageInput from fastembed.common.model_description import DenseModelDescription, ModelSource -from fastembed.common.onnx_model import OnnxOutputContext, T +from fastembed.common.onnx_model import OnnxOutputContext from fastembed.common.types import NumpyArray, OnnxProvider from fastembed.common.utils import define_cache_dir from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import ( LateInteractionMultimodalEmbeddingBase, ) -from fastembed.late_interaction_multimodal.onnx_multimodal_model import OnnxMultimodalModel, TextEmbeddingWorker, \ - ImageEmbeddingWorker +from fastembed.late_interaction_multimodal.onnx_multimodal_model import ( + OnnxMultimodalModel, + TextEmbeddingWorker, + ImageEmbeddingWorker, +) supported_colmodernvbert_models: list[DenseModelDescription] = [ DenseModelDescription( @@ -28,6 +31,7 @@ ), ] + class ColModernVBERT(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel[NumpyArray]): """ The ModernVBERT/colmodernvbert model implementation. This model uses @@ -36,7 +40,9 @@ class ColModernVBERT(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel See: https://huggingface.co/ModernVBERT/colmodernvbert """ - VISUAL_PROMPT_PREFIX = "<|begin_of_text|>User:Describe the image.\nAssistant:" + VISUAL_PROMPT_PREFIX = ( + "<|begin_of_text|>User:Describe the image.\nAssistant:" + ) def __init__( self, @@ -135,7 +141,9 @@ def load_onnx_model(self) -> None: preprocessor_config_path = self._model_dir / "preprocessor_config.json" with open(preprocessor_config_path) as f: preprocessor_config = json.load(f) - self.max_image_size = preprocessor_config.get("max_image_size", {}).get("longest_edge", 512) + self.max_image_size = preprocessor_config.get("max_image_size", {}).get( + "longest_edge", 512 + ) # Load model configuration config_path = self._model_dir / "config.json" @@ -255,7 +263,7 @@ def _compute_rows_cols_from_patches(self, patch_count: int) -> tuple[int, int]: grid_patches = patch_count - 1 # Find rows and cols (assume square or near-square grid) - rows = int(grid_patches ** 0.5) + rows = int(grid_patches**0.5) cols = grid_patches // rows # Verify the calculation @@ -286,7 +294,7 @@ def _create_split_image_prompt_string(self, rows: int, cols: int) -> str: for n_h in range(rows): for n_w in range(cols): text_split_images += ( - f"" + "" + f"" + "" * self.image_seq_len ) @@ -294,7 +302,7 @@ def _create_split_image_prompt_string(self, rows: int, cols: int) -> str: # Add global image at the end text_split_images += ( - f"\n" + "\n" + "" + "" * self.image_seq_len + "" @@ -417,6 +425,7 @@ def _get_text_worker_class(cls) -> Type[TextEmbeddingWorker[NumpyArray]]: def _get_image_worker_class(cls) -> Type[ImageEmbeddingWorker[NumpyArray]]: return ColModernVBERTmageEmbeddingWorker + class ColModernVBERTTextEmbeddingWorker(TextEmbeddingWorker[NumpyArray]): def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColModernVBERT: return ColModernVBERT( diff --git a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py index 
f123dc63..10d426d0 100644 --- a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py +++ b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py @@ -13,7 +13,10 @@ class LateInteractionMultimodalEmbedding(LateInteractionMultimodalEmbeddingBase): - EMBEDDINGS_REGISTRY: list[Type[LateInteractionMultimodalEmbeddingBase]] = [ColPali, ColModernVBERT] + EMBEDDINGS_REGISTRY: list[Type[LateInteractionMultimodalEmbeddingBase]] = [ + ColPali, + ColModernVBERT, + ] @classmethod def list_supported_models(cls) -> list[dict[str, Any]]: From 6470e35317d6ebc4576f4b609ae9504faf93ee3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Mon, 8 Dec 2025 13:29:51 +0100 Subject: [PATCH 09/24] mypy fixes --- fastembed/image/transform/operators.py | 16 ++++++++-------- .../colmodernvbert.py | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fastembed/image/transform/operators.py b/fastembed/image/transform/operators.py index cabff63d..3ee2d873 100644 --- a/fastembed/image/transform/operators.py +++ b/fastembed/image/transform/operators.py @@ -41,18 +41,18 @@ def __init__(self, mean: float | list[float], std: float | list[float]): self.mean = mean self.std = std - def __call__( + def __call__( # type: ignore[override] self, images: Union[list[NumpyArray], list[list[NumpyArray]]] ) -> Union[list[NumpyArray], list[list[NumpyArray]]]: if images and isinstance(images[0], list): # Nested structure from ImageSplitter return [ - [normalize(image, mean=self.mean, std=self.std) for image in img_patches] + [normalize(image, mean=self.mean, std=self.std) for image in img_patches] # type: ignore[arg-type] for img_patches in images ] else: # Flat structure (backward compatibility) - return [normalize(image, mean=self.mean, std=self.std) for image in images] + return [normalize(image, mean=self.mean, std=self.std) for image in images] # type: ignore[arg-type] class Resize(Transform): @@ -72,18 +72,18 @@ class Rescale(Transform): def __init__(self, scale: float = 1 / 255): self.scale = scale - def __call__( + def __call__( # type: ignore[override] self, images: Union[list[NumpyArray], list[list[NumpyArray]]] ) -> Union[list[NumpyArray], list[list[NumpyArray]]]: if images and isinstance(images[0], list): # Nested structure from ImageSplitter return [ - [rescale(image, scale=self.scale) for image in img_patches] + [rescale(image, scale=self.scale) for image in img_patches] # type: ignore[arg-type] for img_patches in images ] else: # Flat structure (backward compatibility) - return [rescale(image, scale=self.scale) for image in images] + return [rescale(image, scale=self.scale) for image in images] # type: ignore[arg-type] class PILtoNDarray(Transform): @@ -190,7 +190,7 @@ def __init__( self.max_size = max_size self.resample = resample - def __call__(self, images: list[NumpyArray]) -> list[list[NumpyArray]]: + def __call__(self, images: list[NumpyArray]) -> list[list[NumpyArray]]: # type: ignore[override] result = [] for image in images: @@ -256,7 +256,7 @@ def __init__( self.size = size self.resample = resample - def __call__(self, images: list[NumpyArray]) -> list[list[NumpyArray]]: + def __call__(self, images: list[NumpyArray]) -> list[list[NumpyArray]]: # type: ignore[override] return [ [ resize_ndarray( diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index d975f510..210c8084 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ 
b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -165,7 +165,7 @@ def _preprocess_onnx_text_input( Iterable[NumpyArray]: Post-processed output as NumPy arrays. """ batch_size, seq_length = onnx_input["input_ids"].shape - empty_image_placeholder: NumpyArray = np.zeros( + empty_image_placeholder: NumpyArray = np.zeros( # type: ignore[type-var] (batch_size, seq_length, 3, self.image_size, self.image_size), dtype=np.float32 ) onnx_input["pixel_values"] = empty_image_placeholder @@ -283,7 +283,7 @@ def _create_single_image_prompt_string(self) -> str: return ( "" + "" - + "" * self.image_seq_len + + "" * self.image_seq_len # type: ignore[operator] + "" ) @@ -296,7 +296,7 @@ def _create_split_image_prompt_string(self, rows: int, cols: int) -> str: text_split_images += ( "" + f"" - + "" * self.image_seq_len + + "" * self.image_seq_len # type: ignore[operator] ) text_split_images += "\n" @@ -304,7 +304,7 @@ def _create_split_image_prompt_string(self, rows: int, cols: int) -> str: text_split_images += ( "\n" + "" - + "" * self.image_seq_len + + "" * self.image_seq_len # type: ignore[operator] + "" ) From bf8931721d3f80f527ec8cc80f485671eab3c068 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Mon, 8 Dec 2025 13:32:56 +0100 Subject: [PATCH 10/24] mypy fixes --- fastembed/late_interaction_multimodal/colmodernvbert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index 210c8084..3051e962 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -165,7 +165,7 @@ def _preprocess_onnx_text_input( Iterable[NumpyArray]: Post-processed output as NumPy arrays. """ batch_size, seq_length = onnx_input["input_ids"].shape - empty_image_placeholder: NumpyArray = np.zeros( # type: ignore[type-var] + empty_image_placeholder: NumpyArray = np.zeros( # type: ignore[type-var,arg-type] (batch_size, seq_length, 3, self.image_size, self.image_size), dtype=np.float32 ) onnx_input["pixel_values"] = empty_image_placeholder From 39c7211d7a141cabd300c0b6c8b96f60e2035122 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Mon, 8 Dec 2025 13:34:53 +0100 Subject: [PATCH 11/24] mypy fixes --- fastembed/late_interaction_multimodal/colmodernvbert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index 3051e962..2d030387 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -165,7 +165,7 @@ def _preprocess_onnx_text_input( Iterable[NumpyArray]: Post-processed output as NumPy arrays. 
""" batch_size, seq_length = onnx_input["input_ids"].shape - empty_image_placeholder: NumpyArray = np.zeros( # type: ignore[type-var,arg-type] + empty_image_placeholder: NumpyArray = np.zeros( # type: ignore[type-var,arg-type,assignment] (batch_size, seq_length, 3, self.image_size, self.image_size), dtype=np.float32 ) onnx_input["pixel_values"] = empty_image_placeholder From 889a95bf3110be781da8bd48dc0c92c15f44b8ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Mon, 8 Dec 2025 13:37:08 +0100 Subject: [PATCH 12/24] mypy fixes --- fastembed/late_interaction_multimodal/colmodernvbert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index 2d030387..41048011 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -165,8 +165,8 @@ def _preprocess_onnx_text_input( Iterable[NumpyArray]: Post-processed output as NumPy arrays. """ batch_size, seq_length = onnx_input["input_ids"].shape - empty_image_placeholder: NumpyArray = np.zeros( # type: ignore[type-var,arg-type,assignment] - (batch_size, seq_length, 3, self.image_size, self.image_size), dtype=np.float32 + empty_image_placeholder: NumpyArray = np.zeros( + (batch_size, seq_length, 3, self.image_size, self.image_size), dtype=np.float32 # type: ignore[type-var,arg-type,assignment] ) onnx_input["pixel_values"] = empty_image_placeholder return onnx_input From 144d8671b4328998321d5d87f9deee02e3776634 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Tue, 9 Dec 2025 12:05:50 +0100 Subject: [PATCH 13/24] Fix typo in the class name --- fastembed/late_interaction_multimodal/colmodernvbert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index 41048011..890805c1 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -423,7 +423,7 @@ def _get_text_worker_class(cls) -> Type[TextEmbeddingWorker[NumpyArray]]: @classmethod def _get_image_worker_class(cls) -> Type[ImageEmbeddingWorker[NumpyArray]]: - return ColModernVBERTmageEmbeddingWorker + return ColModernVBERTImageEmbeddingWorker class ColModernVBERTTextEmbeddingWorker(TextEmbeddingWorker[NumpyArray]): @@ -436,7 +436,7 @@ def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColM ) -class ColModernVBERTmageEmbeddingWorker(ImageEmbeddingWorker[NumpyArray]): +class ColModernVBERTImageEmbeddingWorker(ImageEmbeddingWorker[NumpyArray]): def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColModernVBERT: return ColModernVBERT( model_name=model_name, From 23768f1b679e417a9652504835492502f3ea583d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Tue, 9 Dec 2025 13:19:58 +0100 Subject: [PATCH 14/24] Add processor_config.json to additional files --- fastembed/late_interaction_multimodal/colmodernvbert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index 890805c1..0a1f5d29 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -26,7 +26,7 @@ license="mit", 
size_in_GB=1.0, sources=ModelSource(hf="Qdrant/colmodernvbert"), - additional_files=["model.onnx_data"], + additional_files=["processor_config.json"], model_file="model.onnx", ), ] From 7bea5328d7ab5559f72d0e939cea85b2243f5c31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Tue, 9 Dec 2025 15:39:59 +0100 Subject: [PATCH 15/24] Fix mypy errors --- .../onnx_multimodal_model.py | 142 +++++++++++++----- tests/test_late_interaction_multimodal.py | 11 +- 2 files changed, 111 insertions(+), 42 deletions(-) diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py index cbfc09e8..6683a077 100644 --- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py +++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py @@ -178,42 +178,15 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu assert self.processor is not None, "Processor is not initialized" processed = self.processor(image_files) - # Handle nested structure (with image splitting) + # Dispatch to appropriate handler based on structure. + # ColModernVBERT processors divides the original image into + # subimages and processes them separately. if isinstance(processed[0], list): - # processed = [[img1_patches], [img2_patches], ...] - # Need shape: (batch_size, max_patches, C, H, W) - - patch_counts = [len(patches) for patches in processed] - max_patches = max(patch_counts) - - # Get dimensions from first patch - C, H, W = processed[0][0].shape - - # Create padded array - batch_size = len(processed) - encoded = np.zeros((batch_size, max_patches, C, H, W), dtype=processed[0][0].dtype) - - # Create attention mask (1 for real patches, 0 for padding) - attention_mask = np.zeros((batch_size, max_patches), dtype=np.int64) - - # Fill in patches and attention mask - for i, patches in enumerate(processed): - for j, patch in enumerate(patches): - encoded[i, j] = patch - attention_mask[i, j] = 1 - - # Track actual patch counts for later use - metadata = {"patch_counts": patch_counts} + encoded, attention_mask, metadata = self._process_nested_patches(processed) else: - # Flat structure (no splitting) - still need batch dimension - # Shape: (batch_size, 1, C, H, W) - encoded = np.array(processed) - if len(encoded.shape) == 4: # (batch_size, C, H, W) - encoded = encoded[:, np.newaxis, ...] # Add num_patches=1 dimension - - # All patches are real (no padding) - attention_mask = np.ones((len(images), encoded.shape[1]), dtype=np.int64) - metadata = {"patch_counts": [encoded.shape[1]] * len(images)} + encoded, attention_mask, metadata = self._process_flat_images( + processed, len(images) # type: ignore[arg-type] + ) onnx_input = {"pixel_values": encoded, "attention_mask": attention_mask} onnx_input = self._preprocess_onnx_image_input(onnx_input, **kwargs) @@ -221,10 +194,109 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu return OnnxOutputContext( model_output=model_output[0], - attention_mask=attention_mask, + attention_mask=attention_mask, # type: ignore[arg-type] metadata=metadata, ) + def _process_nested_patches( + self, processed: list[list[NumpyArray]] + ) -> tuple[NumpyArray, NumpyArray, dict[str, Any]]: + """ + Process nested image patches (from ImageSplitter). + + Args: + processed: List of patch lists, one per image [[img1_patches], [img2_patches], ...] 
+ + Returns: + tuple: (encoded array, attention_mask, metadata) + - encoded: (batch_size, max_patches, C, H, W) + - attention_mask: (batch_size, max_patches) with 1 for real patches, 0 for padding + - metadata: Dict with 'patch_counts' key + """ + patch_counts = [len(patches) for patches in processed] + max_patches = max(patch_counts) + + # Get dimensions from first patch + C, H, W = processed[0][0].shape + batch_size = len(processed) + + # Create padded array + encoded = np.zeros((batch_size, max_patches, C, H, W), dtype=processed[0][0].dtype) + + # Create attention mask (1 for real patches, 0 for padding) + attention_mask = np.zeros((batch_size, max_patches), dtype=np.int64) + + # Fill in patches and attention mask + for i, patches in enumerate(processed): + for j, patch in enumerate(patches): + encoded[i, j] = patch + attention_mask[i, j] = 1 + + metadata = {"patch_counts": patch_counts} + return encoded, attention_mask, metadata + + def _process_flat_images( + self, processed: list[NumpyArray], num_images: int + ) -> tuple[NumpyArray, NumpyArray, dict[str, Any]]: + """ + Process flat image arrays (from standard processors like SiglipImageProcessor). + + For models expecting 5D input (Idefics3-based), adds patch dimension. + For models expecting 4D input, keeps original shape. + + Args: + processed: List of image arrays + num_images: Number of images being processed + + Returns: + tuple: (encoded array, attention_mask, metadata) + - encoded: (batch_size, C, H, W) for 4D models OR (batch_size, 1, C, H, W) for 5D models + - attention_mask: (batch_size, 1) with all ones + - metadata: Dict with 'patch_counts' key + """ + encoded = np.array(processed) + + # Check if model needs patch dimension based on ONNX signature + if len(encoded.shape) == 4 and self._needs_patch_dimension(): + # Add patch dimension for Idefics3-based models: (batch, 1, C, H, W) + encoded = encoded[:, np.newaxis, ...] + + # Determine attention mask shape based on final tensor shape + if len(encoded.shape) == 5: + # 5D tensor: attention_mask shape is (batch, num_patches) + attention_mask = np.ones((num_images, encoded.shape[1]), dtype=np.int64) + metadata = {"patch_counts": [encoded.shape[1]] * num_images} + else: + # 4D tensor: attention_mask shape is (batch, 1) + attention_mask = np.ones((num_images, 1), dtype=np.int64) + metadata = {"patch_counts": [1] * num_images} + + return encoded, attention_mask, metadata # type: ignore[return-value] + + def _needs_patch_dimension(self) -> bool: + """ + Determine if this model needs the patch dimension by checking ONNX input shape. + + Idefics3-based models (like ColModernVBERT) need 5D tensors (batch_size, patch_count, C, H, W). + Earlier models (like ColPali v1.3) need 4D tensors (batch_size, C, H, W). 
+ + Returns: + bool: True if pixel_values input has 5 dimensions, False if 4 dimensions + """ + if not hasattr(self, "model") or self.model is None: + return False + + # Get pixel_values input metadata + for input_meta in self.model.get_inputs(): + if input_meta.name == "pixel_values": + # input_meta.shape is a list like + # ['batch_size', 'sequence_length', 'num_channels', 'height', 'width'] + # or ['batch_size', 'num_channels', 'height', 'width'] + return len(input_meta.shape) == 5 + + # Default to False for backward compatibility + return False + def _embed_images( self, model_name: str, diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py index 8aa3a44e..888586d3 100644 --- a/tests/test_late_interaction_multimodal.py +++ b/tests/test_late_interaction_multimodal.py @@ -48,13 +48,10 @@ ), "Qdrant/colmodernvbert": np.array( [ - [0.05, 0.0656, 0.0403, 0.1498, 0.1842, 0.0263, -0.1871], - [-0.0566, -0.1403, 0.0065, -0.0285, 0.0903, -0.0149, 0.1069], - [-0.1015, -0.0072, 0.0908, -0.0824, -0.0185, -0.0097, -0.0046], - [-0.1233, -0.1081, -0.0234, -0.0033, 0.0598, 0.0993, 0.0985], - [-0.0705, -0.1312, -0.0649, 0.0151, 0.0746, 0.0765, 0.1482], - [0.0053, -0.1384, -0.0584, -0.0272, 0.1301, 0.0508, 0.1796], - [0.0092, -0.1438, -0.0306, -0.0369, 0.1172, 0.037, 0.1334], + [0.0541, 0.0677, 0.0392, 0.1494, 0.1855, 0.0275, -0.1835, -0.1025, -0.1204, -0.0835], + [-0.0515, -0.1328, 0.0298, -0.0574, 0.0829, -0.0836, 0.0888, 0.0138, 0.0741, 0.0293], + [-0.1114, -0.0506, 0.0666, -0.1064, -0.0229, -0.0486, -0.007, 0.0932, 0.0054, 0.1113], + [0.2317, -0.0518, 0.0248, -0.0075, -0.078, 0.2073, -0.0912, -0.0622, -0.0203, 0.093] ] ), } From 96eb50bcbcf27c2bb603eb11f67fee86463ab18f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Tue, 9 Dec 2025 15:39:59 +0100 Subject: [PATCH 16/24] Refactor onnx_embed_image --- .../late_interaction_multimodal/onnx_multimodal_model.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py index 6683a077..f4299421 100644 --- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py +++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py @@ -184,9 +184,7 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu if isinstance(processed[0], list): encoded, attention_mask, metadata = self._process_nested_patches(processed) else: - encoded, attention_mask, metadata = self._process_flat_images( - processed, len(images) # type: ignore[arg-type] - ) + encoded, attention_mask, metadata = self._process_flat_images(processed, len(images)) onnx_input = {"pixel_values": encoded, "attention_mask": attention_mask} onnx_input = self._preprocess_onnx_image_input(onnx_input, **kwargs) @@ -194,7 +192,7 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu return OnnxOutputContext( model_output=model_output[0], - attention_mask=attention_mask, # type: ignore[arg-type] + attention_mask=attention_mask, metadata=metadata, ) @@ -271,7 +269,7 @@ def _process_flat_images( attention_mask = np.ones((num_images, 1), dtype=np.int64) metadata = {"patch_counts": [1] * num_images} - return encoded, attention_mask, metadata # type: ignore[return-value] + return encoded, attention_mask, metadata def _needs_patch_dimension(self) -> bool: """ From cf4100c9dedb59161beca7b4cdc9582860e059da Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Tue, 9 Dec 2025 16:11:03 +0100 Subject: [PATCH 17/24] Fix mypy errors --- .../onnx_multimodal_model.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py index f4299421..06af0586 100644 --- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py +++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py @@ -184,7 +184,10 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu if isinstance(processed[0], list): encoded, attention_mask, metadata = self._process_nested_patches(processed) else: - encoded, attention_mask, metadata = self._process_flat_images(processed, len(images)) + encoded, attention_mask, metadata = self._process_flat_images( + processed, # type: ignore[arg-type] + len(images), + ) onnx_input = {"pixel_values": encoded, "attention_mask": attention_mask} onnx_input = self._preprocess_onnx_image_input(onnx_input, **kwargs) @@ -192,7 +195,7 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu return OnnxOutputContext( model_output=model_output[0], - attention_mask=attention_mask, + attention_mask=attention_mask, # type: ignore[arg-type] metadata=metadata, ) @@ -231,7 +234,7 @@ def _process_nested_patches( attention_mask[i, j] = 1 metadata = {"patch_counts": patch_counts} - return encoded, attention_mask, metadata + return encoded, attention_mask, metadata # type: ignore[return-value] def _process_flat_images( self, processed: list[NumpyArray], num_images: int @@ -269,7 +272,7 @@ def _process_flat_images( attention_mask = np.ones((num_images, 1), dtype=np.int64) metadata = {"patch_counts": [1] * num_images} - return encoded, attention_mask, metadata + return encoded, attention_mask, metadata # type: ignore[return-value] def _needs_patch_dimension(self) -> bool: """ From 5a6d8841f3dcac88b39df000403237dbfc7fe459 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Fri, 2 Jan 2026 17:37:01 +0100 Subject: [PATCH 18/24] fix: colmodernvbert tests and query processing --- .../colmodernvbert.py | 5 +++- tests/test_late_interaction_multimodal.py | 25 +++++++++++-------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index 0a1f5d29..3ef46f55 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -43,6 +43,7 @@ class ColModernVBERT(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel VISUAL_PROMPT_PREFIX = ( "<|begin_of_text|>User:Describe the image.\nAssistant:" ) + QUERY_AUGMENTATION_TOKEN = "" def __init__( self, @@ -187,7 +188,9 @@ def _post_process_onnx_text_output( return output.model_output def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]: - encoded = self.tokenizer.encode_batch(documents) # type: ignore[union-attr] + # Add query augmentation tokens (matching process_queries logic from colpali-engine) + augmented_queries = [doc + self.QUERY_AUGMENTATION_TOKEN * 10 for doc in documents] + encoded = self.tokenizer.encode_batch(augmented_queries) # type: ignore[union-attr] return encoded def _preprocess_onnx_image_input( diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py index 888586d3..ac4f9eeb 100644 --- 
a/tests/test_late_interaction_multimodal.py +++ b/tests/test_late_interaction_multimodal.py @@ -23,13 +23,13 @@ ), "Qdrant/colmodernvbert": np.array( [ - [0.2256, -0.0503, 0.0254, -0.011, -0.0786, 0.2152, -0.0961], - [-0.0028, -0.0484, -0.0724, -0.0724, -0.0977, 0.0308, -0.0236], - [0.0035, -0.1075, -0.0877, -0.0207, -0.0828, -0.0294, -0.0253], - [0.0021, -0.0797, -0.0605, -0.0008, -0.0837, 0.0015, -0.0846], - [-0.0473, -0.0594, -0.0553, -0.0014, -0.0712, 0.0158, -0.0546], - [-0.1009, -0.082, -0.0684, -0.1385, -0.0469, -0.0606, -0.0323], - [-0.0624, 0.006, -0.0498, -0.0127, -0.1115, 0.0076, -0.0888], + [0.11614, -0.15793, -0.11194, 0.0688, 0.08001, 0.10575, -0.07871], + [0.10094, -0.13301, -0.12069, 0.10932, 0.04645, 0.09884, 0.04048], + [0.13106, -0.18613, -0.13469, 0.10566, 0.03659, 0.07712, -0.03916], + [0.09754, -0.09596, -0.04839, 0.14991, 0.05692, 0.10569, -0.08349], + [0.02576, -0.15651, -0.09977, 0.09707, 0.13412, 0.09994, -0.09931], + [-0.06741, -0.1787, -0.19677, -0.07618, 0.13102, -0.02131, -0.02437], + [-0.02776, -0.10187, -0.13793, 0.03835, 0.04766, 0.04701, -0.15635], ] ), } @@ -48,10 +48,13 @@ ), "Qdrant/colmodernvbert": np.array( [ - [0.0541, 0.0677, 0.0392, 0.1494, 0.1855, 0.0275, -0.1835, -0.1025, -0.1204, -0.0835], - [-0.0515, -0.1328, 0.0298, -0.0574, 0.0829, -0.0836, 0.0888, 0.0138, 0.0741, 0.0293], - [-0.1114, -0.0506, 0.0666, -0.1064, -0.0229, -0.0486, -0.007, 0.0932, 0.0054, 0.1113], - [0.2317, -0.0518, 0.0248, -0.0075, -0.078, 0.2073, -0.0912, -0.0622, -0.0203, 0.093] + [0.05, 0.06557, 0.04026, 0.14981, 0.1842, 0.0263, -0.18706], + [-0.05664, -0.14028, 0.00649, -0.02849, 0.09034, -0.01494, 0.10693], + [-0.10147, -0.00716, 0.09084, -0.08236, -0.01849, -0.00972, -0.00461], + [-0.1233, -0.10814, -0.02337, -0.00329, 0.05984, 0.09934, 0.09846], + [-0.07053, -0.13119, -0.06487, 0.01508, 0.07459, 0.07655, 0.14821], + [0.00526, -0.13842, -0.05837, -0.02721, 0.13009, 0.05076, 0.17962], + [0.00924, -0.14383, -0.03057, -0.03691, 0.11718, 0.037, 0.13344], ] ), } From 2dc7de8a0c29a49c13dd0b9787f6bc49532486a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Fri, 2 Jan 2026 21:13:47 +0100 Subject: [PATCH 19/24] fix: remove Union references --- fastembed/image/transform/functional.py | 4 ++-- fastembed/image/transform/operators.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fastembed/image/transform/functional.py b/fastembed/image/transform/functional.py index 86dfbe08..9d9e2197 100644 --- a/fastembed/image/transform/functional.py +++ b/fastembed/image/transform/functional.py @@ -150,7 +150,7 @@ def pad2square( def resize_longest_edge( image: Image.Image, max_size: int, - resample: Union[int, Image.Resampling] = Image.Resampling.LANCZOS, + resample: int | Image.Resampling = Image.Resampling.LANCZOS, ) -> Image.Image: height, width = image.height, image.width aspect_ratio = width / height @@ -192,7 +192,7 @@ def crop_ndarray( def resize_ndarray( image: NumpyArray, size: tuple[int, int], - resample: Union[int, Image.Resampling] = Image.Resampling.LANCZOS, + resample: int | Image.Resampling = Image.Resampling.LANCZOS, channel_first: bool = True, ) -> NumpyArray: # Convert to PIL-friendly format (H, W, C) diff --git a/fastembed/image/transform/operators.py b/fastembed/image/transform/operators.py index 3ee2d873..e6ba4d95 100644 --- a/fastembed/image/transform/operators.py +++ b/fastembed/image/transform/operators.py @@ -42,8 +42,8 @@ def __init__(self, mean: float | list[float], std: float | list[float]): self.std = std def 
__call__( # type: ignore[override] - self, images: Union[list[NumpyArray], list[list[NumpyArray]]] - ) -> Union[list[NumpyArray], list[list[NumpyArray]]]: + self, images: list[NumpyArray] | list[list[NumpyArray]] + ) -> list[NumpyArray] | list[list[NumpyArray]]: if images and isinstance(images[0], list): # Nested structure from ImageSplitter return [ @@ -73,8 +73,8 @@ def __init__(self, scale: float = 1 / 255): self.scale = scale def __call__( # type: ignore[override] - self, images: Union[list[NumpyArray], list[list[NumpyArray]]] - ) -> Union[list[NumpyArray], list[list[NumpyArray]]]: + self, images: list[NumpyArray] | list[list[NumpyArray]] + ) -> list[NumpyArray] | list[list[NumpyArray]]: if images and isinstance(images[0], list): # Nested structure from ImageSplitter return [ From 46453a26a918a6389a078327807af97ebf29855c Mon Sep 17 00:00:00 2001 From: George Panchuk Date: Thu, 8 Jan 2026 17:12:23 +0700 Subject: [PATCH 20/24] fix: fix exit stack, update tests, implement token count --- fastembed/image/onnx_image_model.py | 6 +- .../colmodernvbert.py | 31 +++- .../onnx_multimodal_model.py | 6 +- tests/test_late_interaction_multimodal.py | 161 ++++++++++-------- 4 files changed, 120 insertions(+), 84 deletions(-) diff --git a/fastembed/image/onnx_image_model.py b/fastembed/image/onnx_image_model.py index 86326da9..deddcf73 100644 --- a/fastembed/image/onnx_image_model.py +++ b/fastembed/image/onnx_image_model.py @@ -76,9 +76,11 @@ def _build_onnx_input(self, encoded: NumpyArray) -> dict[str, NumpyArray]: return {input_name: encoded} def onnx_embed(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutputContext: - with contextlib.ExitStack(): + with contextlib.ExitStack() as stack: image_files = [ - Image.open(image) if not isinstance(image, Image.Image) else image + stack.enter_context(Image.open(image)) + if not isinstance(image, Image.Image) + else image for image in images ] assert self.processor is not None, "Processor is not initialized" diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index 3ef46f55..7ab84f22 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -1,4 +1,4 @@ -from typing import Any, Iterable, Type, Union, Optional, Sequence +from typing import Any, Iterable, Type, Optional, Sequence import json import numpy as np @@ -8,7 +8,7 @@ from fastembed.common.model_description import DenseModelDescription, ModelSource from fastembed.common.onnx_model import OnnxOutputContext from fastembed.common.types import NumpyArray, OnnxProvider -from fastembed.common.utils import define_cache_dir +from fastembed.common.utils import define_cache_dir, iter_batch from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import ( LateInteractionMultimodalEmbeddingBase, ) @@ -167,7 +167,8 @@ def _preprocess_onnx_text_input( """ batch_size, seq_length = onnx_input["input_ids"].shape empty_image_placeholder: NumpyArray = np.zeros( - (batch_size, seq_length, 3, self.image_size, self.image_size), dtype=np.float32 # type: ignore[type-var,arg-type,assignment] + (batch_size, seq_length, 3, self.image_size, self.image_size), + dtype=np.float32, # type: ignore[type-var,arg-type,assignment] ) onnx_input["pixel_values"] = empty_image_placeholder return onnx_input @@ -193,6 +194,23 @@ def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]: encoded = self.tokenizer.encode_batch(augmented_queries) # 
type: ignore[union-attr] return encoded + def token_count( + self, + texts: str | Iterable[str], + batch_size: int = 1024, + include_extension: bool = False, + **kwargs: Any, + ) -> int: + if not hasattr(self, "model") or self.model is None: + self.load_onnx_model() # loads the tokenizer as well + token_num = 0 + texts = [texts] if isinstance(texts, str) else texts + assert self.tokenizer is not None + tokenize_func = self.tokenize if include_extension else self.tokenizer.encode_batch + for batch in iter_batch(texts, batch_size): + token_num += sum([sum(encoding.attention_mask) for encoding in tokenize_func(batch)]) + return token_num + def _preprocess_onnx_image_input( self, onnx_input: dict[str, np.ndarray], **kwargs: Any ) -> dict[str, NumpyArray]: @@ -258,7 +276,8 @@ def _preprocess_onnx_image_input( onnx_input["attention_mask"] = attention_mask return onnx_input - def _compute_rows_cols_from_patches(self, patch_count: int) -> tuple[int, int]: + @staticmethod + def _compute_rows_cols_from_patches(patch_count: int) -> tuple[int, int]: if patch_count <= 1: return 0, 0 @@ -350,7 +369,7 @@ def _post_process_onnx_image_output( def embed_text( self, - documents: Union[str, Iterable[str]], + documents: str | Iterable[str], batch_size: int = 256, parallel: Optional[int] = None, **kwargs: Any, @@ -386,7 +405,7 @@ def embed_text( def embed_image( self, - images: Union[ImageInput, Iterable[ImageInput]], + images: ImageInput | Iterable[ImageInput], batch_size: int = 16, parallel: Optional[int] = None, **kwargs: Any, diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py index 06af0586..bfe81fdc 100644 --- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py +++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py @@ -170,9 +170,11 @@ def _embed_documents( yield from self._post_process_onnx_text_output(batch) # type: ignore def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutputContext: - with contextlib.ExitStack(): + with contextlib.ExitStack() as stack: image_files = [ - Image.open(image) if not isinstance(image, Image.Image) else image + stack.enter_context(Image.open(image)) + if not isinstance(image, Image.Image) + else image for image in images ] assert self.processor is not None, "Processor is not initialized" diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py index ac4f9eeb..29d550fa 100644 --- a/tests/test_late_interaction_multimodal.py +++ b/tests/test_late_interaction_multimodal.py @@ -1,4 +1,5 @@ import os +from contextlib import contextmanager import pytest from PIL import Image @@ -6,21 +7,21 @@ from fastembed import LateInteractionMultimodalEmbedding from tests.config import TEST_MISC_DIR - +from tests.utils import delete_model_cache # vectors are abridged and rounded for brevity CANONICAL_IMAGE_VALUES = { - "Qdrant/colpali-v1.3-fp16": np.array( - [ - [-0.0345, -0.022, 0.0567, -0.0518, -0.0782, 0.1714, -0.1738], - [-0.1181, -0.099, 0.0268, 0.0774, 0.0228, 0.0563, -0.1021], - [-0.117, -0.0683, 0.0371, 0.0921, 0.0107, 0.0659, -0.0666], - [-0.1393, -0.0948, 0.037, 0.0951, -0.0126, 0.0678, -0.087], - [-0.0957, -0.081, 0.0404, 0.052, 0.0409, 0.0335, -0.064], - [-0.0626, -0.0445, 0.056, 0.0592, -0.0229, 0.0409, -0.0301], - [-0.1299, -0.0691, 0.1097, 0.0728, 0.0123, 0.0519, 0.0122], - ] - ), + # "Qdrant/colpali-v1.3-fp16": np.array( + # [ + # [-0.0345, -0.022, 0.0567, -0.0518, -0.0782, 0.1714, -0.1738], + # 
[-0.1181, -0.099, 0.0268, 0.0774, 0.0228, 0.0563, -0.1021], + # [-0.117, -0.0683, 0.0371, 0.0921, 0.0107, 0.0659, -0.0666], + # [-0.1393, -0.0948, 0.037, 0.0951, -0.0126, 0.0678, -0.087], + # [-0.0957, -0.081, 0.0404, 0.052, 0.0409, 0.0335, -0.064], + # [-0.0626, -0.0445, 0.056, 0.0592, -0.0229, 0.0409, -0.0301], + # [-0.1299, -0.0691, 0.1097, 0.0728, 0.0123, 0.0519, 0.0122], + # ] + # ), "Qdrant/colmodernvbert": np.array( [ [0.11614, -0.15793, -0.11194, 0.0688, 0.08001, 0.10575, -0.07871], @@ -35,17 +36,17 @@ } CANONICAL_QUERY_VALUES = { - "Qdrant/colpali-v1.3-fp16": np.array( - [ - [-0.0023, 0.1477, 0.1594, 0.046, -0.0196, 0.0554, 0.1567], - [-0.0139, -0.0057, 0.0932, 0.0052, -0.0678, 0.0131, 0.0537], - [0.0054, 0.0364, 0.2078, -0.074, 0.0355, 0.061, 0.1593], - [-0.0076, -0.0154, 0.2266, 0.0103, 0.0089, -0.024, 0.098], - [-0.0274, 0.0098, 0.2106, -0.0634, 0.0616, -0.0021, 0.0708], - [0.0074, 0.0025, 0.1631, -0.0802, 0.0418, -0.0219, 0.1022], - [-0.0165, -0.0106, 0.1672, -0.0768, 0.0389, -0.0038, 0.1137], - ] - ), + # "Qdrant/colpali-v1.3-fp16": np.array( + # [ + # [-0.0023, 0.1477, 0.1594, 0.046, -0.0196, 0.0554, 0.1567], + # [-0.0139, -0.0057, 0.0932, 0.0052, -0.0678, 0.0131, 0.0537], + # [0.0054, 0.0364, 0.2078, -0.074, 0.0355, 0.061, 0.1593], + # [-0.0076, -0.0154, 0.2266, 0.0103, 0.0089, -0.024, 0.098], + # [-0.0274, 0.0098, 0.2106, -0.0634, 0.0616, -0.0021, 0.0708], + # [0.0074, 0.0025, 0.1631, -0.0802, 0.0418, -0.0219, 0.1022], + # [-0.0165, -0.0106, 0.1672, -0.0768, 0.0389, -0.0038, 0.1137], + # ] + # ), "Qdrant/colmodernvbert": np.array( [ [0.05, 0.06557, 0.04026, 0.14981, 0.1842, 0.0263, -0.18706], @@ -66,43 +67,68 @@ Image.open((TEST_MISC_DIR / "image.jpeg")), ] +MODELS_TO_CACHE = ("Qdrant/colmodernvbert",) -def test_batch_embedding(): - if os.getenv("CI"): - pytest.skip("Colpali is too large to test in CI") - for model_name, expected_result in CANONICAL_IMAGE_VALUES.items(): - print("evaluating", model_name) - model = LateInteractionMultimodalEmbedding(model_name=model_name) - result = list(model.embed_image(images, batch_size=2)) +@pytest.fixture(scope="module") +def model_cache(): + is_ci = os.getenv("CI") + cache = {} - for value in result: - token_num, abridged_dim = expected_result.shape - assert np.allclose(value[:token_num, :abridged_dim], expected_result, atol=2e-3) + @contextmanager + def get_model(model_name: str): + lowercase_model_name = model_name.lower() + if lowercase_model_name not in cache: + cache[lowercase_model_name] = LateInteractionMultimodalEmbedding(lowercase_model_name) + yield cache[lowercase_model_name] + if lowercase_model_name not in MODELS_TO_CACHE: + model_inst = cache.pop(lowercase_model_name) + if is_ci: + delete_model_cache(model_inst.model._model_dir) + del model_inst + yield get_model -def test_single_embedding(): - if os.getenv("CI"): - pytest.skip("Colpali is too large to test in CI") + if is_ci: + for name, model in cache.items(): + delete_model_cache(model.model._model_dir) + cache.clear() + +def test_batch_embedding(model_cache): for model_name, expected_result in CANONICAL_IMAGE_VALUES.items(): + if model_name.lower() == "Qdrant/colpali-v1.3-fp16".lower() and os.getenv("CI"): + continue # colpali is too large for ci + print("evaluating", model_name) - model = LateInteractionMultimodalEmbedding(model_name=model_name) - result = next(iter(model.embed_image(images, batch_size=6))) - token_num, abridged_dim = expected_result.shape - assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3) + with 
model_cache(model_name) as model: + result = list(model.embed_image(images, batch_size=2)) + for value in result: + token_num, abridged_dim = expected_result.shape + assert np.allclose(value[:token_num, :abridged_dim], expected_result, atol=2e-3) + + +def test_single_embedding(model_cache): + for model_name, expected_result in CANONICAL_IMAGE_VALUES.items(): + if model_name.lower() == "Qdrant/colpali-v1.3-fp16".lower() and os.getenv("CI"): + continue # colpali is too large for ci + print("evaluating", model_name) + with model_cache(model_name) as model: + result = next(iter(model.embed_image(images, batch_size=6))) + token_num, abridged_dim = expected_result.shape + assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3) -def test_single_embedding_query(): - if os.getenv("CI"): - pytest.skip("Colpali is too large to test in CI") +def test_single_embedding_query(model_cache): for model_name, expected_result in CANONICAL_QUERY_VALUES.items(): + if model_name.lower() == "Qdrant/colpali-v1.3-fp16".lower() and os.getenv("CI"): + continue # colpali is too large for ci print("evaluating", model_name) - model = LateInteractionMultimodalEmbedding(model_name=model_name) - result = next(iter(model.embed_text(queries))) - token_num, abridged_dim = expected_result.shape - assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3) + with model_cache(model_name) as model: + result = next(iter(model.embed_text(queries))) + token_num, abridged_dim = expected_result.shape + assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3) def test_get_embedding_size(): @@ -117,35 +143,22 @@ def test_get_embedding_size(): def test_embedding_size(): - if os.getenv("CI"): - pytest.skip("Colpali is too large to test in CI") - model_name = "Qdrant/colpali-v1.3-fp16" - model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True) - assert model.embedding_size == 128 - - model_name = "Qdrant/ColPali-v1.3-fp16" - model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True) - assert model.embedding_size == 128 - model_name = "Qdrant/colmodernvbert" model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True) assert model.embedding_size == 128 -def test_token_count() -> None: - if os.getenv("CI"): - pytest.skip("Colpali is too large to test in CI") - model_name = "Qdrant/colpali-v1.3-fp16" - model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True) - - documents = ["short doc", "it is a long document to check attention mask for paddings"] - short_doc_token_count = model.token_count(documents[0]) - long_doc_token_count = model.token_count(documents[1]) - documents_token_count = model.token_count(documents) - assert short_doc_token_count + long_doc_token_count == documents_token_count - assert short_doc_token_count + long_doc_token_count == model.token_count( - documents, batch_size=1 - ) - assert short_doc_token_count + long_doc_token_count < model.token_count( - documents, include_extension=True - ) +def test_token_count(model_cache) -> None: + model_name = "Qdrant/colmodernvbert" + with model_cache(model_name) as model: + documents = ["short doc", "it is a long document to check attention mask for paddings"] + short_doc_token_count = model.token_count(documents[0]) + long_doc_token_count = model.token_count(documents[1]) + documents_token_count = model.token_count(documents) + assert short_doc_token_count + long_doc_token_count == documents_token_count + assert 
short_doc_token_count + long_doc_token_count == model.token_count( + documents, batch_size=1 + ) + assert short_doc_token_count + long_doc_token_count < model.token_count( + documents, include_extension=True + ) From 8f6f057b08f6d1b6761003dc01e869333f87a684 Mon Sep 17 00:00:00 2001 From: George Panchuk Date: Fri, 9 Jan 2026 16:33:20 +0700 Subject: [PATCH 21/24] fix: uncomment colpali in tests --- tests/test_late_interaction_multimodal.py | 44 +++++++++++------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py index 29d550fa..248352d4 100644 --- a/tests/test_late_interaction_multimodal.py +++ b/tests/test_late_interaction_multimodal.py @@ -11,17 +11,17 @@ # vectors are abridged and rounded for brevity CANONICAL_IMAGE_VALUES = { - # "Qdrant/colpali-v1.3-fp16": np.array( - # [ - # [-0.0345, -0.022, 0.0567, -0.0518, -0.0782, 0.1714, -0.1738], - # [-0.1181, -0.099, 0.0268, 0.0774, 0.0228, 0.0563, -0.1021], - # [-0.117, -0.0683, 0.0371, 0.0921, 0.0107, 0.0659, -0.0666], - # [-0.1393, -0.0948, 0.037, 0.0951, -0.0126, 0.0678, -0.087], - # [-0.0957, -0.081, 0.0404, 0.052, 0.0409, 0.0335, -0.064], - # [-0.0626, -0.0445, 0.056, 0.0592, -0.0229, 0.0409, -0.0301], - # [-0.1299, -0.0691, 0.1097, 0.0728, 0.0123, 0.0519, 0.0122], - # ] - # ), + "Qdrant/colpali-v1.3-fp16": np.array( + [ + [-0.0345, -0.022, 0.0567, -0.0518, -0.0782, 0.1714, -0.1738], + [-0.1181, -0.099, 0.0268, 0.0774, 0.0228, 0.0563, -0.1021], + [-0.117, -0.0683, 0.0371, 0.0921, 0.0107, 0.0659, -0.0666], + [-0.1393, -0.0948, 0.037, 0.0951, -0.0126, 0.0678, -0.087], + [-0.0957, -0.081, 0.0404, 0.052, 0.0409, 0.0335, -0.064], + [-0.0626, -0.0445, 0.056, 0.0592, -0.0229, 0.0409, -0.0301], + [-0.1299, -0.0691, 0.1097, 0.0728, 0.0123, 0.0519, 0.0122], + ] + ), "Qdrant/colmodernvbert": np.array( [ [0.11614, -0.15793, -0.11194, 0.0688, 0.08001, 0.10575, -0.07871], @@ -36,17 +36,17 @@ } CANONICAL_QUERY_VALUES = { - # "Qdrant/colpali-v1.3-fp16": np.array( - # [ - # [-0.0023, 0.1477, 0.1594, 0.046, -0.0196, 0.0554, 0.1567], - # [-0.0139, -0.0057, 0.0932, 0.0052, -0.0678, 0.0131, 0.0537], - # [0.0054, 0.0364, 0.2078, -0.074, 0.0355, 0.061, 0.1593], - # [-0.0076, -0.0154, 0.2266, 0.0103, 0.0089, -0.024, 0.098], - # [-0.0274, 0.0098, 0.2106, -0.0634, 0.0616, -0.0021, 0.0708], - # [0.0074, 0.0025, 0.1631, -0.0802, 0.0418, -0.0219, 0.1022], - # [-0.0165, -0.0106, 0.1672, -0.0768, 0.0389, -0.0038, 0.1137], - # ] - # ), + "Qdrant/colpali-v1.3-fp16": np.array( + [ + [-0.0023, 0.1477, 0.1594, 0.046, -0.0196, 0.0554, 0.1567], + [-0.0139, -0.0057, 0.0932, 0.0052, -0.0678, 0.0131, 0.0537], + [0.0054, 0.0364, 0.2078, -0.074, 0.0355, 0.061, 0.1593], + [-0.0076, -0.0154, 0.2266, 0.0103, 0.0089, -0.024, 0.098], + [-0.0274, 0.0098, 0.2106, -0.0634, 0.0616, -0.0021, 0.0708], + [0.0074, 0.0025, 0.1631, -0.0802, 0.0418, -0.0219, 0.1022], + [-0.0165, -0.0106, 0.1672, -0.0768, 0.0389, -0.0038, 0.1137], + ] + ), "Qdrant/colmodernvbert": np.array( [ [0.05, 0.06557, 0.04026, 0.14981, 0.1842, 0.0263, -0.18706], From 01965c933b6c11d7b99bb7b4f610d2fc0ba17160 Mon Sep 17 00:00:00 2001 From: George Panchuk Date: Fri, 9 Jan 2026 17:02:00 +0700 Subject: [PATCH 22/24] fix: lowercase models to cache --- tests/test_late_interaction_multimodal.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py index 248352d4..bf400dfa 100644 --- a/tests/test_late_interaction_multimodal.py +++ 
b/tests/test_late_interaction_multimodal.py @@ -67,7 +67,8 @@ Image.open((TEST_MISC_DIR / "image.jpeg")), ] -MODELS_TO_CACHE = ("Qdrant/colmodernvbert",) +_MODELS_TO_CACHE = ("Qdrant/colmodernvbert",) +MODELS_TO_CACHE = (model_name.lower() for model_name in _MODELS_TO_CACHE) @pytest.fixture(scope="module") @@ -90,7 +91,7 @@ def get_model(model_name: str): yield get_model if is_ci: - for name, model in cache.items(): + for _, model in cache.items(): delete_model_cache(model.model._model_dir) cache.clear() From ef9c496dcc06aee1d0058cb9bfc7f48adf12504c Mon Sep 17 00:00:00 2001 From: George Panchuk Date: Fri, 9 Jan 2026 17:23:56 +0700 Subject: [PATCH 23/24] fix: fix models to cache --- tests/test_late_interaction_multimodal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py index bf400dfa..94ae47e7 100644 --- a/tests/test_late_interaction_multimodal.py +++ b/tests/test_late_interaction_multimodal.py @@ -68,7 +68,7 @@ ] _MODELS_TO_CACHE = ("Qdrant/colmodernvbert",) -MODELS_TO_CACHE = (model_name.lower() for model_name in _MODELS_TO_CACHE) +MODELS_TO_CACHE = tuple(model_name.lower() for model_name in _MODELS_TO_CACHE) @pytest.fixture(scope="module") From 8b9f50c32b87dad0f24819e7e94aa8d4c2446342 Mon Sep 17 00:00:00 2001 From: George Panchuk Date: Fri, 9 Jan 2026 18:21:59 +0700 Subject: [PATCH 24/24] refactor: move colmodernvbert related onnx embed to its class --- .../colmodernvbert.py | 64 +++++++++ .../onnx_multimodal_model.py | 123 +----------------- 2 files changed, 68 insertions(+), 119 deletions(-) diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index 7ab84f22..20b8e4f7 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -1,8 +1,10 @@ +import contextlib from typing import Any, Iterable, Type, Optional, Sequence import json import numpy as np from tokenizers import Encoding +from PIL import Image from fastembed.common import ImageInput from fastembed.common.model_description import DenseModelDescription, ModelSource @@ -211,6 +213,68 @@ def token_count( token_num += sum([sum(encoding.attention_mask) for encoding in tokenize_func(batch)]) return token_num + def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutputContext: + with contextlib.ExitStack() as stack: + image_files = [ + stack.enter_context(Image.open(image)) + if not isinstance(image, Image.Image) + else image + for image in images + ] + assert self.processor is not None, "Processor is not initialized" + processed = self.processor(image_files) + encoded, attention_mask, metadata = self._process_nested_patches(processed) # type: ignore[arg-type] + + onnx_input = {"pixel_values": encoded, "attention_mask": attention_mask} + onnx_input = self._preprocess_onnx_image_input(onnx_input, **kwargs) + model_output = self.model.run(None, onnx_input) # type: ignore[union-attr] + + return OnnxOutputContext( + model_output=model_output[0], + attention_mask=attention_mask, # type: ignore[arg-type] + metadata=metadata, + ) + + @staticmethod + def _process_nested_patches( + processed: list[list[NumpyArray]], + ) -> tuple[NumpyArray, NumpyArray, dict[str, Any]]: + """ + Process nested image patches (from ImageSplitter). + + Args: + processed: List of patch lists, one per image [[img1_patches], [img2_patches], ...] 
+ + Returns: + tuple: (encoded array, attention_mask, metadata) + - encoded: (batch_size, max_patches, C, H, W) + - attention_mask: (batch_size, max_patches) with 1 for real patches, 0 for padding + - metadata: Dict with 'patch_counts' key + """ + patch_counts = [len(patches) for patches in processed] + max_patches = max(patch_counts) + + # Get dimensions from first patch + channels, height, width = processed[0][0].shape + batch_size = len(processed) + + # Create padded array + encoded = np.zeros( + (batch_size, max_patches, channels, height, width), dtype=processed[0][0].dtype + ) + + # Create attention mask (1 for real patches, 0 for padding) + attention_mask = np.zeros((batch_size, max_patches), dtype=np.int64) + + # Fill in patches and attention mask + for i, patches in enumerate(processed): + for j, patch in enumerate(patches): + encoded[i, j] = patch + attention_mask[i, j] = 1 + + metadata = {"patch_counts": patch_counts} + return encoded, attention_mask, metadata # type: ignore[return-value] + def _preprocess_onnx_image_input( self, onnx_input: dict[str, np.ndarray], **kwargs: Any ) -> dict[str, NumpyArray]: diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py index bfe81fdc..93436895 100644 --- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py +++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py @@ -178,127 +178,12 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu for image in images ] assert self.processor is not None, "Processor is not initialized" - processed = self.processor(image_files) - - # Dispatch to appropriate handler based on structure. - # ColModernVBERT processors divides the original image into - # subimages and processes them separately. - if isinstance(processed[0], list): - encoded, attention_mask, metadata = self._process_nested_patches(processed) - else: - encoded, attention_mask, metadata = self._process_flat_images( - processed, # type: ignore[arg-type] - len(images), - ) - - onnx_input = {"pixel_values": encoded, "attention_mask": attention_mask} + encoded = np.array(self.processor(image_files)) + onnx_input = {"pixel_values": encoded} onnx_input = self._preprocess_onnx_image_input(onnx_input, **kwargs) model_output = self.model.run(None, onnx_input) # type: ignore[union-attr] - - return OnnxOutputContext( - model_output=model_output[0], - attention_mask=attention_mask, # type: ignore[arg-type] - metadata=metadata, - ) - - def _process_nested_patches( - self, processed: list[list[NumpyArray]] - ) -> tuple[NumpyArray, NumpyArray, dict[str, Any]]: - """ - Process nested image patches (from ImageSplitter). - - Args: - processed: List of patch lists, one per image [[img1_patches], [img2_patches], ...] 
- - Returns: - tuple: (encoded array, attention_mask, metadata) - - encoded: (batch_size, max_patches, C, H, W) - - attention_mask: (batch_size, max_patches) with 1 for real patches, 0 for padding - - metadata: Dict with 'patch_counts' key - """ - patch_counts = [len(patches) for patches in processed] - max_patches = max(patch_counts) - - # Get dimensions from first patch - C, H, W = processed[0][0].shape - batch_size = len(processed) - - # Create padded array - encoded = np.zeros((batch_size, max_patches, C, H, W), dtype=processed[0][0].dtype) - - # Create attention mask (1 for real patches, 0 for padding) - attention_mask = np.zeros((batch_size, max_patches), dtype=np.int64) - - # Fill in patches and attention mask - for i, patches in enumerate(processed): - for j, patch in enumerate(patches): - encoded[i, j] = patch - attention_mask[i, j] = 1 - - metadata = {"patch_counts": patch_counts} - return encoded, attention_mask, metadata # type: ignore[return-value] - - def _process_flat_images( - self, processed: list[NumpyArray], num_images: int - ) -> tuple[NumpyArray, NumpyArray, dict[str, Any]]: - """ - Process flat image arrays (from standard processors like SiglipImageProcessor). - - For models expecting 5D input (Idefics3-based), adds patch dimension. - For models expecting 4D input, keeps original shape. - - Args: - processed: List of image arrays - num_images: Number of images being processed - - Returns: - tuple: (encoded array, attention_mask, metadata) - - encoded: (batch_size, C, H, W) for 4D models OR (batch_size, 1, C, H, W) for 5D models - - attention_mask: (batch_size, 1) with all ones - - metadata: Dict with 'patch_counts' key - """ - encoded = np.array(processed) - - # Check if model needs patch dimension based on ONNX signature - if len(encoded.shape) == 4 and self._needs_patch_dimension(): - # Add patch dimension for Idefics3-based models: (batch, 1, C, H, W) - encoded = encoded[:, np.newaxis, ...] - - # Determine attention mask shape based on final tensor shape - if len(encoded.shape) == 5: - # 5D tensor: attention_mask shape is (batch, num_patches) - attention_mask = np.ones((num_images, encoded.shape[1]), dtype=np.int64) - metadata = {"patch_counts": [encoded.shape[1]] * num_images} - else: - # 4D tensor: attention_mask shape is (batch, 1) - attention_mask = np.ones((num_images, 1), dtype=np.int64) - metadata = {"patch_counts": [1] * num_images} - - return encoded, attention_mask, metadata # type: ignore[return-value] - - def _needs_patch_dimension(self) -> bool: - """ - Determine if this model needs the patch dimension by checking ONNX input shape. - - Idefics3-based models (like ColModernVBERT) need 5D tensors (batch_size, patch_count, C, H, W). - Earlier models (like ColPali v1.3) need 4D tensors (batch_size, C, H, W). - - Returns: - bool: True if pixel_values input has 5 dimensions, False if 4 dimensions - """ - if not hasattr(self, "model") or self.model is None: - return False - - # Get pixel_values input metadata - for input_meta in self.model.get_inputs(): - if input_meta.name == "pixel_values": - # input_meta.shape is a list like - # ['batch_size', 'sequence_length', 'num_channels', 'height', 'width'] - # or ['batch_size', 'num_channels', 'height', 'width'] - return len(input_meta.shape) == 5 - - # Default to False for backward compatibility - return False + embeddings = model_output[0].reshape(len(images), -1) + return OnnxOutputContext(model_output=embeddings) def _embed_images( self,
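
Usage sketch (illustrative, not part of the patch series): the snippet below shows how the model registered and tested above could be driven through the public LateInteractionMultimodalEmbedding API. The model name "Qdrant/colmodernvbert", the embed_image/embed_text/token_count calls, and the 128-dimensional embedding size come from the patches; the image path and the MaxSim-style scoring helper are assumptions added only for illustration.

    import numpy as np
    from fastembed import LateInteractionMultimodalEmbedding

    # Instantiate the newly registered late-interaction multimodal model.
    model = LateInteractionMultimodalEmbedding(model_name="Qdrant/colmodernvbert")
    assert model.embedding_size == 128

    # Multi-vector embeddings: one (num_tokens, 128) matrix per input.
    doc_vectors = list(model.embed_image(["page.jpeg"], batch_size=2))  # hypothetical image path
    query_vectors = list(model.embed_text(["what is shown in the figure?"]))

    def maxsim(query: np.ndarray, doc: np.ndarray) -> float:
        # Late-interaction scoring (assumed MaxSim, as commonly used with
        # ColBERT/ColPali-style models): for every query token take the best
        # matching document token, then sum over query tokens.
        return float((query @ doc.T).max(axis=1).sum())

    score = maxsim(query_vectors[0], doc_vectors[0])
    print(f"late-interaction score: {score:.4f}")

    # Token counting, as exercised by test_token_count in the series; with
    # include_extension=True the query augmentation tokens are counted as well.
    print(model.token_count(["short doc"], include_extension=False))

The scoring helper only assumes that both embed_image and embed_text yield per-token matrices with the model's 128-dimensional columns, which is what the canonical test vectors above compare against.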