From 07a5c454e2fcad8ff93f5ed97a5fe1124b8c246b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Thu, 4 Dec 2025 11:25:31 +0100 Subject: [PATCH 01/24] Add ColModernVBERT to LateInteractionMultimodalEmbedding registry --- .../colmodernvbert.py | 71 +++++++++++++++++++ .../late_interaction_multimodal_embedding.py | 3 +- 2 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 fastembed/late_interaction_multimodal/colmodernvbert.py diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py new file mode 100644 index 00000000..0c1b0e6c --- /dev/null +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -0,0 +1,71 @@ +from typing import Any, Iterable, Type, Union, Optional + +from fastembed.common import ImageInput +from fastembed.common.model_description import DenseModelDescription, ModelSource +from fastembed.common.onnx_model import OnnxOutputContext, T +from fastembed.common.types import NumpyArray +from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import ( + LateInteractionMultimodalEmbeddingBase, +) +from fastembed.late_interaction_multimodal.onnx_multimodal_model import OnnxMultimodalModel, TextEmbeddingWorker, \ + ImageEmbeddingWorker + +supported_colmodernvbert_models: list[DenseModelDescription] = [ + DenseModelDescription( + model="Qdrant/colmodernvbert", + dim=128, + description="The late-interaction version of ModernVBERT, CPU friendly, English, 2025.", + license="mit", + size_in_GB=1.0, + # TODO: change the url to hf repo link! + sources=ModelSource(url="file:///home/kacper/Projects/Qdrant/colpali-model-migration-to-onnx/outputs/colmodernvbert"), + additional_files=["model.onnx_data"], + model_file="model.onnx", + ), +] + +class ColModernVBERT(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel[NumpyArray]): + """ + The ModernVBERT/colmodernvbert model implementation. This model uses + bidirectional attention, which proves to work better for retrieval. + + See: https://huggingface.co/ModernVBERT/colmodernvbert + """ + + # TODO: reproduce ColPali methods only + + @classmethod + def _list_supported_models(cls) -> list[DenseModelDescription]: + """Lists the supported models. + + Returns: + list[DenseModelDescription]: A list of DenseModelDescription objects containing the model information. 
+ """ + return supported_colmodernvbert_models + + @classmethod + def _get_text_worker_class(cls) -> Type[TextEmbeddingWorker[NumpyArray]]: + return ColModernVBERTTextEmbeddingWorker + + @classmethod + def _get_image_worker_class(cls) -> Type[ImageEmbeddingWorker[NumpyArray]]: + return ColModernVBERTmageEmbeddingWorker + +class ColModernVBERTTextEmbeddingWorker(TextEmbeddingWorker[NumpyArray]): + def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColPali: + return ColModernVBERT( + model_name=model_name, + cache_dir=cache_dir, + threads=1, + **kwargs, + ) + + +class ColModernVBERTmageEmbeddingWorker(ImageEmbeddingWorker[NumpyArray]): + def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColPali: + return ColModernVBERT( + model_name=model_name, + cache_dir=cache_dir, + threads=1, + **kwargs, + ) diff --git a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py index afe839d4..f123dc63 100644 --- a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py +++ b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py @@ -4,6 +4,7 @@ from fastembed.common import OnnxProvider, ImageInput from fastembed.common.types import NumpyArray, Device from fastembed.late_interaction_multimodal.colpali import ColPali +from fastembed.late_interaction_multimodal.colmodernvbert import ColModernVBERT from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import ( LateInteractionMultimodalEmbeddingBase, @@ -12,7 +13,7 @@ class LateInteractionMultimodalEmbedding(LateInteractionMultimodalEmbeddingBase): - EMBEDDINGS_REGISTRY: list[Type[LateInteractionMultimodalEmbeddingBase]] = [ColPali] + EMBEDDINGS_REGISTRY: list[Type[LateInteractionMultimodalEmbeddingBase]] = [ColPali, ColModernVBERT] @classmethod def list_supported_models(cls) -> list[dict[str, Any]]: From 203ca315775aa7bf03e1c6b8d4599f90209b44b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Fri, 5 Dec 2025 17:31:05 +0100 Subject: [PATCH 02/24] Implement image processing based on Idefics3ImageProcessor logic --- fastembed/common/onnx_model.py | 1 + fastembed/image/transform/functional.py | 74 ++++++ fastembed/image/transform/operators.py | 224 +++++++++++++++++- .../onnx_multimodal_model.py | 51 +++- 4 files changed, 342 insertions(+), 8 deletions(-) diff --git a/fastembed/common/onnx_model.py b/fastembed/common/onnx_model.py index d465e870..d357f2c1 100644 --- a/fastembed/common/onnx_model.py +++ b/fastembed/common/onnx_model.py @@ -21,6 +21,7 @@ class OnnxOutputContext: model_output: NumpyArray attention_mask: NDArray[np.int64] | None = None input_ids: NDArray[np.int64] | None = None + metadata: dict[str, Any] | None = None class OnnxModel(Generic[T]): diff --git a/fastembed/image/transform/functional.py b/fastembed/image/transform/functional.py index b06ef46c..86dfbe08 100644 --- a/fastembed/image/transform/functional.py +++ b/fastembed/image/transform/functional.py @@ -145,3 +145,77 @@ def pad2square( new_image = Image.new(mode="RGB", size=(size, size), color=fill_color) new_image.paste(image.crop((left, top, right, bottom)) if crop_required else image) return new_image + + +def resize_longest_edge( + image: Image.Image, + max_size: int, + resample: Union[int, Image.Resampling] = Image.Resampling.LANCZOS, +) -> Image.Image: + height, width = image.height, image.width + aspect_ratio = width / height + + 
if width >= height: + # Width is longer + new_width = max_size + new_height = int(new_width / aspect_ratio) + else: + # Height is longer + new_height = max_size + new_width = int(new_height * aspect_ratio) + + # Ensure even dimensions + if new_height % 2 != 0: + new_height += 1 + if new_width % 2 != 0: + new_width += 1 + + return image.resize((new_width, new_height), resample) + + +def crop_ndarray( + image: NumpyArray, + x1: int, + y1: int, + x2: int, + y2: int, + channel_first: bool = True, +) -> NumpyArray: + if channel_first: + # (C, H, W) format + return image[:, y1:y2, x1:x2] + else: + # (H, W, C) format + return image[y1:y2, x1:x2, :] + + +def resize_ndarray( + image: NumpyArray, + size: tuple[int, int], + resample: Union[int, Image.Resampling] = Image.Resampling.LANCZOS, + channel_first: bool = True, +) -> NumpyArray: + # Convert to PIL-friendly format (H, W, C) + if channel_first: + img_hwc = image.transpose((1, 2, 0)) + else: + img_hwc = image + + # Handle different dtypes + if img_hwc.dtype == np.float32 or img_hwc.dtype == np.float64: + # Assume normalized, scale to 0-255 for PIL + img_hwc_scaled = (img_hwc * 255).astype(np.uint8) + pil_img = Image.fromarray(img_hwc_scaled, mode="RGB") + resized = pil_img.resize(size, resample) + result = np.array(resized).astype(np.float32) / 255.0 + else: + # uint8 or similar + pil_img = Image.fromarray(img_hwc.astype(np.uint8), mode="RGB") + resized = pil_img.resize(size, resample) + result = np.array(resized) + + # Convert back to original format + if channel_first: + result = result.transpose((2, 0, 1)) + + return result diff --git a/fastembed/image/transform/operators.py b/fastembed/image/transform/operators.py index 857b1999..b6d3814d 100644 --- a/fastembed/image/transform/operators.py +++ b/fastembed/image/transform/operators.py @@ -1,4 +1,5 @@ from typing import Any +import math from PIL import Image @@ -6,10 +7,13 @@ from fastembed.image.transform.functional import ( center_crop, convert_to_rgb, + crop_ndarray, normalize, pil2ndarray, rescale, resize, + resize_longest_edge, + resize_ndarray, pad2square, ) @@ -37,8 +41,13 @@ def __init__(self, mean: float | list[float], std: float | list[float]): self.mean = mean self.std = std - def __call__(self, images: list[NumpyArray]) -> list[NumpyArray]: - return [normalize(image, mean=self.mean, std=self.std) for image in images] + def __call__(self, images: Union[list[NumpyArray], list[list[NumpyArray]]]) -> Union[list[NumpyArray], list[list[NumpyArray]]]: + if images and isinstance(images[0], list): + # Nested structure from ImageSplitter + return [[normalize(image, mean=self.mean, std=self.std) for image in img_patches] for img_patches in images] + else: + # Flat structure (backward compatibility) + return [normalize(image, mean=self.mean, std=self.std) for image in images] class Resize(Transform): @@ -58,8 +67,13 @@ class Rescale(Transform): def __init__(self, scale: float = 1 / 255): self.scale = scale - def __call__(self, images: list[NumpyArray]) -> list[NumpyArray]: - return [rescale(image, scale=self.scale) for image in images] + def __call__(self, images: Union[list[NumpyArray], list[list[NumpyArray]]]) -> Union[list[NumpyArray], list[list[NumpyArray]]]: + if images and isinstance(images[0], list): + # Nested structure from ImageSplitter + return [[rescale(image, scale=self.scale) for image in img_patches] for img_patches in images] + else: + # Flat structure (backward compatibility) + return [rescale(image, scale=self.scale) for image in images] class PILtoNDarray(Transform): @@ 
-82,6 +96,163 @@ def __call__(self, images: list[Image.Image]) -> list[Image.Image]: ] +class ResizeLongestEdge(Transform): + """Resize images so the longest edge equals target size, preserving aspect ratio.""" + + def __init__( + self, + size: int, + resample: Image.Resampling = Image.Resampling.LANCZOS, + ): + self.size = size + self.resample = resample + + def __call__(self, images: list[Image.Image]) -> list[Image.Image]: + return [resize_longest_edge(image, self.size, self.resample) for image in images] + + +class ResizeForVisionEncoder(Transform): + """ + Resize both dimensions to be multiples of vision_encoder_max_size. + Preserves aspect ratio approximately. + Works on numpy arrays in (C, H, W) format. + """ + + def __init__( + self, + max_size: int, + resample: Image.Resampling = Image.Resampling.LANCZOS, + ): + self.max_size = max_size + self.resample = resample + + def __call__(self, images: list[NumpyArray]) -> list[NumpyArray]: + result = [] + for image in images: + # Assume (C, H, W) format + _, height, width = image.shape + + aspect_ratio = width / height + + if width >= height: + # Calculate new width as multiple of max_size + new_width = math.ceil(width / self.max_size) * self.max_size + new_height = int(new_width / aspect_ratio) + new_height = math.ceil(new_height / self.max_size) * self.max_size + else: + # Calculate new height as multiple of max_size + new_height = math.ceil(height / self.max_size) * self.max_size + new_width = int(new_height * aspect_ratio) + new_width = math.ceil(new_width / self.max_size) * self.max_size + + # Resize using the ndarray resize function + resized = resize_ndarray( + image, + size=(new_width, new_height), # PIL expects (width, height) + resample=self.resample, + channel_first=True, + ) + result.append(resized) + + return result + + +class ImageSplitter(Transform): + """ + Split images into grid of patches plus a global view. + + If image dimensions exceed max_size: + - Divide into ceil(H/max_size) x ceil(W/max_size) patches + - Each patch is cropped from the image + - Add a global view (original resized to max_size x max_size) + + If image is smaller than max_size: + - Return single image unchanged + + Works on numpy arrays in (C, H, W) format. 
+ """ + + def __init__( + self, + max_size: int, + resample: Image.Resampling = Image.Resampling.LANCZOS, + ): + self.max_size = max_size + self.resample = resample + + def __call__(self, images: list[NumpyArray]) -> list[list[NumpyArray]]: + result = [] + + for image in images: + # Assume (C, H, W) format + _, height, width = image.shape + max_height = max_width = self.max_size + + frames = [] + + if height > max_height or width > max_width: + # Calculate the number of splits needed + num_splits_h = math.ceil(height / max_height) + num_splits_w = math.ceil(width / max_width) + + # Calculate optimal patch dimensions + optimal_height = math.ceil(height / num_splits_h) + optimal_width = math.ceil(width / num_splits_w) + + # Generate patches in grid order (row by row) + for r in range(num_splits_h): + for c in range(num_splits_w): + # Calculate crop coordinates + start_x = c * optimal_width + start_y = r * optimal_height + end_x = min(start_x + optimal_width, width) + end_y = min(start_y + optimal_height, height) + + # Crop the patch + cropped = crop_ndarray( + image, x1=start_x, y1=start_y, x2=end_x, y2=end_y, channel_first=True + ) + frames.append(cropped) + + # Add global view (resized to max_size x max_size) + global_view = resize_ndarray( + image, + size=(max_width, max_height), # PIL expects (width, height) + resample=self.resample, + channel_first=True, + ) + frames.append(global_view) + else: + # Image is small enough, no splitting needed + frames.append(image) + + # Append (not extend) to preserve per-image grouping + result.append(frames) + + return result + + +class SquareResize(Transform): + """ + Resize images to square dimensions (max_size x max_size). + Works on numpy arrays in (C, H, W) format. + """ + + def __init__( + self, + size: int, + resample: Image.Resampling = Image.Resampling.LANCZOS, + ): + self.size = size + self.resample = resample + + def __call__(self, images: list[NumpyArray]) -> list[list[NumpyArray]]: + return [ + [resize_ndarray(image, size=(self.size, self.size), resample=self.resample, channel_first=True)] + for image in images + ] + + class Compose: def __init__(self, transforms: list[Transform]): self.transforms = transforms @@ -118,6 +289,7 @@ def from_config(cls, config: dict[str, Any]) -> "Compose": Valid size keys (nested): - {"height", "width"} - {"shortest_edge"} + - {"longest_edge"} Returns: Compose: Image processor. 
@@ -128,6 +300,7 @@ def from_config(cls, config: dict[str, Any]) -> "Compose": cls._get_pad2square(transforms, config) cls._get_center_crop(transforms, config) cls._get_pil2ndarray(transforms, config) + cls._get_image_splitting(transforms, config) cls._get_rescale(transforms, config) cls._get_normalize(transforms, config) return cls(transforms=transforms) @@ -196,6 +369,25 @@ def _get_resize(cls, transforms: list[Transform], config: dict[str, Any]) -> Non resample=resample, ) ) + elif mode == "Idefics3ImageProcessor": + if config.get("do_resize", False): + size = config.get("size", {}) + if "longest_edge" not in size: + raise ValueError( + "Size dictionary must contain 'longest_edge' key for Idefics3ImageProcessor" + ) + + # Handle resample parameter - can be int enum or PIL.Image.Resampling + resample = config.get("resample", Image.Resampling.LANCZOS) + if isinstance(resample, int): + resample = Image.Resampling(resample) + + transforms.append( + ResizeLongestEdge( + size=size["longest_edge"], + resample=resample, + ) + ) else: raise ValueError(f"Preprocessor {mode} is not supported") @@ -217,6 +409,8 @@ def _get_center_crop(transforms: list[Transform], config: dict[str, Any]) -> Non pass elif mode == "JinaCLIPImageProcessor": pass + elif mode == "Idefics3ImageProcessor": + pass else: raise ValueError(f"Preprocessor {mode} is not supported") @@ -224,6 +418,28 @@ def _get_center_crop(transforms: list[Transform], config: dict[str, Any]) -> Non def _get_pil2ndarray(transforms: list[Transform], config: dict[str, Any]) -> None: transforms.append(PILtoNDarray()) + @classmethod + def _get_image_splitting(cls, transforms: list[Transform], config: dict[str, Any]) -> None: + """ + Add image splitting transforms for Idefics3. + Handles conditional logic: splitting vs square resize. + Must be called AFTER PILtoNDarray. + """ + mode = config.get("image_processor_type", "CLIPImageProcessor") + + if mode == "Idefics3ImageProcessor": + do_splitting = config.get("do_image_splitting", False) + max_size = config.get("max_image_size", {}).get("longest_edge", 512) + resample = config.get("resample", Image.Resampling.LANCZOS) + if isinstance(resample, int): + resample = Image.Resampling(resample) + + if do_splitting: + transforms.append(ResizeForVisionEncoder(max_size, resample)) + transforms.append(ImageSplitter(max_size, resample)) + else: + transforms.append(SquareResize(max_size, resample)) + @staticmethod def _get_rescale(transforms: list[Transform], config: dict[str, Any]) -> None: if config.get("do_rescale", True): diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py index 18b36338..75e7ee92 100644 --- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py +++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py @@ -176,12 +176,55 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu for image in images ] assert self.processor is not None, "Processor is not initialized" - encoded = np.array(self.processor(image_files)) - onnx_input = {"pixel_values": encoded} + processed = self.processor(image_files) + + # Handle nested structure (with image splitting) + if isinstance(processed[0], list): + # processed = [[img1_patches], [img2_patches], ...] 
+ # Need shape: (batch_size, max_patches, C, H, W) + + patch_counts = [len(patches) for patches in processed] + max_patches = max(patch_counts) + + # Get dimensions from first patch + C, H, W = processed[0][0].shape + + # Create padded array + batch_size = len(processed) + encoded = np.zeros((batch_size, max_patches, C, H, W), dtype=processed[0][0].dtype) + + # Create attention mask (1 for real patches, 0 for padding) + attention_mask = np.zeros((batch_size, max_patches), dtype=np.int64) + + # Fill in patches and attention mask + for i, patches in enumerate(processed): + for j, patch in enumerate(patches): + encoded[i, j] = patch + attention_mask[i, j] = 1 + + # Track actual patch counts for later use + metadata = {"patch_counts": patch_counts} + else: + # Flat structure (no splitting) - still need batch dimension + # Shape: (batch_size, 1, C, H, W) + encoded = np.array(processed) + if len(encoded.shape) == 4: # (batch_size, C, H, W) + encoded = encoded[:, np.newaxis, ...] # Add num_patches=1 dimension + + # All patches are real (no padding) + # TODO: attention_mask should be built + attention_mask = np.ones((len(images), encoded.shape[1]), dtype=np.int64) + metadata = {"patch_counts": [encoded.shape[1]] * len(images)} + + onnx_input = {"pixel_values": encoded, "attention_mask": attention_mask} onnx_input = self._preprocess_onnx_image_input(onnx_input, **kwargs) model_output = self.model.run(None, onnx_input) # type: ignore[union-attr] - embeddings = model_output[0].reshape(len(images), -1) - return OnnxOutputContext(model_output=embeddings) + + return OnnxOutputContext( + model_output=model_output[0], + attention_mask=attention_mask, + metadata=metadata, + ) def _embed_images( self, From 8c45088e9f605211388e3bd7cba7a49965a922b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Fri, 5 Dec 2025 17:45:32 +0100 Subject: [PATCH 03/24] Fix padding support --- fastembed/common/preprocessor_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fastembed/common/preprocessor_utils.py b/fastembed/common/preprocessor_utils.py index efbc3e25..3b702f79 100644 --- a/fastembed/common/preprocessor_utils.py +++ b/fastembed/common/preprocessor_utils.py @@ -50,9 +50,10 @@ def load_tokenizer(model_dir: Path) -> tuple[Tokenizer, dict[str, int]]: tokenizer = Tokenizer.from_file(str(tokenizer_path)) tokenizer.enable_truncation(max_length=max_context) - tokenizer.enable_padding( - pad_id=config.get("pad_token_id", 0), pad_token=tokenizer_config["pad_token"] - ) + if not tokenizer.padding: + tokenizer.enable_padding( + pad_id=config.get("pad_token_id", 0), pad_token=tokenizer_config["pad_token"] + ) for token in tokens_map.values(): if isinstance(token, str): From 5b56a77d2e5d61c4e2e677df1c5daba8d2cac08a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Fri, 5 Dec 2025 17:46:28 +0100 Subject: [PATCH 04/24] Implement ColModernVBERT logic --- .../colmodernvbert.py | 376 +++++++++++++++++- 1 file changed, 370 insertions(+), 6 deletions(-) diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index 0c1b0e6c..8058e6c7 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -1,9 +1,14 @@ -from typing import Any, Iterable, Type, Union, Optional +from typing import Any, Iterable, Type, Union, Optional, Sequence +import json + +import numpy as np +from tokenizers import Encoding from fastembed.common 
import ImageInput from fastembed.common.model_description import DenseModelDescription, ModelSource from fastembed.common.onnx_model import OnnxOutputContext, T -from fastembed.common.types import NumpyArray +from fastembed.common.types import NumpyArray, OnnxProvider +from fastembed.common.utils import define_cache_dir from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import ( LateInteractionMultimodalEmbeddingBase, ) @@ -17,8 +22,7 @@ description="The late-interaction version of ModernVBERT, CPU friendly, English, 2025.", license="mit", size_in_GB=1.0, - # TODO: change the url to hf repo link! - sources=ModelSource(url="file:///home/kacper/Projects/Qdrant/colpali-model-migration-to-onnx/outputs/colmodernvbert"), + sources=ModelSource(hf="Qdrant/colmodernvbert"), additional_files=["model.onnx_data"], model_file="model.onnx", ), @@ -32,6 +36,78 @@ class ColModernVBERT(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel See: https://huggingface.co/ModernVBERT/colmodernvbert """ + VISUAL_PROMPT_PREFIX = "<|begin_of_text|>User:Describe the image.\nAssistant:" + + def __init__( + self, + model_name: str, + cache_dir: Optional[str] = None, + threads: Optional[int] = None, + providers: Optional[Sequence[OnnxProvider]] = None, + cuda: bool = False, + device_ids: Optional[list[int]] = None, + lazy_load: bool = False, + device_id: Optional[int] = None, + specific_model_path: Optional[str] = None, + **kwargs: Any, + ): + """ + Args: + model_name (str): The name of the model to use. + cache_dir (str, optional): The path to the cache directory. + Can be set using the `FASTEMBED_CACHE_PATH` env variable. + Defaults to `fastembed_cache` in the system's temp directory. + threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None. + providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use. + Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None. + cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers` + Defaults to False. + device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in + workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None. + lazy_load (bool, optional): Whether to load the model during class initialization or on demand. + Should be set to True when using multiple-gpu and parallel encoding. Defaults to False. + device_id (Optional[int], optional): The device id to use for loading the model in the worker process. + + Raises: + ValueError: If the model_name is not in the format / e.g. BAAI/bge-base-en. 
+ """ + + # TODO: consider unifying ColPali and ColModernVBERT __init__ methods + + super().__init__(model_name, cache_dir, threads, **kwargs) + self.providers = providers + self.lazy_load = lazy_load + self._extra_session_options = self._select_exposed_session_options(kwargs) + + # List of device ids, that can be used for data parallel processing in workers + self.device_ids = device_ids + self.cuda = cuda + + # This device_id will be used if we need to load model in current process + self.device_id: Optional[int] = None + if device_id is not None: + self.device_id = device_id + elif self.device_ids is not None: + self.device_id = self.device_ids[0] + + self.model_description = self._get_model_description(model_name) + self.cache_dir = str(define_cache_dir(cache_dir)) + + self._specific_model_path = specific_model_path + self._model_dir = self.download_model( + self.model_description, + self.cache_dir, + local_files_only=self._local_files_only, + specific_model_path=self._specific_model_path, + ) + self.mask_token_id = None + self.pad_token_id = None + self.image_seq_len: Optional[int] = None + self.max_image_size: Optional[int] = None + + if not self.lazy_load: + self.load_onnx_model() + # TODO: reproduce ColPali methods only @classmethod @@ -43,6 +119,294 @@ def _list_supported_models(cls) -> list[DenseModelDescription]: """ return supported_colmodernvbert_models + def load_onnx_model(self) -> None: + self._load_onnx_model( + model_dir=self._model_dir, + model_file=self.model_description.model_file, + threads=self.threads, + providers=self.providers, + cuda=self.cuda, + device_id=self.device_id, + extra_session_options=self._extra_session_options, + ) + + # Load image processing configuration + processor_config_path = self._model_dir / "processor_config.json" + with open(processor_config_path) as f: + processor_config = json.load(f) + self.image_seq_len = processor_config.get("image_seq_len", 64) + + preprocessor_config_path = self._model_dir / "preprocessor_config.json" + with open(preprocessor_config_path) as f: + preprocessor_config = json.load(f) + self.max_image_size = preprocessor_config.get("max_image_size", {}).get("longest_edge", 512) + + def _preprocess_onnx_text_input( + self, onnx_input: dict[str, NumpyArray], **kwargs: Any + ) -> dict[str, NumpyArray]: + """ + Post-process the ONNX model output to convert it into a usable format. + + Args: + output (OnnxOutputContext): The raw output from the ONNX model. + + Returns: + Iterable[NumpyArray]: Post-processed output as NumPy arrays. + """ + batch_size, seq_length = onnx_input["input_ids"].shape + # TODO: use .json config, not 3, 512, 512 + empty_image_placeholder: NumpyArray = np.zeros( + (batch_size, seq_length, 3, 512, 512), dtype=np.float32 + ) + onnx_input["pixel_values"] = empty_image_placeholder + return onnx_input + + def _post_process_onnx_text_output( + self, + output: OnnxOutputContext, + ) -> Iterable[NumpyArray]: + """ + Post-process the ONNX model output to convert it into a usable format. + + Args: + output (OnnxOutputContext): The raw output from the ONNX model. + + Returns: + Iterable[NumpyArray]: Post-processed output as NumPy arrays. 
+ """ + return output.model_output + + def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]: + encoded = self.tokenizer.encode_batch(documents) # type: ignore[union-attr] + return encoded + + def _preprocess_onnx_image_input( + self, onnx_input: dict[str, np.ndarray], **kwargs: Any + ) -> dict[str, NumpyArray]: + """ + Add text input placeholders for image data, following Idefics3 processing logic. + + Constructs input_ids dynamically based on the actual number of image patches, + using the same token expansion logic as Idefics3Processor. + + Args: + onnx_input: Dict with 'pixel_values' (batch, num_patches, C, H, W) + and 'attention_mask' (batch, num_patches) indicating real patches + **kwargs: Additional arguments + + Returns: + Updated onnx_input with 'input_ids' and updated 'attention_mask' for token sequence + """ + # The attention_mask in onnx_input has a shape of (batch_size, num_patches), + # and should be used to create an attention mask matching the input_ids shape. + patch_attention_mask = onnx_input["attention_mask"] + pixel_values = onnx_input["pixel_values"] + + batch_size = pixel_values.shape[0] + batch_input_ids = [] + + # Build input_ids for each image based on its actual patch count + for i in range(batch_size): + # Count real patches (non-padded) from attention mask + patch_count = int(np.sum(patch_attention_mask[i])) + + # Compute rows/cols from patch count + rows, cols = self._compute_rows_cols_from_patches(patch_count) + + # Build input_ids for this image + input_ids = self._build_input_ids_for_image(rows, cols) + batch_input_ids.append(input_ids) + + # Pad sequences to max length in batch + max_len = max(len(ids) for ids in batch_input_ids) + + # Get padding config from tokenizer + padding_direction = self.tokenizer.padding["direction"] # type: ignore[index,union-attr] + pad_token_id = self.tokenizer.padding["pad_id"] # type: ignore[index,union-attr] + + # Initialize with pad token + padded_input_ids = np.full((batch_size, max_len), pad_token_id, dtype=np.int64) + attention_mask = np.zeros((batch_size, max_len), dtype=np.int64) + + for i, input_ids in enumerate(batch_input_ids): + seq_len = len(input_ids) + if padding_direction == "left": + # Left padding: place tokens at the END of the array + start_idx = max_len - seq_len + padded_input_ids[i, start_idx:] = input_ids + attention_mask[i, start_idx:] = 1 + else: + # Right padding: place tokens at the START of the array + padded_input_ids[i, :seq_len] = input_ids + attention_mask[i, :seq_len] = 1 + + onnx_input["input_ids"] = padded_input_ids + # Update attention_mask with token-level data + onnx_input["attention_mask"] = attention_mask + return onnx_input + + def _compute_rows_cols_from_patches(self, patch_count: int) -> tuple[int, int]: + if patch_count <= 1: + return 0, 0 + + # Subtract 1 for the global image + grid_patches = patch_count - 1 + + # Find rows and cols (assume square or near-square grid) + rows = int(grid_patches ** 0.5) + cols = grid_patches // rows + + # Verify the calculation + if rows * cols + 1 != patch_count: + # Handle non-square grids + for r in range(1, grid_patches + 1): + if grid_patches % r == 0: + c = grid_patches // r + if r * c + 1 == patch_count: + return r, c + # Fallback: treat as unsplit + return 0, 0 + + return rows, cols + + def _create_single_image_prompt_string(self) -> str: + return ( + "" + + "" + + "" * self.image_seq_len + + "" + ) + + def _create_split_image_prompt_string(self, rows: int, cols: int) -> str: + text_split_images = "" + + # Add tokens for 
each patch in the grid + for n_h in range(rows): + for n_w in range(cols): + text_split_images += ( + f"" + + f"" + + "" * self.image_seq_len + ) + text_split_images += "\n" + + # Add global image at the end + text_split_images += ( + f"\n" + + "" + + "" * self.image_seq_len + + "" + ) + + return text_split_images + + def _build_input_ids_for_image(self, rows: int, cols: int) -> np.ndarray: + # Create the appropriate image prompt string + if rows == 0 and cols == 0: + image_prompt_tokens = self._create_single_image_prompt_string() + else: + image_prompt_tokens = self._create_split_image_prompt_string(rows, cols) + + # Replace in visual prompt with expanded tokens + # The visual prompt is: "<|begin_of_text|>User:Describe the image.\nAssistant:" + expanded_prompt = self.VISUAL_PROMPT_PREFIX.replace("", image_prompt_tokens) + + # Tokenize the complete prompt + encoded = self.tokenizer.encode(expanded_prompt) # type: ignore[union-attr] + + # Convert to numpy array + return np.array(encoded.ids, dtype=np.int64) + + def _post_process_onnx_image_output( + self, + output: OnnxOutputContext, + ) -> Iterable[NumpyArray]: + """ + Post-process the ONNX model output to convert it into a usable format. + + Args: + output (OnnxOutputContext): The raw output from the ONNX model. + + Returns: + Iterable[NumpyArray]: Post-processed output as NumPy arrays. + """ + assert self.model_description.dim is not None, "Model dim is not defined" + return output.model_output.reshape( + output.model_output.shape[0], -1, self.model_description.dim + ) + + def embed_text( + self, + documents: Union[str, Iterable[str]], + batch_size: int = 256, + parallel: Optional[int] = None, + **kwargs: Any, + ) -> Iterable[NumpyArray]: + """ + Encode a list of documents into list of embeddings. + + Args: + documents: Iterator of documents or single document to embed + batch_size: Batch size for encoding -- higher values will use more memory, but be faster + parallel: + If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. + If 0, use all available cores. + If None, don't use data-parallel processing, use default onnxruntime threading instead. + + Returns: + List of embeddings, one per document + """ + yield from self._embed_documents( + model_name=self.model_name, + cache_dir=str(self.cache_dir), + documents=documents, + batch_size=batch_size, + parallel=parallel, + providers=self.providers, + cuda=self.cuda, + device_ids=self.device_ids, + local_files_only=self._local_files_only, + specific_model_path=self._specific_model_path, + extra_session_options=self._extra_session_options, + **kwargs, + ) + + def embed_image( + self, + images: Union[ImageInput, Iterable[ImageInput]], + batch_size: int = 16, + parallel: Optional[int] = None, + **kwargs: Any, + ) -> Iterable[NumpyArray]: + """ + Encode a list of images into list of embeddings. + + Args: + images: Iterator of image paths or single image path to embed + batch_size: Batch size for encoding -- higher values will use more memory, but be faster + parallel: + If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. + If 0, use all available cores. + If None, don't use data-parallel processing, use default onnxruntime threading instead. 
+ + Returns: + List of embeddings, one per document + """ + yield from self._embed_images( + model_name=self.model_name, + cache_dir=str(self.cache_dir), + images=images, + batch_size=batch_size, + parallel=parallel, + providers=self.providers, + cuda=self.cuda, + device_ids=self.device_ids, + local_files_only=self._local_files_only, + specific_model_path=self._specific_model_path, + extra_session_options=self._extra_session_options, + **kwargs, + ) + @classmethod def _get_text_worker_class(cls) -> Type[TextEmbeddingWorker[NumpyArray]]: return ColModernVBERTTextEmbeddingWorker @@ -52,7 +416,7 @@ def _get_image_worker_class(cls) -> Type[ImageEmbeddingWorker[NumpyArray]]: return ColModernVBERTmageEmbeddingWorker class ColModernVBERTTextEmbeddingWorker(TextEmbeddingWorker[NumpyArray]): - def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColPali: + def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColModernVBERT: return ColModernVBERT( model_name=model_name, cache_dir=cache_dir, @@ -62,7 +426,7 @@ def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColP class ColModernVBERTmageEmbeddingWorker(ImageEmbeddingWorker[NumpyArray]): - def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColPali: + def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColModernVBERT: return ColModernVBERT( model_name=model_name, cache_dir=cache_dir, From e637a7fe3aef02cf23855f9671ff0d9a121251cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Fri, 5 Dec 2025 17:47:28 +0100 Subject: [PATCH 05/24] Remove TODOs --- fastembed/late_interaction_multimodal/colmodernvbert.py | 5 ----- .../late_interaction_multimodal/onnx_multimodal_model.py | 1 - 2 files changed, 6 deletions(-) diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index 8058e6c7..c55d099b 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -71,9 +71,6 @@ def __init__( Raises: ValueError: If the model_name is not in the format / e.g. BAAI/bge-base-en. """ - - # TODO: consider unifying ColPali and ColModernVBERT __init__ methods - super().__init__(model_name, cache_dir, threads, **kwargs) self.providers = providers self.lazy_load = lazy_load @@ -108,8 +105,6 @@ def __init__( if not self.lazy_load: self.load_onnx_model() - # TODO: reproduce ColPali methods only - @classmethod def _list_supported_models(cls) -> list[DenseModelDescription]: """Lists the supported models. diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py index 75e7ee92..cbfc09e8 100644 --- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py +++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py @@ -212,7 +212,6 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu encoded = encoded[:, np.newaxis, ...] 
# Add num_patches=1 dimension # All patches are real (no padding) - # TODO: attention_mask should be built attention_mask = np.ones((len(images), encoded.shape[1]), dtype=np.int64) metadata = {"patch_counts": [encoded.shape[1]] * len(images)} From 74f5c3e47462ac133e155618f2cd3cea5473bcd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Fri, 5 Dec 2025 18:10:27 +0100 Subject: [PATCH 06/24] Handle empty pixel values with proper image_size --- .../late_interaction_multimodal/colmodernvbert.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index c55d099b..a3357517 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -101,6 +101,7 @@ def __init__( self.pad_token_id = None self.image_seq_len: Optional[int] = None self.max_image_size: Optional[int] = None + self.image_size: Optional[int] = None if not self.lazy_load: self.load_onnx_model() @@ -136,6 +137,13 @@ def load_onnx_model(self) -> None: preprocessor_config = json.load(f) self.max_image_size = preprocessor_config.get("max_image_size", {}).get("longest_edge", 512) + # Load model configuration + config_path = self._model_dir / "config.json" + with open(config_path) as f: + model_config = json.load(f) + vision_config = model_config.get("vision_config", {}) + self.image_size = vision_config.get("image_size", 512) + def _preprocess_onnx_text_input( self, onnx_input: dict[str, NumpyArray], **kwargs: Any ) -> dict[str, NumpyArray]: @@ -149,9 +157,8 @@ def _preprocess_onnx_text_input( Iterable[NumpyArray]: Post-processed output as NumPy arrays. """ batch_size, seq_length = onnx_input["input_ids"].shape - # TODO: use .json config, not 3, 512, 512 empty_image_placeholder: NumpyArray = np.zeros( - (batch_size, seq_length, 3, 512, 512), dtype=np.float32 + (batch_size, seq_length, 3, self.image_size, self.image_size), dtype=np.float32 ) onnx_input["pixel_values"] = empty_image_placeholder return onnx_input From 9e2929ec2b05fb7070ef29d9a620ecca3a1ea605 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Fri, 5 Dec 2025 18:19:05 +0100 Subject: [PATCH 07/24] Add ColModernVBERT tests --- tests/test_late_interaction_multimodal.py | 29 +++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py index 8a102ace..8aa3a44e 100644 --- a/tests/test_late_interaction_multimodal.py +++ b/tests/test_late_interaction_multimodal.py @@ -21,6 +21,17 @@ [-0.1299, -0.0691, 0.1097, 0.0728, 0.0123, 0.0519, 0.0122], ] ), + "Qdrant/colmodernvbert": np.array( + [ + [0.2256, -0.0503, 0.0254, -0.011, -0.0786, 0.2152, -0.0961], + [-0.0028, -0.0484, -0.0724, -0.0724, -0.0977, 0.0308, -0.0236], + [0.0035, -0.1075, -0.0877, -0.0207, -0.0828, -0.0294, -0.0253], + [0.0021, -0.0797, -0.0605, -0.0008, -0.0837, 0.0015, -0.0846], + [-0.0473, -0.0594, -0.0553, -0.0014, -0.0712, 0.0158, -0.0546], + [-0.1009, -0.082, -0.0684, -0.1385, -0.0469, -0.0606, -0.0323], + [-0.0624, 0.006, -0.0498, -0.0127, -0.1115, 0.0076, -0.0888], + ] + ), } CANONICAL_QUERY_VALUES = { @@ -35,6 +46,17 @@ [-0.0165, -0.0106, 0.1672, -0.0768, 0.0389, -0.0038, 0.1137], ] ), + "Qdrant/colmodernvbert": np.array( + [ + [0.05, 0.0656, 0.0403, 0.1498, 0.1842, 0.0263, -0.1871], + [-0.0566, -0.1403, 0.0065, -0.0285, 0.0903, -0.0149, 0.1069], + [-0.1015, -0.0072, 0.0908, 
-0.0824, -0.0185, -0.0097, -0.0046], + [-0.1233, -0.1081, -0.0234, -0.0033, 0.0598, 0.0993, 0.0985], + [-0.0705, -0.1312, -0.0649, 0.0151, 0.0746, 0.0765, 0.1482], + [0.0053, -0.1384, -0.0584, -0.0272, 0.1301, 0.0508, 0.1796], + [0.0092, -0.1438, -0.0306, -0.0369, 0.1172, 0.037, 0.1334], + ] + ), } queries = ["hello world", "flag embedding"] @@ -90,6 +112,9 @@ def test_get_embedding_size(): model_name = "Qdrant/ColPali-v1.3-fp16" assert LateInteractionMultimodalEmbedding.get_embedding_size(model_name) == 128 + model_name = "Qdrant/colmodernvbert" + assert LateInteractionMultimodalEmbedding.get_embedding_size(model_name) == 128 + def test_embedding_size(): if os.getenv("CI"): @@ -102,6 +127,10 @@ def test_embedding_size(): model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True) assert model.embedding_size == 128 + model_name = "Qdrant/colmodernvbert" + model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True) + assert model.embedding_size == 128 + def test_token_count() -> None: if os.getenv("CI"): From aa93a528679c6a25fa15116628206f4c85d404d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Mon, 8 Dec 2025 13:17:38 +0100 Subject: [PATCH 08/24] Run pre-commit --- fastembed/image/transform/operators.py | 24 ++++++++++++++---- .../colmodernvbert.py | 25 +++++++++++++------ .../late_interaction_multimodal_embedding.py | 5 +++- 3 files changed, 40 insertions(+), 14 deletions(-) diff --git a/fastembed/image/transform/operators.py b/fastembed/image/transform/operators.py index b6d3814d..cabff63d 100644 --- a/fastembed/image/transform/operators.py +++ b/fastembed/image/transform/operators.py @@ -41,10 +41,15 @@ def __init__(self, mean: float | list[float], std: float | list[float]): self.mean = mean self.std = std - def __call__(self, images: Union[list[NumpyArray], list[list[NumpyArray]]]) -> Union[list[NumpyArray], list[list[NumpyArray]]]: + def __call__( + self, images: Union[list[NumpyArray], list[list[NumpyArray]]] + ) -> Union[list[NumpyArray], list[list[NumpyArray]]]: if images and isinstance(images[0], list): # Nested structure from ImageSplitter - return [[normalize(image, mean=self.mean, std=self.std) for image in img_patches] for img_patches in images] + return [ + [normalize(image, mean=self.mean, std=self.std) for image in img_patches] + for img_patches in images + ] else: # Flat structure (backward compatibility) return [normalize(image, mean=self.mean, std=self.std) for image in images] @@ -67,10 +72,15 @@ class Rescale(Transform): def __init__(self, scale: float = 1 / 255): self.scale = scale - def __call__(self, images: Union[list[NumpyArray], list[list[NumpyArray]]]) -> Union[list[NumpyArray], list[list[NumpyArray]]]: + def __call__( + self, images: Union[list[NumpyArray], list[list[NumpyArray]]] + ) -> Union[list[NumpyArray], list[list[NumpyArray]]]: if images and isinstance(images[0], list): # Nested structure from ImageSplitter - return [[rescale(image, scale=self.scale) for image in img_patches] for img_patches in images] + return [ + [rescale(image, scale=self.scale) for image in img_patches] + for img_patches in images + ] else: # Flat structure (backward compatibility) return [rescale(image, scale=self.scale) for image in images] @@ -248,7 +258,11 @@ def __init__( def __call__(self, images: list[NumpyArray]) -> list[list[NumpyArray]]: return [ - [resize_ndarray(image, size=(self.size, self.size), resample=self.resample, channel_first=True)] + [ + resize_ndarray( + image, size=(self.size, self.size), 
resample=self.resample, channel_first=True + ) + ] for image in images ] diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index a3357517..d975f510 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -6,14 +6,17 @@ from fastembed.common import ImageInput from fastembed.common.model_description import DenseModelDescription, ModelSource -from fastembed.common.onnx_model import OnnxOutputContext, T +from fastembed.common.onnx_model import OnnxOutputContext from fastembed.common.types import NumpyArray, OnnxProvider from fastembed.common.utils import define_cache_dir from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import ( LateInteractionMultimodalEmbeddingBase, ) -from fastembed.late_interaction_multimodal.onnx_multimodal_model import OnnxMultimodalModel, TextEmbeddingWorker, \ - ImageEmbeddingWorker +from fastembed.late_interaction_multimodal.onnx_multimodal_model import ( + OnnxMultimodalModel, + TextEmbeddingWorker, + ImageEmbeddingWorker, +) supported_colmodernvbert_models: list[DenseModelDescription] = [ DenseModelDescription( @@ -28,6 +31,7 @@ ), ] + class ColModernVBERT(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel[NumpyArray]): """ The ModernVBERT/colmodernvbert model implementation. This model uses @@ -36,7 +40,9 @@ class ColModernVBERT(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel See: https://huggingface.co/ModernVBERT/colmodernvbert """ - VISUAL_PROMPT_PREFIX = "<|begin_of_text|>User:Describe the image.\nAssistant:" + VISUAL_PROMPT_PREFIX = ( + "<|begin_of_text|>User:Describe the image.\nAssistant:" + ) def __init__( self, @@ -135,7 +141,9 @@ def load_onnx_model(self) -> None: preprocessor_config_path = self._model_dir / "preprocessor_config.json" with open(preprocessor_config_path) as f: preprocessor_config = json.load(f) - self.max_image_size = preprocessor_config.get("max_image_size", {}).get("longest_edge", 512) + self.max_image_size = preprocessor_config.get("max_image_size", {}).get( + "longest_edge", 512 + ) # Load model configuration config_path = self._model_dir / "config.json" @@ -255,7 +263,7 @@ def _compute_rows_cols_from_patches(self, patch_count: int) -> tuple[int, int]: grid_patches = patch_count - 1 # Find rows and cols (assume square or near-square grid) - rows = int(grid_patches ** 0.5) + rows = int(grid_patches**0.5) cols = grid_patches // rows # Verify the calculation @@ -286,7 +294,7 @@ def _create_split_image_prompt_string(self, rows: int, cols: int) -> str: for n_h in range(rows): for n_w in range(cols): text_split_images += ( - f"" + "" + f"" + "" * self.image_seq_len ) @@ -294,7 +302,7 @@ def _create_split_image_prompt_string(self, rows: int, cols: int) -> str: # Add global image at the end text_split_images += ( - f"\n" + "\n" + "" + "" * self.image_seq_len + "" @@ -417,6 +425,7 @@ def _get_text_worker_class(cls) -> Type[TextEmbeddingWorker[NumpyArray]]: def _get_image_worker_class(cls) -> Type[ImageEmbeddingWorker[NumpyArray]]: return ColModernVBERTmageEmbeddingWorker + class ColModernVBERTTextEmbeddingWorker(TextEmbeddingWorker[NumpyArray]): def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColModernVBERT: return ColModernVBERT( diff --git a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py index 
f123dc63..10d426d0 100644 --- a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py +++ b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py @@ -13,7 +13,10 @@ class LateInteractionMultimodalEmbedding(LateInteractionMultimodalEmbeddingBase): - EMBEDDINGS_REGISTRY: list[Type[LateInteractionMultimodalEmbeddingBase]] = [ColPali, ColModernVBERT] + EMBEDDINGS_REGISTRY: list[Type[LateInteractionMultimodalEmbeddingBase]] = [ + ColPali, + ColModernVBERT, + ] @classmethod def list_supported_models(cls) -> list[dict[str, Any]]: From 6470e35317d6ebc4576f4b609ae9504faf93ee3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Mon, 8 Dec 2025 13:29:51 +0100 Subject: [PATCH 09/24] mypy fixes --- fastembed/image/transform/operators.py | 16 ++++++++-------- .../colmodernvbert.py | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fastembed/image/transform/operators.py b/fastembed/image/transform/operators.py index cabff63d..3ee2d873 100644 --- a/fastembed/image/transform/operators.py +++ b/fastembed/image/transform/operators.py @@ -41,18 +41,18 @@ def __init__(self, mean: float | list[float], std: float | list[float]): self.mean = mean self.std = std - def __call__( + def __call__( # type: ignore[override] self, images: Union[list[NumpyArray], list[list[NumpyArray]]] ) -> Union[list[NumpyArray], list[list[NumpyArray]]]: if images and isinstance(images[0], list): # Nested structure from ImageSplitter return [ - [normalize(image, mean=self.mean, std=self.std) for image in img_patches] + [normalize(image, mean=self.mean, std=self.std) for image in img_patches] # type: ignore[arg-type] for img_patches in images ] else: # Flat structure (backward compatibility) - return [normalize(image, mean=self.mean, std=self.std) for image in images] + return [normalize(image, mean=self.mean, std=self.std) for image in images] # type: ignore[arg-type] class Resize(Transform): @@ -72,18 +72,18 @@ class Rescale(Transform): def __init__(self, scale: float = 1 / 255): self.scale = scale - def __call__( + def __call__( # type: ignore[override] self, images: Union[list[NumpyArray], list[list[NumpyArray]]] ) -> Union[list[NumpyArray], list[list[NumpyArray]]]: if images and isinstance(images[0], list): # Nested structure from ImageSplitter return [ - [rescale(image, scale=self.scale) for image in img_patches] + [rescale(image, scale=self.scale) for image in img_patches] # type: ignore[arg-type] for img_patches in images ] else: # Flat structure (backward compatibility) - return [rescale(image, scale=self.scale) for image in images] + return [rescale(image, scale=self.scale) for image in images] # type: ignore[arg-type] class PILtoNDarray(Transform): @@ -190,7 +190,7 @@ def __init__( self.max_size = max_size self.resample = resample - def __call__(self, images: list[NumpyArray]) -> list[list[NumpyArray]]: + def __call__(self, images: list[NumpyArray]) -> list[list[NumpyArray]]: # type: ignore[override] result = [] for image in images: @@ -256,7 +256,7 @@ def __init__( self.size = size self.resample = resample - def __call__(self, images: list[NumpyArray]) -> list[list[NumpyArray]]: + def __call__(self, images: list[NumpyArray]) -> list[list[NumpyArray]]: # type: ignore[override] return [ [ resize_ndarray( diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index d975f510..210c8084 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ 
b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -165,7 +165,7 @@ def _preprocess_onnx_text_input( Iterable[NumpyArray]: Post-processed output as NumPy arrays. """ batch_size, seq_length = onnx_input["input_ids"].shape - empty_image_placeholder: NumpyArray = np.zeros( + empty_image_placeholder: NumpyArray = np.zeros( # type: ignore[type-var] (batch_size, seq_length, 3, self.image_size, self.image_size), dtype=np.float32 ) onnx_input["pixel_values"] = empty_image_placeholder @@ -283,7 +283,7 @@ def _create_single_image_prompt_string(self) -> str: return ( "" + "" - + "" * self.image_seq_len + + "" * self.image_seq_len # type: ignore[operator] + "" ) @@ -296,7 +296,7 @@ def _create_split_image_prompt_string(self, rows: int, cols: int) -> str: text_split_images += ( "" + f"" - + "" * self.image_seq_len + + "" * self.image_seq_len # type: ignore[operator] ) text_split_images += "\n" @@ -304,7 +304,7 @@ def _create_split_image_prompt_string(self, rows: int, cols: int) -> str: text_split_images += ( "\n" + "" - + "" * self.image_seq_len + + "" * self.image_seq_len # type: ignore[operator] + "" ) From bf8931721d3f80f527ec8cc80f485671eab3c068 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Mon, 8 Dec 2025 13:32:56 +0100 Subject: [PATCH 10/24] mypy fixes --- fastembed/late_interaction_multimodal/colmodernvbert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index 210c8084..3051e962 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -165,7 +165,7 @@ def _preprocess_onnx_text_input( Iterable[NumpyArray]: Post-processed output as NumPy arrays. """ batch_size, seq_length = onnx_input["input_ids"].shape - empty_image_placeholder: NumpyArray = np.zeros( # type: ignore[type-var] + empty_image_placeholder: NumpyArray = np.zeros( # type: ignore[type-var,arg-type] (batch_size, seq_length, 3, self.image_size, self.image_size), dtype=np.float32 ) onnx_input["pixel_values"] = empty_image_placeholder From 39c7211d7a141cabd300c0b6c8b96f60e2035122 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Mon, 8 Dec 2025 13:34:53 +0100 Subject: [PATCH 11/24] mypy fixes --- fastembed/late_interaction_multimodal/colmodernvbert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index 3051e962..2d030387 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -165,7 +165,7 @@ def _preprocess_onnx_text_input( Iterable[NumpyArray]: Post-processed output as NumPy arrays. 
""" batch_size, seq_length = onnx_input["input_ids"].shape - empty_image_placeholder: NumpyArray = np.zeros( # type: ignore[type-var,arg-type] + empty_image_placeholder: NumpyArray = np.zeros( # type: ignore[type-var,arg-type,assignment] (batch_size, seq_length, 3, self.image_size, self.image_size), dtype=np.float32 ) onnx_input["pixel_values"] = empty_image_placeholder From 889a95bf3110be781da8bd48dc0c92c15f44b8ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Mon, 8 Dec 2025 13:37:08 +0100 Subject: [PATCH 12/24] mypy fixes --- fastembed/late_interaction_multimodal/colmodernvbert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index 2d030387..41048011 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -165,8 +165,8 @@ def _preprocess_onnx_text_input( Iterable[NumpyArray]: Post-processed output as NumPy arrays. """ batch_size, seq_length = onnx_input["input_ids"].shape - empty_image_placeholder: NumpyArray = np.zeros( # type: ignore[type-var,arg-type,assignment] - (batch_size, seq_length, 3, self.image_size, self.image_size), dtype=np.float32 + empty_image_placeholder: NumpyArray = np.zeros( + (batch_size, seq_length, 3, self.image_size, self.image_size), dtype=np.float32 # type: ignore[type-var,arg-type,assignment] ) onnx_input["pixel_values"] = empty_image_placeholder return onnx_input From 144d8671b4328998321d5d87f9deee02e3776634 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Tue, 9 Dec 2025 12:05:50 +0100 Subject: [PATCH 13/24] Fix typo in the class name --- fastembed/late_interaction_multimodal/colmodernvbert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index 41048011..890805c1 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -423,7 +423,7 @@ def _get_text_worker_class(cls) -> Type[TextEmbeddingWorker[NumpyArray]]: @classmethod def _get_image_worker_class(cls) -> Type[ImageEmbeddingWorker[NumpyArray]]: - return ColModernVBERTmageEmbeddingWorker + return ColModernVBERTImageEmbeddingWorker class ColModernVBERTTextEmbeddingWorker(TextEmbeddingWorker[NumpyArray]): @@ -436,7 +436,7 @@ def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColM ) -class ColModernVBERTmageEmbeddingWorker(ImageEmbeddingWorker[NumpyArray]): +class ColModernVBERTImageEmbeddingWorker(ImageEmbeddingWorker[NumpyArray]): def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColModernVBERT: return ColModernVBERT( model_name=model_name, From 23768f1b679e417a9652504835492502f3ea583d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Tue, 9 Dec 2025 13:19:58 +0100 Subject: [PATCH 14/24] Add processor_config.json to additional files --- fastembed/late_interaction_multimodal/colmodernvbert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index 890805c1..0a1f5d29 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -26,7 +26,7 @@ license="mit", 
size_in_GB=1.0, sources=ModelSource(hf="Qdrant/colmodernvbert"), - additional_files=["model.onnx_data"], + additional_files=["processor_config.json"], model_file="model.onnx", ), ] From 7bea5328d7ab5559f72d0e939cea85b2243f5c31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Tue, 9 Dec 2025 15:39:59 +0100 Subject: [PATCH 15/24] Fix mypy errors --- .../onnx_multimodal_model.py | 142 +++++++++++++----- tests/test_late_interaction_multimodal.py | 11 +- 2 files changed, 111 insertions(+), 42 deletions(-) diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py index cbfc09e8..6683a077 100644 --- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py +++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py @@ -178,42 +178,15 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu assert self.processor is not None, "Processor is not initialized" processed = self.processor(image_files) - # Handle nested structure (with image splitting) + # Dispatch to appropriate handler based on structure. + # ColModernVBERT processors divides the original image into + # subimages and processes them separately. if isinstance(processed[0], list): - # processed = [[img1_patches], [img2_patches], ...] - # Need shape: (batch_size, max_patches, C, H, W) - - patch_counts = [len(patches) for patches in processed] - max_patches = max(patch_counts) - - # Get dimensions from first patch - C, H, W = processed[0][0].shape - - # Create padded array - batch_size = len(processed) - encoded = np.zeros((batch_size, max_patches, C, H, W), dtype=processed[0][0].dtype) - - # Create attention mask (1 for real patches, 0 for padding) - attention_mask = np.zeros((batch_size, max_patches), dtype=np.int64) - - # Fill in patches and attention mask - for i, patches in enumerate(processed): - for j, patch in enumerate(patches): - encoded[i, j] = patch - attention_mask[i, j] = 1 - - # Track actual patch counts for later use - metadata = {"patch_counts": patch_counts} + encoded, attention_mask, metadata = self._process_nested_patches(processed) else: - # Flat structure (no splitting) - still need batch dimension - # Shape: (batch_size, 1, C, H, W) - encoded = np.array(processed) - if len(encoded.shape) == 4: # (batch_size, C, H, W) - encoded = encoded[:, np.newaxis, ...] # Add num_patches=1 dimension - - # All patches are real (no padding) - attention_mask = np.ones((len(images), encoded.shape[1]), dtype=np.int64) - metadata = {"patch_counts": [encoded.shape[1]] * len(images)} + encoded, attention_mask, metadata = self._process_flat_images( + processed, len(images) # type: ignore[arg-type] + ) onnx_input = {"pixel_values": encoded, "attention_mask": attention_mask} onnx_input = self._preprocess_onnx_image_input(onnx_input, **kwargs) @@ -221,10 +194,109 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu return OnnxOutputContext( model_output=model_output[0], - attention_mask=attention_mask, + attention_mask=attention_mask, # type: ignore[arg-type] metadata=metadata, ) + def _process_nested_patches( + self, processed: list[list[NumpyArray]] + ) -> tuple[NumpyArray, NumpyArray, dict[str, Any]]: + """ + Process nested image patches (from ImageSplitter). + + Args: + processed: List of patch lists, one per image [[img1_patches], [img2_patches], ...] 
+ + Returns: + tuple: (encoded array, attention_mask, metadata) + - encoded: (batch_size, max_patches, C, H, W) + - attention_mask: (batch_size, max_patches) with 1 for real patches, 0 for padding + - metadata: Dict with 'patch_counts' key + """ + patch_counts = [len(patches) for patches in processed] + max_patches = max(patch_counts) + + # Get dimensions from first patch + C, H, W = processed[0][0].shape + batch_size = len(processed) + + # Create padded array + encoded = np.zeros((batch_size, max_patches, C, H, W), dtype=processed[0][0].dtype) + + # Create attention mask (1 for real patches, 0 for padding) + attention_mask = np.zeros((batch_size, max_patches), dtype=np.int64) + + # Fill in patches and attention mask + for i, patches in enumerate(processed): + for j, patch in enumerate(patches): + encoded[i, j] = patch + attention_mask[i, j] = 1 + + metadata = {"patch_counts": patch_counts} + return encoded, attention_mask, metadata + + def _process_flat_images( + self, processed: list[NumpyArray], num_images: int + ) -> tuple[NumpyArray, NumpyArray, dict[str, Any]]: + """ + Process flat image arrays (from standard processors like SiglipImageProcessor). + + For models expecting 5D input (Idefics3-based), adds patch dimension. + For models expecting 4D input, keeps original shape. + + Args: + processed: List of image arrays + num_images: Number of images being processed + + Returns: + tuple: (encoded array, attention_mask, metadata) + - encoded: (batch_size, C, H, W) for 4D models OR (batch_size, 1, C, H, W) for 5D models + - attention_mask: (batch_size, 1) with all ones + - metadata: Dict with 'patch_counts' key + """ + encoded = np.array(processed) + + # Check if model needs patch dimension based on ONNX signature + if len(encoded.shape) == 4 and self._needs_patch_dimension(): + # Add patch dimension for Idefics3-based models: (batch, 1, C, H, W) + encoded = encoded[:, np.newaxis, ...] + + # Determine attention mask shape based on final tensor shape + if len(encoded.shape) == 5: + # 5D tensor: attention_mask shape is (batch, num_patches) + attention_mask = np.ones((num_images, encoded.shape[1]), dtype=np.int64) + metadata = {"patch_counts": [encoded.shape[1]] * num_images} + else: + # 4D tensor: attention_mask shape is (batch, 1) + attention_mask = np.ones((num_images, 1), dtype=np.int64) + metadata = {"patch_counts": [1] * num_images} + + return encoded, attention_mask, metadata # type: ignore[return-value] + + def _needs_patch_dimension(self) -> bool: + """ + Determine if this model needs the patch dimension by checking ONNX input shape. + + Idefics3-based models (like ColModernVBERT) need 5D tensors (batch_size, patch_count, C, H, W). + Earlier models (like ColPali v1.3) need 4D tensors (batch_size, C, H, W). 
+ + Returns: + bool: True if pixel_values input has 5 dimensions, False if 4 dimensions + """ + if not hasattr(self, "model") or self.model is None: + return False + + # Get pixel_values input metadata + for input_meta in self.model.get_inputs(): + if input_meta.name == "pixel_values": + # input_meta.shape is a list like + # ['batch_size', 'sequence_length', 'num_channels', 'height', 'width'] + # or ['batch_size', 'num_channels', 'height', 'width'] + return len(input_meta.shape) == 5 + + # Default to False for backward compatibility + return False + def _embed_images( self, model_name: str, diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py index 8aa3a44e..888586d3 100644 --- a/tests/test_late_interaction_multimodal.py +++ b/tests/test_late_interaction_multimodal.py @@ -48,13 +48,10 @@ ), "Qdrant/colmodernvbert": np.array( [ - [0.05, 0.0656, 0.0403, 0.1498, 0.1842, 0.0263, -0.1871], - [-0.0566, -0.1403, 0.0065, -0.0285, 0.0903, -0.0149, 0.1069], - [-0.1015, -0.0072, 0.0908, -0.0824, -0.0185, -0.0097, -0.0046], - [-0.1233, -0.1081, -0.0234, -0.0033, 0.0598, 0.0993, 0.0985], - [-0.0705, -0.1312, -0.0649, 0.0151, 0.0746, 0.0765, 0.1482], - [0.0053, -0.1384, -0.0584, -0.0272, 0.1301, 0.0508, 0.1796], - [0.0092, -0.1438, -0.0306, -0.0369, 0.1172, 0.037, 0.1334], + [0.0541, 0.0677, 0.0392, 0.1494, 0.1855, 0.0275, -0.1835, -0.1025, -0.1204, -0.0835], + [-0.0515, -0.1328, 0.0298, -0.0574, 0.0829, -0.0836, 0.0888, 0.0138, 0.0741, 0.0293], + [-0.1114, -0.0506, 0.0666, -0.1064, -0.0229, -0.0486, -0.007, 0.0932, 0.0054, 0.1113], + [0.2317, -0.0518, 0.0248, -0.0075, -0.078, 0.2073, -0.0912, -0.0622, -0.0203, 0.093] ] ), } From 96eb50bcbcf27c2bb603eb11f67fee86463ab18f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Tue, 9 Dec 2025 15:39:59 +0100 Subject: [PATCH 16/24] Refactor onnx_embed_image --- .../late_interaction_multimodal/onnx_multimodal_model.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py index 6683a077..f4299421 100644 --- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py +++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py @@ -184,9 +184,7 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu if isinstance(processed[0], list): encoded, attention_mask, metadata = self._process_nested_patches(processed) else: - encoded, attention_mask, metadata = self._process_flat_images( - processed, len(images) # type: ignore[arg-type] - ) + encoded, attention_mask, metadata = self._process_flat_images(processed, len(images)) onnx_input = {"pixel_values": encoded, "attention_mask": attention_mask} onnx_input = self._preprocess_onnx_image_input(onnx_input, **kwargs) @@ -194,7 +192,7 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu return OnnxOutputContext( model_output=model_output[0], - attention_mask=attention_mask, # type: ignore[arg-type] + attention_mask=attention_mask, metadata=metadata, ) @@ -271,7 +269,7 @@ def _process_flat_images( attention_mask = np.ones((num_images, 1), dtype=np.int64) metadata = {"patch_counts": [1] * num_images} - return encoded, attention_mask, metadata # type: ignore[return-value] + return encoded, attention_mask, metadata def _needs_patch_dimension(self) -> bool: """ From cf4100c9dedb59161beca7b4cdc9582860e059da Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Tue, 9 Dec 2025 16:11:03 +0100 Subject: [PATCH 17/24] Fix mypy errors --- .../onnx_multimodal_model.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py index f4299421..06af0586 100644 --- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py +++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py @@ -184,7 +184,10 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu if isinstance(processed[0], list): encoded, attention_mask, metadata = self._process_nested_patches(processed) else: - encoded, attention_mask, metadata = self._process_flat_images(processed, len(images)) + encoded, attention_mask, metadata = self._process_flat_images( + processed, # type: ignore[arg-type] + len(images), + ) onnx_input = {"pixel_values": encoded, "attention_mask": attention_mask} onnx_input = self._preprocess_onnx_image_input(onnx_input, **kwargs) @@ -192,7 +195,7 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu return OnnxOutputContext( model_output=model_output[0], - attention_mask=attention_mask, + attention_mask=attention_mask, # type: ignore[arg-type] metadata=metadata, ) @@ -231,7 +234,7 @@ def _process_nested_patches( attention_mask[i, j] = 1 metadata = {"patch_counts": patch_counts} - return encoded, attention_mask, metadata + return encoded, attention_mask, metadata # type: ignore[return-value] def _process_flat_images( self, processed: list[NumpyArray], num_images: int @@ -269,7 +272,7 @@ def _process_flat_images( attention_mask = np.ones((num_images, 1), dtype=np.int64) metadata = {"patch_counts": [1] * num_images} - return encoded, attention_mask, metadata + return encoded, attention_mask, metadata # type: ignore[return-value] def _needs_patch_dimension(self) -> bool: """ From 5a6d8841f3dcac88b39df000403237dbfc7fe459 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Fri, 2 Jan 2026 17:37:01 +0100 Subject: [PATCH 18/24] fix: colmodernvbert tests and query processing --- .../colmodernvbert.py | 5 +++- tests/test_late_interaction_multimodal.py | 25 +++++++++++-------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index 0a1f5d29..3ef46f55 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -43,6 +43,7 @@ class ColModernVBERT(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel VISUAL_PROMPT_PREFIX = ( "<|begin_of_text|>User:Describe the image.\nAssistant:" ) + QUERY_AUGMENTATION_TOKEN = "" def __init__( self, @@ -187,7 +188,9 @@ def _post_process_onnx_text_output( return output.model_output def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]: - encoded = self.tokenizer.encode_batch(documents) # type: ignore[union-attr] + # Add query augmentation tokens (matching process_queries logic from colpali-engine) + augmented_queries = [doc + self.QUERY_AUGMENTATION_TOKEN * 10 for doc in documents] + encoded = self.tokenizer.encode_batch(augmented_queries) # type: ignore[union-attr] return encoded def _preprocess_onnx_image_input( diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py index 888586d3..ac4f9eeb 100644 --- 
a/tests/test_late_interaction_multimodal.py +++ b/tests/test_late_interaction_multimodal.py @@ -23,13 +23,13 @@ ), "Qdrant/colmodernvbert": np.array( [ - [0.2256, -0.0503, 0.0254, -0.011, -0.0786, 0.2152, -0.0961], - [-0.0028, -0.0484, -0.0724, -0.0724, -0.0977, 0.0308, -0.0236], - [0.0035, -0.1075, -0.0877, -0.0207, -0.0828, -0.0294, -0.0253], - [0.0021, -0.0797, -0.0605, -0.0008, -0.0837, 0.0015, -0.0846], - [-0.0473, -0.0594, -0.0553, -0.0014, -0.0712, 0.0158, -0.0546], - [-0.1009, -0.082, -0.0684, -0.1385, -0.0469, -0.0606, -0.0323], - [-0.0624, 0.006, -0.0498, -0.0127, -0.1115, 0.0076, -0.0888], + [0.11614, -0.15793, -0.11194, 0.0688, 0.08001, 0.10575, -0.07871], + [0.10094, -0.13301, -0.12069, 0.10932, 0.04645, 0.09884, 0.04048], + [0.13106, -0.18613, -0.13469, 0.10566, 0.03659, 0.07712, -0.03916], + [0.09754, -0.09596, -0.04839, 0.14991, 0.05692, 0.10569, -0.08349], + [0.02576, -0.15651, -0.09977, 0.09707, 0.13412, 0.09994, -0.09931], + [-0.06741, -0.1787, -0.19677, -0.07618, 0.13102, -0.02131, -0.02437], + [-0.02776, -0.10187, -0.13793, 0.03835, 0.04766, 0.04701, -0.15635], ] ), } @@ -48,10 +48,13 @@ ), "Qdrant/colmodernvbert": np.array( [ - [0.0541, 0.0677, 0.0392, 0.1494, 0.1855, 0.0275, -0.1835, -0.1025, -0.1204, -0.0835], - [-0.0515, -0.1328, 0.0298, -0.0574, 0.0829, -0.0836, 0.0888, 0.0138, 0.0741, 0.0293], - [-0.1114, -0.0506, 0.0666, -0.1064, -0.0229, -0.0486, -0.007, 0.0932, 0.0054, 0.1113], - [0.2317, -0.0518, 0.0248, -0.0075, -0.078, 0.2073, -0.0912, -0.0622, -0.0203, 0.093] + [0.05, 0.06557, 0.04026, 0.14981, 0.1842, 0.0263, -0.18706], + [-0.05664, -0.14028, 0.00649, -0.02849, 0.09034, -0.01494, 0.10693], + [-0.10147, -0.00716, 0.09084, -0.08236, -0.01849, -0.00972, -0.00461], + [-0.1233, -0.10814, -0.02337, -0.00329, 0.05984, 0.09934, 0.09846], + [-0.07053, -0.13119, -0.06487, 0.01508, 0.07459, 0.07655, 0.14821], + [0.00526, -0.13842, -0.05837, -0.02721, 0.13009, 0.05076, 0.17962], + [0.00924, -0.14383, -0.03057, -0.03691, 0.11718, 0.037, 0.13344], ] ), } From 2dc7de8a0c29a49c13dd0b9787f6bc49532486a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Fri, 2 Jan 2026 21:13:47 +0100 Subject: [PATCH 19/24] fix: remove Union references --- fastembed/image/transform/functional.py | 4 ++-- fastembed/image/transform/operators.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fastembed/image/transform/functional.py b/fastembed/image/transform/functional.py index 86dfbe08..9d9e2197 100644 --- a/fastembed/image/transform/functional.py +++ b/fastembed/image/transform/functional.py @@ -150,7 +150,7 @@ def pad2square( def resize_longest_edge( image: Image.Image, max_size: int, - resample: Union[int, Image.Resampling] = Image.Resampling.LANCZOS, + resample: int | Image.Resampling = Image.Resampling.LANCZOS, ) -> Image.Image: height, width = image.height, image.width aspect_ratio = width / height @@ -192,7 +192,7 @@ def crop_ndarray( def resize_ndarray( image: NumpyArray, size: tuple[int, int], - resample: Union[int, Image.Resampling] = Image.Resampling.LANCZOS, + resample: int | Image.Resampling = Image.Resampling.LANCZOS, channel_first: bool = True, ) -> NumpyArray: # Convert to PIL-friendly format (H, W, C) diff --git a/fastembed/image/transform/operators.py b/fastembed/image/transform/operators.py index 3ee2d873..e6ba4d95 100644 --- a/fastembed/image/transform/operators.py +++ b/fastembed/image/transform/operators.py @@ -42,8 +42,8 @@ def __init__(self, mean: float | list[float], std: float | list[float]): self.std = std def 
__call__( # type: ignore[override] - self, images: Union[list[NumpyArray], list[list[NumpyArray]]] - ) -> Union[list[NumpyArray], list[list[NumpyArray]]]: + self, images: list[NumpyArray] | list[list[NumpyArray]] + ) -> list[NumpyArray] | list[list[NumpyArray]]: if images and isinstance(images[0], list): # Nested structure from ImageSplitter return [ @@ -73,8 +73,8 @@ def __init__(self, scale: float = 1 / 255): self.scale = scale def __call__( # type: ignore[override] - self, images: Union[list[NumpyArray], list[list[NumpyArray]]] - ) -> Union[list[NumpyArray], list[list[NumpyArray]]]: + self, images: list[NumpyArray] | list[list[NumpyArray]] + ) -> list[NumpyArray] | list[list[NumpyArray]]: if images and isinstance(images[0], list): # Nested structure from ImageSplitter return [ From 46453a26a918a6389a078327807af97ebf29855c Mon Sep 17 00:00:00 2001 From: George Panchuk Date: Thu, 8 Jan 2026 17:12:23 +0700 Subject: [PATCH 20/24] fix: fix exit stack, update tests, implement token count --- fastembed/image/onnx_image_model.py | 6 +- .../colmodernvbert.py | 31 +++- .../onnx_multimodal_model.py | 6 +- tests/test_late_interaction_multimodal.py | 161 ++++++++++-------- 4 files changed, 120 insertions(+), 84 deletions(-) diff --git a/fastembed/image/onnx_image_model.py b/fastembed/image/onnx_image_model.py index 86326da9..deddcf73 100644 --- a/fastembed/image/onnx_image_model.py +++ b/fastembed/image/onnx_image_model.py @@ -76,9 +76,11 @@ def _build_onnx_input(self, encoded: NumpyArray) -> dict[str, NumpyArray]: return {input_name: encoded} def onnx_embed(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutputContext: - with contextlib.ExitStack(): + with contextlib.ExitStack() as stack: image_files = [ - Image.open(image) if not isinstance(image, Image.Image) else image + stack.enter_context(Image.open(image)) + if not isinstance(image, Image.Image) + else image for image in images ] assert self.processor is not None, "Processor is not initialized" diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index 3ef46f55..7ab84f22 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -1,4 +1,4 @@ -from typing import Any, Iterable, Type, Union, Optional, Sequence +from typing import Any, Iterable, Type, Optional, Sequence import json import numpy as np @@ -8,7 +8,7 @@ from fastembed.common.model_description import DenseModelDescription, ModelSource from fastembed.common.onnx_model import OnnxOutputContext from fastembed.common.types import NumpyArray, OnnxProvider -from fastembed.common.utils import define_cache_dir +from fastembed.common.utils import define_cache_dir, iter_batch from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import ( LateInteractionMultimodalEmbeddingBase, ) @@ -167,7 +167,8 @@ def _preprocess_onnx_text_input( """ batch_size, seq_length = onnx_input["input_ids"].shape empty_image_placeholder: NumpyArray = np.zeros( - (batch_size, seq_length, 3, self.image_size, self.image_size), dtype=np.float32 # type: ignore[type-var,arg-type,assignment] + (batch_size, seq_length, 3, self.image_size, self.image_size), + dtype=np.float32, # type: ignore[type-var,arg-type,assignment] ) onnx_input["pixel_values"] = empty_image_placeholder return onnx_input @@ -193,6 +194,23 @@ def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]: encoded = self.tokenizer.encode_batch(augmented_queries) # 
type: ignore[union-attr] return encoded + def token_count( + self, + texts: str | Iterable[str], + batch_size: int = 1024, + include_extension: bool = False, + **kwargs: Any, + ) -> int: + if not hasattr(self, "model") or self.model is None: + self.load_onnx_model() # loads the tokenizer as well + token_num = 0 + texts = [texts] if isinstance(texts, str) else texts + assert self.tokenizer is not None + tokenize_func = self.tokenize if include_extension else self.tokenizer.encode_batch + for batch in iter_batch(texts, batch_size): + token_num += sum([sum(encoding.attention_mask) for encoding in tokenize_func(batch)]) + return token_num + def _preprocess_onnx_image_input( self, onnx_input: dict[str, np.ndarray], **kwargs: Any ) -> dict[str, NumpyArray]: @@ -258,7 +276,8 @@ def _preprocess_onnx_image_input( onnx_input["attention_mask"] = attention_mask return onnx_input - def _compute_rows_cols_from_patches(self, patch_count: int) -> tuple[int, int]: + @staticmethod + def _compute_rows_cols_from_patches(patch_count: int) -> tuple[int, int]: if patch_count <= 1: return 0, 0 @@ -350,7 +369,7 @@ def _post_process_onnx_image_output( def embed_text( self, - documents: Union[str, Iterable[str]], + documents: str | Iterable[str], batch_size: int = 256, parallel: Optional[int] = None, **kwargs: Any, @@ -386,7 +405,7 @@ def embed_text( def embed_image( self, - images: Union[ImageInput, Iterable[ImageInput]], + images: ImageInput | Iterable[ImageInput], batch_size: int = 16, parallel: Optional[int] = None, **kwargs: Any, diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py index 06af0586..bfe81fdc 100644 --- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py +++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py @@ -170,9 +170,11 @@ def _embed_documents( yield from self._post_process_onnx_text_output(batch) # type: ignore def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutputContext: - with contextlib.ExitStack(): + with contextlib.ExitStack() as stack: image_files = [ - Image.open(image) if not isinstance(image, Image.Image) else image + stack.enter_context(Image.open(image)) + if not isinstance(image, Image.Image) + else image for image in images ] assert self.processor is not None, "Processor is not initialized" diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py index ac4f9eeb..29d550fa 100644 --- a/tests/test_late_interaction_multimodal.py +++ b/tests/test_late_interaction_multimodal.py @@ -1,4 +1,5 @@ import os +from contextlib import contextmanager import pytest from PIL import Image @@ -6,21 +7,21 @@ from fastembed import LateInteractionMultimodalEmbedding from tests.config import TEST_MISC_DIR - +from tests.utils import delete_model_cache # vectors are abridged and rounded for brevity CANONICAL_IMAGE_VALUES = { - "Qdrant/colpali-v1.3-fp16": np.array( - [ - [-0.0345, -0.022, 0.0567, -0.0518, -0.0782, 0.1714, -0.1738], - [-0.1181, -0.099, 0.0268, 0.0774, 0.0228, 0.0563, -0.1021], - [-0.117, -0.0683, 0.0371, 0.0921, 0.0107, 0.0659, -0.0666], - [-0.1393, -0.0948, 0.037, 0.0951, -0.0126, 0.0678, -0.087], - [-0.0957, -0.081, 0.0404, 0.052, 0.0409, 0.0335, -0.064], - [-0.0626, -0.0445, 0.056, 0.0592, -0.0229, 0.0409, -0.0301], - [-0.1299, -0.0691, 0.1097, 0.0728, 0.0123, 0.0519, 0.0122], - ] - ), + # "Qdrant/colpali-v1.3-fp16": np.array( + # [ + # [-0.0345, -0.022, 0.0567, -0.0518, -0.0782, 0.1714, -0.1738], + # 
[-0.1181, -0.099, 0.0268, 0.0774, 0.0228, 0.0563, -0.1021], + # [-0.117, -0.0683, 0.0371, 0.0921, 0.0107, 0.0659, -0.0666], + # [-0.1393, -0.0948, 0.037, 0.0951, -0.0126, 0.0678, -0.087], + # [-0.0957, -0.081, 0.0404, 0.052, 0.0409, 0.0335, -0.064], + # [-0.0626, -0.0445, 0.056, 0.0592, -0.0229, 0.0409, -0.0301], + # [-0.1299, -0.0691, 0.1097, 0.0728, 0.0123, 0.0519, 0.0122], + # ] + # ), "Qdrant/colmodernvbert": np.array( [ [0.11614, -0.15793, -0.11194, 0.0688, 0.08001, 0.10575, -0.07871], @@ -35,17 +36,17 @@ } CANONICAL_QUERY_VALUES = { - "Qdrant/colpali-v1.3-fp16": np.array( - [ - [-0.0023, 0.1477, 0.1594, 0.046, -0.0196, 0.0554, 0.1567], - [-0.0139, -0.0057, 0.0932, 0.0052, -0.0678, 0.0131, 0.0537], - [0.0054, 0.0364, 0.2078, -0.074, 0.0355, 0.061, 0.1593], - [-0.0076, -0.0154, 0.2266, 0.0103, 0.0089, -0.024, 0.098], - [-0.0274, 0.0098, 0.2106, -0.0634, 0.0616, -0.0021, 0.0708], - [0.0074, 0.0025, 0.1631, -0.0802, 0.0418, -0.0219, 0.1022], - [-0.0165, -0.0106, 0.1672, -0.0768, 0.0389, -0.0038, 0.1137], - ] - ), + # "Qdrant/colpali-v1.3-fp16": np.array( + # [ + # [-0.0023, 0.1477, 0.1594, 0.046, -0.0196, 0.0554, 0.1567], + # [-0.0139, -0.0057, 0.0932, 0.0052, -0.0678, 0.0131, 0.0537], + # [0.0054, 0.0364, 0.2078, -0.074, 0.0355, 0.061, 0.1593], + # [-0.0076, -0.0154, 0.2266, 0.0103, 0.0089, -0.024, 0.098], + # [-0.0274, 0.0098, 0.2106, -0.0634, 0.0616, -0.0021, 0.0708], + # [0.0074, 0.0025, 0.1631, -0.0802, 0.0418, -0.0219, 0.1022], + # [-0.0165, -0.0106, 0.1672, -0.0768, 0.0389, -0.0038, 0.1137], + # ] + # ), "Qdrant/colmodernvbert": np.array( [ [0.05, 0.06557, 0.04026, 0.14981, 0.1842, 0.0263, -0.18706], @@ -66,43 +67,68 @@ Image.open((TEST_MISC_DIR / "image.jpeg")), ] +MODELS_TO_CACHE = ("Qdrant/colmodernvbert",) -def test_batch_embedding(): - if os.getenv("CI"): - pytest.skip("Colpali is too large to test in CI") - for model_name, expected_result in CANONICAL_IMAGE_VALUES.items(): - print("evaluating", model_name) - model = LateInteractionMultimodalEmbedding(model_name=model_name) - result = list(model.embed_image(images, batch_size=2)) +@pytest.fixture(scope="module") +def model_cache(): + is_ci = os.getenv("CI") + cache = {} - for value in result: - token_num, abridged_dim = expected_result.shape - assert np.allclose(value[:token_num, :abridged_dim], expected_result, atol=2e-3) + @contextmanager + def get_model(model_name: str): + lowercase_model_name = model_name.lower() + if lowercase_model_name not in cache: + cache[lowercase_model_name] = LateInteractionMultimodalEmbedding(lowercase_model_name) + yield cache[lowercase_model_name] + if lowercase_model_name not in MODELS_TO_CACHE: + model_inst = cache.pop(lowercase_model_name) + if is_ci: + delete_model_cache(model_inst.model._model_dir) + del model_inst + yield get_model -def test_single_embedding(): - if os.getenv("CI"): - pytest.skip("Colpali is too large to test in CI") + if is_ci: + for name, model in cache.items(): + delete_model_cache(model.model._model_dir) + cache.clear() + +def test_batch_embedding(model_cache): for model_name, expected_result in CANONICAL_IMAGE_VALUES.items(): + if model_name.lower() == "Qdrant/colpali-v1.3-fp16".lower() and os.getenv("CI"): + continue # colpali is too large for ci + print("evaluating", model_name) - model = LateInteractionMultimodalEmbedding(model_name=model_name) - result = next(iter(model.embed_image(images, batch_size=6))) - token_num, abridged_dim = expected_result.shape - assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3) + with 
model_cache(model_name) as model: + result = list(model.embed_image(images, batch_size=2)) + for value in result: + token_num, abridged_dim = expected_result.shape + assert np.allclose(value[:token_num, :abridged_dim], expected_result, atol=2e-3) + + +def test_single_embedding(model_cache): + for model_name, expected_result in CANONICAL_IMAGE_VALUES.items(): + if model_name.lower() == "Qdrant/colpali-v1.3-fp16".lower() and os.getenv("CI"): + continue # colpali is too large for ci + print("evaluating", model_name) + with model_cache(model_name) as model: + result = next(iter(model.embed_image(images, batch_size=6))) + token_num, abridged_dim = expected_result.shape + assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3) -def test_single_embedding_query(): - if os.getenv("CI"): - pytest.skip("Colpali is too large to test in CI") +def test_single_embedding_query(model_cache): for model_name, expected_result in CANONICAL_QUERY_VALUES.items(): + if model_name.lower() == "Qdrant/colpali-v1.3-fp16".lower() and os.getenv("CI"): + continue # colpali is too large for ci print("evaluating", model_name) - model = LateInteractionMultimodalEmbedding(model_name=model_name) - result = next(iter(model.embed_text(queries))) - token_num, abridged_dim = expected_result.shape - assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3) + with model_cache(model_name) as model: + result = next(iter(model.embed_text(queries))) + token_num, abridged_dim = expected_result.shape + assert np.allclose(result[:token_num, :abridged_dim], expected_result, atol=2e-3) def test_get_embedding_size(): @@ -117,35 +143,22 @@ def test_get_embedding_size(): def test_embedding_size(): - if os.getenv("CI"): - pytest.skip("Colpali is too large to test in CI") - model_name = "Qdrant/colpali-v1.3-fp16" - model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True) - assert model.embedding_size == 128 - - model_name = "Qdrant/ColPali-v1.3-fp16" - model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True) - assert model.embedding_size == 128 - model_name = "Qdrant/colmodernvbert" model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True) assert model.embedding_size == 128 -def test_token_count() -> None: - if os.getenv("CI"): - pytest.skip("Colpali is too large to test in CI") - model_name = "Qdrant/colpali-v1.3-fp16" - model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True) - - documents = ["short doc", "it is a long document to check attention mask for paddings"] - short_doc_token_count = model.token_count(documents[0]) - long_doc_token_count = model.token_count(documents[1]) - documents_token_count = model.token_count(documents) - assert short_doc_token_count + long_doc_token_count == documents_token_count - assert short_doc_token_count + long_doc_token_count == model.token_count( - documents, batch_size=1 - ) - assert short_doc_token_count + long_doc_token_count < model.token_count( - documents, include_extension=True - ) +def test_token_count(model_cache) -> None: + model_name = "Qdrant/colmodernvbert" + with model_cache(model_name) as model: + documents = ["short doc", "it is a long document to check attention mask for paddings"] + short_doc_token_count = model.token_count(documents[0]) + long_doc_token_count = model.token_count(documents[1]) + documents_token_count = model.token_count(documents) + assert short_doc_token_count + long_doc_token_count == documents_token_count + assert 
short_doc_token_count + long_doc_token_count == model.token_count( + documents, batch_size=1 + ) + assert short_doc_token_count + long_doc_token_count < model.token_count( + documents, include_extension=True + ) From 8f6f057b08f6d1b6761003dc01e869333f87a684 Mon Sep 17 00:00:00 2001 From: George Panchuk Date: Fri, 9 Jan 2026 16:33:20 +0700 Subject: [PATCH 21/24] fix: uncomment colpali in tests --- tests/test_late_interaction_multimodal.py | 44 +++++++++++------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py index 29d550fa..248352d4 100644 --- a/tests/test_late_interaction_multimodal.py +++ b/tests/test_late_interaction_multimodal.py @@ -11,17 +11,17 @@ # vectors are abridged and rounded for brevity CANONICAL_IMAGE_VALUES = { - # "Qdrant/colpali-v1.3-fp16": np.array( - # [ - # [-0.0345, -0.022, 0.0567, -0.0518, -0.0782, 0.1714, -0.1738], - # [-0.1181, -0.099, 0.0268, 0.0774, 0.0228, 0.0563, -0.1021], - # [-0.117, -0.0683, 0.0371, 0.0921, 0.0107, 0.0659, -0.0666], - # [-0.1393, -0.0948, 0.037, 0.0951, -0.0126, 0.0678, -0.087], - # [-0.0957, -0.081, 0.0404, 0.052, 0.0409, 0.0335, -0.064], - # [-0.0626, -0.0445, 0.056, 0.0592, -0.0229, 0.0409, -0.0301], - # [-0.1299, -0.0691, 0.1097, 0.0728, 0.0123, 0.0519, 0.0122], - # ] - # ), + "Qdrant/colpali-v1.3-fp16": np.array( + [ + [-0.0345, -0.022, 0.0567, -0.0518, -0.0782, 0.1714, -0.1738], + [-0.1181, -0.099, 0.0268, 0.0774, 0.0228, 0.0563, -0.1021], + [-0.117, -0.0683, 0.0371, 0.0921, 0.0107, 0.0659, -0.0666], + [-0.1393, -0.0948, 0.037, 0.0951, -0.0126, 0.0678, -0.087], + [-0.0957, -0.081, 0.0404, 0.052, 0.0409, 0.0335, -0.064], + [-0.0626, -0.0445, 0.056, 0.0592, -0.0229, 0.0409, -0.0301], + [-0.1299, -0.0691, 0.1097, 0.0728, 0.0123, 0.0519, 0.0122], + ] + ), "Qdrant/colmodernvbert": np.array( [ [0.11614, -0.15793, -0.11194, 0.0688, 0.08001, 0.10575, -0.07871], @@ -36,17 +36,17 @@ } CANONICAL_QUERY_VALUES = { - # "Qdrant/colpali-v1.3-fp16": np.array( - # [ - # [-0.0023, 0.1477, 0.1594, 0.046, -0.0196, 0.0554, 0.1567], - # [-0.0139, -0.0057, 0.0932, 0.0052, -0.0678, 0.0131, 0.0537], - # [0.0054, 0.0364, 0.2078, -0.074, 0.0355, 0.061, 0.1593], - # [-0.0076, -0.0154, 0.2266, 0.0103, 0.0089, -0.024, 0.098], - # [-0.0274, 0.0098, 0.2106, -0.0634, 0.0616, -0.0021, 0.0708], - # [0.0074, 0.0025, 0.1631, -0.0802, 0.0418, -0.0219, 0.1022], - # [-0.0165, -0.0106, 0.1672, -0.0768, 0.0389, -0.0038, 0.1137], - # ] - # ), + "Qdrant/colpali-v1.3-fp16": np.array( + [ + [-0.0023, 0.1477, 0.1594, 0.046, -0.0196, 0.0554, 0.1567], + [-0.0139, -0.0057, 0.0932, 0.0052, -0.0678, 0.0131, 0.0537], + [0.0054, 0.0364, 0.2078, -0.074, 0.0355, 0.061, 0.1593], + [-0.0076, -0.0154, 0.2266, 0.0103, 0.0089, -0.024, 0.098], + [-0.0274, 0.0098, 0.2106, -0.0634, 0.0616, -0.0021, 0.0708], + [0.0074, 0.0025, 0.1631, -0.0802, 0.0418, -0.0219, 0.1022], + [-0.0165, -0.0106, 0.1672, -0.0768, 0.0389, -0.0038, 0.1137], + ] + ), "Qdrant/colmodernvbert": np.array( [ [0.05, 0.06557, 0.04026, 0.14981, 0.1842, 0.0263, -0.18706], From 01965c933b6c11d7b99bb7b4f610d2fc0ba17160 Mon Sep 17 00:00:00 2001 From: George Panchuk Date: Fri, 9 Jan 2026 17:02:00 +0700 Subject: [PATCH 22/24] fix: lowercase models to cache --- tests/test_late_interaction_multimodal.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py index 248352d4..bf400dfa 100644 --- a/tests/test_late_interaction_multimodal.py +++ 
b/tests/test_late_interaction_multimodal.py @@ -67,7 +67,8 @@ Image.open((TEST_MISC_DIR / "image.jpeg")), ] -MODELS_TO_CACHE = ("Qdrant/colmodernvbert",) +_MODELS_TO_CACHE = ("Qdrant/colmodernvbert",) +MODELS_TO_CACHE = (model_name.lower() for model_name in _MODELS_TO_CACHE) @pytest.fixture(scope="module") @@ -90,7 +91,7 @@ def get_model(model_name: str): yield get_model if is_ci: - for name, model in cache.items(): + for _, model in cache.items(): delete_model_cache(model.model._model_dir) cache.clear() From ef9c496dcc06aee1d0058cb9bfc7f48adf12504c Mon Sep 17 00:00:00 2001 From: George Panchuk Date: Fri, 9 Jan 2026 17:23:56 +0700 Subject: [PATCH 23/24] fix: fix models to cache --- tests/test_late_interaction_multimodal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py index bf400dfa..94ae47e7 100644 --- a/tests/test_late_interaction_multimodal.py +++ b/tests/test_late_interaction_multimodal.py @@ -68,7 +68,7 @@ ] _MODELS_TO_CACHE = ("Qdrant/colmodernvbert",) -MODELS_TO_CACHE = (model_name.lower() for model_name in _MODELS_TO_CACHE) +MODELS_TO_CACHE = tuple(model_name.lower() for model_name in _MODELS_TO_CACHE) @pytest.fixture(scope="module") From 8b9f50c32b87dad0f24819e7e94aa8d4c2446342 Mon Sep 17 00:00:00 2001 From: George Panchuk Date: Fri, 9 Jan 2026 18:21:59 +0700 Subject: [PATCH 24/24] refactor: move colmodernvbert related onnx embed to its class --- .../colmodernvbert.py | 64 +++++++++ .../onnx_multimodal_model.py | 123 +----------------- 2 files changed, 68 insertions(+), 119 deletions(-) diff --git a/fastembed/late_interaction_multimodal/colmodernvbert.py b/fastembed/late_interaction_multimodal/colmodernvbert.py index 7ab84f22..20b8e4f7 100644 --- a/fastembed/late_interaction_multimodal/colmodernvbert.py +++ b/fastembed/late_interaction_multimodal/colmodernvbert.py @@ -1,8 +1,10 @@ +import contextlib from typing import Any, Iterable, Type, Optional, Sequence import json import numpy as np from tokenizers import Encoding +from PIL import Image from fastembed.common import ImageInput from fastembed.common.model_description import DenseModelDescription, ModelSource @@ -211,6 +213,68 @@ def token_count( token_num += sum([sum(encoding.attention_mask) for encoding in tokenize_func(batch)]) return token_num + def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutputContext: + with contextlib.ExitStack() as stack: + image_files = [ + stack.enter_context(Image.open(image)) + if not isinstance(image, Image.Image) + else image + for image in images + ] + assert self.processor is not None, "Processor is not initialized" + processed = self.processor(image_files) + encoded, attention_mask, metadata = self._process_nested_patches(processed) # type: ignore[arg-type] + + onnx_input = {"pixel_values": encoded, "attention_mask": attention_mask} + onnx_input = self._preprocess_onnx_image_input(onnx_input, **kwargs) + model_output = self.model.run(None, onnx_input) # type: ignore[union-attr] + + return OnnxOutputContext( + model_output=model_output[0], + attention_mask=attention_mask, # type: ignore[arg-type] + metadata=metadata, + ) + + @staticmethod + def _process_nested_patches( + processed: list[list[NumpyArray]], + ) -> tuple[NumpyArray, NumpyArray, dict[str, Any]]: + """ + Process nested image patches (from ImageSplitter). + + Args: + processed: List of patch lists, one per image [[img1_patches], [img2_patches], ...] 
+ + Returns: + tuple: (encoded array, attention_mask, metadata) + - encoded: (batch_size, max_patches, C, H, W) + - attention_mask: (batch_size, max_patches) with 1 for real patches, 0 for padding + - metadata: Dict with 'patch_counts' key + """ + patch_counts = [len(patches) for patches in processed] + max_patches = max(patch_counts) + + # Get dimensions from first patch + channels, height, width = processed[0][0].shape + batch_size = len(processed) + + # Create padded array + encoded = np.zeros( + (batch_size, max_patches, channels, height, width), dtype=processed[0][0].dtype + ) + + # Create attention mask (1 for real patches, 0 for padding) + attention_mask = np.zeros((batch_size, max_patches), dtype=np.int64) + + # Fill in patches and attention mask + for i, patches in enumerate(processed): + for j, patch in enumerate(patches): + encoded[i, j] = patch + attention_mask[i, j] = 1 + + metadata = {"patch_counts": patch_counts} + return encoded, attention_mask, metadata # type: ignore[return-value] + def _preprocess_onnx_image_input( self, onnx_input: dict[str, np.ndarray], **kwargs: Any ) -> dict[str, NumpyArray]: diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py index bfe81fdc..93436895 100644 --- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py +++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py @@ -178,127 +178,12 @@ def onnx_embed_image(self, images: list[ImageInput], **kwargs: Any) -> OnnxOutpu for image in images ] assert self.processor is not None, "Processor is not initialized" - processed = self.processor(image_files) - - # Dispatch to appropriate handler based on structure. - # ColModernVBERT processors divides the original image into - # subimages and processes them separately. - if isinstance(processed[0], list): - encoded, attention_mask, metadata = self._process_nested_patches(processed) - else: - encoded, attention_mask, metadata = self._process_flat_images( - processed, # type: ignore[arg-type] - len(images), - ) - - onnx_input = {"pixel_values": encoded, "attention_mask": attention_mask} + encoded = np.array(self.processor(image_files)) + onnx_input = {"pixel_values": encoded} onnx_input = self._preprocess_onnx_image_input(onnx_input, **kwargs) model_output = self.model.run(None, onnx_input) # type: ignore[union-attr] - - return OnnxOutputContext( - model_output=model_output[0], - attention_mask=attention_mask, # type: ignore[arg-type] - metadata=metadata, - ) - - def _process_nested_patches( - self, processed: list[list[NumpyArray]] - ) -> tuple[NumpyArray, NumpyArray, dict[str, Any]]: - """ - Process nested image patches (from ImageSplitter). - - Args: - processed: List of patch lists, one per image [[img1_patches], [img2_patches], ...] 
- - Returns: - tuple: (encoded array, attention_mask, metadata) - - encoded: (batch_size, max_patches, C, H, W) - - attention_mask: (batch_size, max_patches) with 1 for real patches, 0 for padding - - metadata: Dict with 'patch_counts' key - """ - patch_counts = [len(patches) for patches in processed] - max_patches = max(patch_counts) - - # Get dimensions from first patch - C, H, W = processed[0][0].shape - batch_size = len(processed) - - # Create padded array - encoded = np.zeros((batch_size, max_patches, C, H, W), dtype=processed[0][0].dtype) - - # Create attention mask (1 for real patches, 0 for padding) - attention_mask = np.zeros((batch_size, max_patches), dtype=np.int64) - - # Fill in patches and attention mask - for i, patches in enumerate(processed): - for j, patch in enumerate(patches): - encoded[i, j] = patch - attention_mask[i, j] = 1 - - metadata = {"patch_counts": patch_counts} - return encoded, attention_mask, metadata # type: ignore[return-value] - - def _process_flat_images( - self, processed: list[NumpyArray], num_images: int - ) -> tuple[NumpyArray, NumpyArray, dict[str, Any]]: - """ - Process flat image arrays (from standard processors like SiglipImageProcessor). - - For models expecting 5D input (Idefics3-based), adds patch dimension. - For models expecting 4D input, keeps original shape. - - Args: - processed: List of image arrays - num_images: Number of images being processed - - Returns: - tuple: (encoded array, attention_mask, metadata) - - encoded: (batch_size, C, H, W) for 4D models OR (batch_size, 1, C, H, W) for 5D models - - attention_mask: (batch_size, 1) with all ones - - metadata: Dict with 'patch_counts' key - """ - encoded = np.array(processed) - - # Check if model needs patch dimension based on ONNX signature - if len(encoded.shape) == 4 and self._needs_patch_dimension(): - # Add patch dimension for Idefics3-based models: (batch, 1, C, H, W) - encoded = encoded[:, np.newaxis, ...] - - # Determine attention mask shape based on final tensor shape - if len(encoded.shape) == 5: - # 5D tensor: attention_mask shape is (batch, num_patches) - attention_mask = np.ones((num_images, encoded.shape[1]), dtype=np.int64) - metadata = {"patch_counts": [encoded.shape[1]] * num_images} - else: - # 4D tensor: attention_mask shape is (batch, 1) - attention_mask = np.ones((num_images, 1), dtype=np.int64) - metadata = {"patch_counts": [1] * num_images} - - return encoded, attention_mask, metadata # type: ignore[return-value] - - def _needs_patch_dimension(self) -> bool: - """ - Determine if this model needs the patch dimension by checking ONNX input shape. - - Idefics3-based models (like ColModernVBERT) need 5D tensors (batch_size, patch_count, C, H, W). - Earlier models (like ColPali v1.3) need 4D tensors (batch_size, C, H, W). - - Returns: - bool: True if pixel_values input has 5 dimensions, False if 4 dimensions - """ - if not hasattr(self, "model") or self.model is None: - return False - - # Get pixel_values input metadata - for input_meta in self.model.get_inputs(): - if input_meta.name == "pixel_values": - # input_meta.shape is a list like - # ['batch_size', 'sequence_length', 'num_channels', 'height', 'width'] - # or ['batch_size', 'num_channels', 'height', 'width'] - return len(input_meta.shape) == 5 - - # Default to False for backward compatibility - return False + embeddings = model_output[0].reshape(len(images), -1) + return OnnxOutputContext(model_output=embeddings) def _embed_images( self,
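
Usage sketch (illustrative, not part of the patch series): the snippet below shows how the model registered and tested above could be driven through the public LateInteractionMultimodalEmbedding API. The model name "Qdrant/colmodernvbert", the embed_image/embed_text/token_count calls, and the 128-dimensional embedding size come from the patches; the image path and the MaxSim-style scoring helper are assumptions added only for illustration.

    import numpy as np
    from fastembed import LateInteractionMultimodalEmbedding

    # Instantiate the newly registered late-interaction multimodal model.
    model = LateInteractionMultimodalEmbedding(model_name="Qdrant/colmodernvbert")
    assert model.embedding_size == 128

    # Multi-vector embeddings: one (num_tokens, 128) matrix per input.
    doc_vectors = list(model.embed_image(["page.jpeg"], batch_size=2))  # hypothetical image path
    query_vectors = list(model.embed_text(["what is shown in the figure?"]))

    def maxsim(query: np.ndarray, doc: np.ndarray) -> float:
        # Late-interaction scoring (assumed MaxSim, as commonly used with
        # ColBERT/ColPali-style models): for every query token take the best
        # matching document token, then sum over query tokens.
        return float((query @ doc.T).max(axis=1).sum())

    score = maxsim(query_vectors[0], doc_vectors[0])
    print(f"late-interaction score: {score:.4f}")

    # Token counting, as exercised by test_token_count in the series; with
    # include_extension=True the query augmentation tokens are counted as well.
    print(model.token_count(["short doc"], include_extension=False))

The scoring helper only assumes that both embed_image and embed_text yield per-token matrices with the model's 128-dimensional columns, which is what the canonical test vectors above compare against.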