From 899b92f9c1791422315a811c0f0d408b8efc0ea4 Mon Sep 17 00:00:00 2001
From: George Panchuk
Date: Fri, 20 Jun 2025 15:47:04 +0400
Subject: [PATCH 1/3] new: use cuda if available

---
 fastembed/common/onnx_model.py                   | 16 ++++++++++------
 fastembed/common/types.py                        |  9 ++++++++-
 fastembed/image/image_embedding.py               |  4 ++--
 fastembed/image/onnx_embedding.py                | 12 +++++++-----
 fastembed/image/onnx_image_model.py              |  6 +++---
 fastembed/late_interaction/colbert.py            | 11 ++++++-----
 .../late_interaction_text_embedding.py           |  4 ++--
 fastembed/late_interaction_multimodal/colpali.py | 13 +++++++------
 .../late_interaction_multimodal_embedding.py     |  4 ++--
 .../onnx_multimodal_model.py                     |  8 ++++----
 fastembed/parallel_processor.py                  |  3 ++-
 .../cross_encoder/custom_text_cross_encoder.py   |  3 ++-
 .../cross_encoder/onnx_text_cross_encoder.py     | 10 ++++++----
 .../rerank/cross_encoder/onnx_text_model.py      |  6 +++---
 .../rerank/cross_encoder/text_cross_encoder.py   |  3 ++-
 fastembed/sparse/bm42.py                         | 10 ++++++----
 fastembed/sparse/minicoil.py                     | 10 ++++++----
 fastembed/sparse/sparse_text_embedding.py        |  3 ++-
 fastembed/sparse/splade_pp.py                    | 10 ++++++----
 fastembed/text/custom_text_embedding.py          |  5 ++---
 fastembed/text/onnx_embedding.py                 | 11 ++++++-----
 fastembed/text/onnx_text_model.py                |  6 +++---
 fastembed/text/text_embedding.py                 |  4 ++--
 23 files changed, 99 insertions(+), 72 deletions(-)

diff --git a/fastembed/common/onnx_model.py b/fastembed/common/onnx_model.py
index 490ed875..ea09d5b8 100644
--- a/fastembed/common/onnx_model.py
+++ b/fastembed/common/onnx_model.py
@@ -9,7 +9,7 @@
 from numpy.typing import NDArray
 from tokenizers import Tokenizer
 
-from fastembed.common.types import OnnxProvider, NumpyArray
+from fastembed.common.types import OnnxProvider, NumpyArray, Device
 from fastembed.parallel_processor import Worker
 
 # Holds type of the embedding result
@@ -60,23 +60,28 @@ def _load_onnx_model(
         model_file: str,
         threads: int | None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_id: int | None = None,
         extra_session_options: dict[str, Any] | None = None,
     ) -> None:
         model_path = model_dir / model_file
         # List of Execution Providers: https://onnxruntime.ai/docs/execution-providers
+        available_providers = ort.get_available_providers()
+        cuda_available = "CUDAExecutionProvider" in available_providers
+        explicit_cuda = cuda is True or cuda == Device.CUDA
 
-        if cuda and providers is not None:
+        if explicit_cuda and providers is not None:
             warnings.warn(
-                f"`cuda` and `providers` are mutually exclusive parameters, cuda: {cuda}, providers: {providers}",
+                f"`cuda` and `providers` are mutually exclusive parameters, "
+                f"cuda: {cuda}, providers: {providers}. If you'd like to use providers, cuda should be one of "
+                f"[True, Device.CPU, Device.AUTO].",
                 category=UserWarning,
                 stacklevel=6,
             )
 
         if providers is not None:
             onnx_providers = list(providers)
-        elif cuda:
+        elif explicit_cuda or (cuda == Device.AUTO and cuda_available):
             if device_id is None:
                 onnx_providers = ["CUDAExecutionProvider"]
             else:
@@ -84,7 +89,6 @@ def _load_onnx_model(
         else:
             onnx_providers = ["CPUExecutionProvider"]
 
-        available_providers = ort.get_available_providers()
         requested_provider_names: list[str] = []
         for provider in onnx_providers:
             # check providers available
diff --git a/fastembed/common/types.py b/fastembed/common/types.py
index 69d047fb..80dee021 100644
--- a/fastembed/common/types.py
+++ b/fastembed/common/types.py
@@ -1,11 +1,18 @@
+from enum import Enum
 from pathlib import Path
-
 from typing import Any, TypeAlias
+
 import numpy as np
 from numpy.typing import NDArray
 from PIL import Image
 
 
+class Device(str, Enum):
+    CPU = "cpu"
+    CUDA = "cuda"
+    AUTO = "auto"
+
+
 PathInput: TypeAlias = str | Path
 ImageInput: TypeAlias = PathInput | Image.Image
 
diff --git a/fastembed/image/image_embedding.py b/fastembed/image/image_embedding.py
index 057dfc21..a3630345 100644
--- a/fastembed/image/image_embedding.py
+++ b/fastembed/image/image_embedding.py
@@ -1,7 +1,7 @@
 from typing import Any, Iterable, Sequence, Type
 from dataclasses import asdict
 
-from fastembed.common.types import NumpyArray
+from fastembed.common.types import NumpyArray, Device
 from fastembed.common import ImageInput, OnnxProvider
 from fastembed.image.image_embedding_base import ImageEmbeddingBase
 from fastembed.image.onnx_embedding import OnnxImageEmbedding
@@ -51,7 +51,7 @@ def __init__(
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         **kwargs: Any,
diff --git a/fastembed/image/onnx_embedding.py b/fastembed/image/onnx_embedding.py
index 92165594..3e173995 100644
--- a/fastembed/image/onnx_embedding.py
+++ b/fastembed/image/onnx_embedding.py
@@ -1,7 +1,7 @@
 from typing import Any, Iterable, Sequence, Type
 
-from fastembed.common.types import NumpyArray
+from fastembed.common.types import NumpyArray, Device
 from fastembed.common import ImageInput, OnnxProvider
 from fastembed.common.onnx_model import OnnxOutputContext
 from fastembed.common.utils import define_cache_dir, normalize
 
@@ -63,10 +63,11 @@ class OnnxImageEmbedding(ImageEmbeddingBase, OnnxImageModel[NumpyArray]):
     def __init__(
         self,
         model_name: str,
+
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         device_id: int | None = None,
@@ -82,10 +83,11 @@ def __init__(
             threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
             providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
                 Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
+            cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`
+                Defaults to Device.AUTO.
             device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
+                workers. Should be used with `cuda` equals to `True`, `Device.AUTO` or `Device.CUDA`, mutually exclusive
+                with `providers`. Defaults to None.
             lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
                 Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
             device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
diff --git a/fastembed/image/onnx_image_model.py b/fastembed/image/onnx_image_model.py
index 118bd382..86326da9 100644
--- a/fastembed/image/onnx_image_model.py
+++ b/fastembed/image/onnx_image_model.py
@@ -8,7 +8,7 @@
 from PIL import Image
 
 from fastembed.image.transform.operators import Compose
-from fastembed.common.types import NumpyArray
+from fastembed.common.types import NumpyArray, Device
 from fastembed.common import ImageInput, OnnxProvider
 from fastembed.common.onnx_model import EmbeddingWorker, OnnxModel, OnnxOutputContext, T
 from fastembed.common.preprocessor_utils import load_preprocessor
@@ -53,7 +53,7 @@ def _load_onnx_model(
         model_file: str,
         threads: int | None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_id: int | None = None,
         extra_session_options: dict[str, Any] | None = None,
     ) -> None:
@@ -97,7 +97,7 @@ def _embed_images(
         batch_size: int = 256,
         parallel: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         local_files_only: bool = False,
         specific_model_path: str | None = None,
diff --git a/fastembed/late_interaction/colbert.py b/fastembed/late_interaction/colbert.py
index 66a44e1e..b8b92542 100644
--- a/fastembed/late_interaction/colbert.py
+++ b/fastembed/late_interaction/colbert.py
@@ -5,7 +5,7 @@
 from tokenizers import Encoding, Tokenizer
 
 from fastembed.common.preprocessor_utils import load_tokenizer
-from fastembed.common.types import NumpyArray
+from fastembed.common.types import NumpyArray, Device
 from fastembed.common import OnnxProvider
 from fastembed.common.onnx_model import OnnxOutputContext
 from fastembed.common.utils import define_cache_dir, iter_batch
@@ -143,7 +143,7 @@ def __init__(
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         device_id: int | None = None,
@@ -159,10 +159,11 @@ def __init__(
             threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
             providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
                 Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
+            cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`
+                Defaults to Device.AUTO.
             device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
+                workers. Should be used with `cuda` equals to `True`, `Device.AUTO` or `Device.CUDA`, mutually exclusive
+                with `providers`. Defaults to None.
             lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
                 Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
             device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
diff --git a/fastembed/late_interaction/late_interaction_text_embedding.py b/fastembed/late_interaction/late_interaction_text_embedding.py
index ac99b03f..30c8b70d 100644
--- a/fastembed/late_interaction/late_interaction_text_embedding.py
+++ b/fastembed/late_interaction/late_interaction_text_embedding.py
@@ -2,7 +2,7 @@
 from dataclasses import asdict
 
 from fastembed.common.model_description import DenseModelDescription
-from fastembed.common.types import NumpyArray
+from fastembed.common.types import NumpyArray, Device
 from fastembed.common import OnnxProvider
 from fastembed.late_interaction.colbert import Colbert
 from fastembed.late_interaction.jina_colbert import JinaColbert
@@ -54,7 +54,7 @@ def __init__(
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         **kwargs: Any,
diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py
index cc3273c4..512ab746 100644
--- a/fastembed/late_interaction_multimodal/colpali.py
+++ b/fastembed/late_interaction_multimodal/colpali.py
@@ -5,8 +5,8 @@
 
 from fastembed.common import OnnxProvider, ImageInput
 from fastembed.common.onnx_model import OnnxOutputContext
-from fastembed.common.types import NumpyArray
-from fastembed.common.utils import define_cache_dir, iter_batch
+from fastembed.common.types import NumpyArray, Device
+from fastembed.common.utils import define_cache_dir
 from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import (
     LateInteractionMultimodalEmbeddingBase,
 )
@@ -49,7 +49,7 @@ def __init__(
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         device_id: int | None = None,
@@ -65,10 +65,11 @@ def __init__(
             threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
             providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
                 Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
+            cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`
+                Defaults to Device.AUTO.
             device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
+                workers. Should be used with `cuda` equals to `True`, `Device.AUTO` or `Device.CUDA`, mutually exclusive
+                with `providers`. Defaults to None.
             lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
                 Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
             device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
diff --git a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py
index f960b2bb..afe839d4 100644
--- a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py
+++ b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py
@@ -2,7 +2,7 @@
 from dataclasses import asdict
 
 from fastembed.common import OnnxProvider, ImageInput
-from fastembed.common.types import NumpyArray
+from fastembed.common.types import NumpyArray, Device
 from fastembed.late_interaction_multimodal.colpali import ColPali
 from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import (
     LateInteractionMultimodalEmbeddingBase,
@@ -57,7 +57,7 @@ def __init__(
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         **kwargs: Any,
diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py
index b6fc7a97..18b36338 100644
--- a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py
+++ b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py
@@ -11,7 +11,7 @@
 from fastembed.common import OnnxProvider, ImageInput
 from fastembed.common.onnx_model import EmbeddingWorker, OnnxModel, OnnxOutputContext, T
 from fastembed.common.preprocessor_utils import load_tokenizer, load_preprocessor
-from fastembed.common.types import NumpyArray
+from fastembed.common.types import NumpyArray, Device
 from fastembed.common.utils import iter_batch
 from fastembed.image.transform.operators import Compose
 from fastembed.parallel_processor import ParallelWorkerPool
@@ -62,7 +62,7 @@ def _load_onnx_model(
         model_file: str,
         threads: int | None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_id: int | None = None,
         extra_session_options: dict[str, Any] | None = None,
     ) -> None:
@@ -120,7 +120,7 @@ def _embed_documents(
         batch_size: int = 256,
         parallel: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         local_files_only: bool = False,
         specific_model_path: str | None = None,
@@ -191,7 +191,7 @@ def _embed_images(
         batch_size: int = 256,
         parallel: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         local_files_only: bool = False,
         specific_model_path: str | None = None,
diff --git a/fastembed/parallel_processor.py b/fastembed/parallel_processor.py
index 1632ac2c..bfaaf0c5 100644
--- a/fastembed/parallel_processor.py
+++ b/fastembed/parallel_processor.py
@@ -10,6 +10,7 @@
 from queue import Empty
 from typing import Any, Iterable, Type
 
+from fastembed.common.types import Device
 
 # Single item should be processed in less than:
 processing_timeout = 10 * 60  # seconds
@@ -95,7 +96,7 @@ def __init__(
         worker: Type[Worker],
         start_method: str | None = None,
         device_ids: list[int] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
     ):
         self.worker_class = worker
         self.num_workers = num_workers
diff --git a/fastembed/rerank/cross_encoder/custom_text_cross_encoder.py b/fastembed/rerank/cross_encoder/custom_text_cross_encoder.py
index a9fa3d79..fc1f6e96 100644
--- a/fastembed/rerank/cross_encoder/custom_text_cross_encoder.py
+++ b/fastembed/rerank/cross_encoder/custom_text_cross_encoder.py
@@ -2,6 +2,7 @@
 
 from fastembed.common import OnnxProvider
 from fastembed.common.model_description import BaseModelDescription
+from fastembed.common.types import Device
 from fastembed.rerank.cross_encoder.onnx_text_cross_encoder import OnnxTextCrossEncoder
 
 
@@ -14,7 +15,7 @@ def __init__(
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         device_id: int | None = None,
diff --git a/fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py b/fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py
index e077c216..21231551 100644
--- a/fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py
+++ b/fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py
@@ -4,6 +4,7 @@
 
 from fastembed.common import OnnxProvider
 from fastembed.common.onnx_model import OnnxOutputContext
+from fastembed.common.types import Device
 from fastembed.common.utils import define_cache_dir
 from fastembed.rerank.cross_encoder.onnx_text_model import (
     OnnxCrossEncoderModel,
@@ -80,7 +81,7 @@ def __init__(
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         device_id: int | None = None,
@@ -96,10 +97,11 @@ def __init__(
             threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
             providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
                 Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
+            cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`
+                Defaults to Device.AUTO.
             device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
+                workers. Should be used with `cuda` equals to `True`, `Device.AUTO` or `Device.CUDA`, mutually exclusive
+                with `providers`. Defaults to None.
             lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
                 Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
             device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
diff --git a/fastembed/rerank/cross_encoder/onnx_text_model.py b/fastembed/rerank/cross_encoder/onnx_text_model.py
index aa058322..55f3ea85 100644
--- a/fastembed/rerank/cross_encoder/onnx_text_model.py
+++ b/fastembed/rerank/cross_encoder/onnx_text_model.py
@@ -12,7 +12,7 @@
     OnnxOutputContext,
     OnnxProvider,
 )
-from fastembed.common.types import NumpyArray
+from fastembed.common.types import NumpyArray, Device
 from fastembed.common.preprocessor_utils import load_tokenizer
 from fastembed.common.utils import iter_batch
 from fastembed.parallel_processor import ParallelWorkerPool
@@ -31,7 +31,7 @@ def _load_onnx_model(
         model_file: str,
         threads: int | None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_id: int | None = None,
         extra_session_options: dict[str, Any] | None = None,
     ) -> None:
@@ -94,7 +94,7 @@ def _rerank_pairs(
         batch_size: int,
         parallel: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         local_files_only: bool = False,
         specific_model_path: str | None = None,
diff --git a/fastembed/rerank/cross_encoder/text_cross_encoder.py b/fastembed/rerank/cross_encoder/text_cross_encoder.py
index 2d3920a8..6f98cb24 100644
--- a/fastembed/rerank/cross_encoder/text_cross_encoder.py
+++ b/fastembed/rerank/cross_encoder/text_cross_encoder.py
@@ -2,6 +2,7 @@
 from dataclasses import asdict
 
 from fastembed.common import OnnxProvider
+from fastembed.common.types import Device
 from fastembed.rerank.cross_encoder.onnx_text_cross_encoder import OnnxTextCrossEncoder
 from fastembed.rerank.cross_encoder.custom_text_cross_encoder import CustomTextCrossEncoder
 
@@ -56,7 +57,7 @@ def __init__(
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         **kwargs: Any,
diff --git a/fastembed/sparse/bm42.py b/fastembed/sparse/bm42.py
index ce564967..2b090f74 100644
--- a/fastembed/sparse/bm42.py
+++ b/fastembed/sparse/bm42.py
@@ -9,6 +9,7 @@
 
 from fastembed.common import OnnxProvider
 from fastembed.common.onnx_model import OnnxOutputContext
+from fastembed.common.types import Device
 from fastembed.common.utils import define_cache_dir
 from fastembed.sparse.sparse_embedding_base import (
     SparseEmbedding,
@@ -69,7 +70,7 @@ def __init__(
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
         alpha: float = 0.5,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         device_id: int | None = None,
@@ -87,10 +88,11 @@ def __init__(
            alpha (float, optional): Parameter, that defines the importance of the token weight in the document
                 versus the importance of the token frequency in the corpus. Defaults to 0.5, based on empirical
                 testing. It is recommended to only change this parameter based on training data for a specific dataset.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
+            cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`
+                Defaults to Device.AUTO.
             device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
+                workers. Should be used with `cuda` equals to `True`, `Device.AUTO` or `Device.CUDA`, mutually exclusive
+                with `providers`. Defaults to None.
             lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
                 Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
             device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
diff --git a/fastembed/sparse/minicoil.py b/fastembed/sparse/minicoil.py
index 611c38d4..6f29c60f 100644
--- a/fastembed/sparse/minicoil.py
+++ b/fastembed/sparse/minicoil.py
@@ -10,6 +10,7 @@
 from fastembed.common.model_description import SparseModelDescription, ModelSource
 from fastembed.common.onnx_model import OnnxOutputContext
 from fastembed.common import OnnxProvider
+from fastembed.common.types import Device
 from fastembed.common.utils import define_cache_dir
 from fastembed.sparse.sparse_embedding_base import (
     SparseEmbedding,
@@ -78,7 +79,7 @@ def __init__(
         k: float = 1.2,
         b: float = 0.75,
         avg_len: float = 150.0,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         device_id: int | None = None,
@@ -98,10 +99,11 @@ def __init__(
             b (float, optional): The b parameter in the BM25 formula. Defines the importance of the document length.
                 Defaults to 0.75.
             avg_len (float, optional): The average length of the documents in the corpus. Defaults to 150.0.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
+            cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`
+                Defaults to Device.AUTO.
             device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
+                workers. Should be used with `cuda` equals to `True`, `Device.AUTO` or `Device.CUDA`, mutually exclusive
+                with `providers`. Defaults to None.
             lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
                 Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
             device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
diff --git a/fastembed/sparse/sparse_text_embedding.py b/fastembed/sparse/sparse_text_embedding.py
index cbd59057..5b5c8308 100644
--- a/fastembed/sparse/sparse_text_embedding.py
+++ b/fastembed/sparse/sparse_text_embedding.py
@@ -2,6 +2,7 @@
 from dataclasses import asdict
 
 from fastembed.common import OnnxProvider
+from fastembed.common.types import Device
 from fastembed.sparse.bm25 import Bm25
 from fastembed.sparse.bm42 import Bm42
 from fastembed.sparse.minicoil import MiniCOIL
@@ -56,7 +57,7 @@ def __init__(
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         **kwargs: Any,
diff --git a/fastembed/sparse/splade_pp.py b/fastembed/sparse/splade_pp.py
index 9c739b2c..562ebcd4 100644
--- a/fastembed/sparse/splade_pp.py
+++ b/fastembed/sparse/splade_pp.py
@@ -3,6 +3,7 @@
 import numpy as np
 from fastembed.common import OnnxProvider
 from fastembed.common.onnx_model import OnnxOutputContext
+from fastembed.common.types import Device
 from fastembed.common.utils import define_cache_dir
 from fastembed.sparse.sparse_embedding_base import (
     SparseEmbedding,
@@ -73,7 +74,7 @@ def __init__(
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         device_id: int | None = None,
@@ -89,10 +90,11 @@ def __init__(
             threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
             providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
                 Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
+            cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`
+                Defaults to Device.AUTO.
             device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
+                workers. Should be used with `cuda` equals to `True`, `Device.AUTO` or `Device.CUDA`, mutually exclusive
+                with `providers`. Defaults to None.
             lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
                 Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
             device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
diff --git a/fastembed/text/custom_text_embedding.py b/fastembed/text/custom_text_embedding.py
index 55692a22..465ffd25 100644
--- a/fastembed/text/custom_text_embedding.py
+++ b/fastembed/text/custom_text_embedding.py
@@ -1,5 +1,4 @@
 from typing import Sequence, Any, Iterable
-
 from dataclasses import dataclass
 
 import numpy as np
@@ -11,7 +10,7 @@
     DenseModelDescription,
 )
 from fastembed.common.onnx_model import OnnxOutputContext
-from fastembed.common.types import NumpyArray
+from fastembed.common.types import NumpyArray, Device
 from fastembed.common.utils import normalize, mean_pooling
 from fastembed.text.onnx_embedding import OnnxTextEmbedding
 
@@ -32,7 +31,7 @@ def __init__(
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         device_id: int | None = None,
diff --git a/fastembed/text/onnx_embedding.py b/fastembed/text/onnx_embedding.py
index 773ad3ed..1e9978f7 100644
--- a/fastembed/text/onnx_embedding.py
+++ b/fastembed/text/onnx_embedding.py
@@ -1,6 +1,6 @@
 from typing import Any, Iterable, Sequence, Type
 
-from fastembed.common.types import NumpyArray, OnnxProvider
+from fastembed.common.types import NumpyArray, OnnxProvider, Device
 from fastembed.common.onnx_model import OnnxOutputContext
 from fastembed.common.utils import define_cache_dir, normalize
 from fastembed.text.onnx_text_model import OnnxTextModel, TextEmbeddingWorker
@@ -202,7 +202,7 @@ def __init__(
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         device_id: int | None = None,
@@ -218,10 +218,11 @@ def __init__(
             threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
             providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
                 Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
+            cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`
+                Defaults to Device.AUTO.
             device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
+                workers. Should be used with `cuda` equals to `True`, `Device.AUTO` or `Device.CUDA`, mutually exclusive
+                with `providers`. Defaults to None.
             lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
                 Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
             device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
diff --git a/fastembed/text/onnx_text_model.py b/fastembed/text/onnx_text_model.py
index b4ecfac3..c8001a91 100644
--- a/fastembed/text/onnx_text_model.py
+++ b/fastembed/text/onnx_text_model.py
@@ -7,7 +7,7 @@
 from numpy.typing import NDArray
 from tokenizers import Encoding, Tokenizer
 
-from fastembed.common.types import NumpyArray, OnnxProvider
+from fastembed.common.types import NumpyArray, OnnxProvider, Device
 from fastembed.common.onnx_model import EmbeddingWorker, OnnxModel, OnnxOutputContext, T
 from fastembed.common.preprocessor_utils import load_tokenizer
 from fastembed.common.utils import iter_batch
@@ -52,7 +52,7 @@ def _load_onnx_model(
         model_file: str,
         threads: int | None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_id: int | None = None,
         extra_session_options: dict[str, Any] | None = None,
     ) -> None:
@@ -108,7 +108,7 @@ def _embed_documents(
         batch_size: int = 256,
         parallel: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         local_files_only: bool = False,
         specific_model_path: str | None = None,
diff --git a/fastembed/text/text_embedding.py b/fastembed/text/text_embedding.py
index 54ece67d..a4ae48cc 100644
--- a/fastembed/text/text_embedding.py
+++ b/fastembed/text/text_embedding.py
@@ -2,7 +2,7 @@
 from typing import Any, Iterable, Sequence, Type
 from dataclasses import asdict
 
-from fastembed.common.types import NumpyArray, OnnxProvider
+from fastembed.common.types import NumpyArray, OnnxProvider, Device
 from fastembed.text.clip_embedding import CLIPOnnxEmbedding
 from fastembed.text.custom_text_embedding import CustomTextEmbedding
 from fastembed.text.pooled_normalized_embedding import PooledNormalizedEmbedding
@@ -82,7 +82,7 @@ def __init__(
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         **kwargs: Any,
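
Reviewer note on the first commit: every entry point now funnels its `cuda` argument through the same branch order in `_load_onnx_model`. The self-contained sketch below restates that logic for reference; `resolve_providers` is an illustrative name, not part of the fastembed API, and the Device enum is copied from the fastembed/common/types.py hunk above.

    from enum import Enum
    from typing import Any, Sequence

    import onnxruntime as ort


    class Device(str, Enum):  # mirrors the enum added in fastembed/common/types.py
        CPU = "cpu"
        CUDA = "cuda"
        AUTO = "auto"


    def resolve_providers(
        cuda: bool | Device = Device.AUTO,
        providers: Sequence[Any] | None = None,
        device_id: int | None = None,
    ) -> list[Any]:
        # Explicitly requested providers always win; an explicit cuda request
        # alongside them only triggers the UserWarning in _load_onnx_model.
        if providers is not None:
            return list(providers)
        cuda_available = "CUDAExecutionProvider" in ort.get_available_providers()
        explicit_cuda = cuda is True or cuda == Device.CUDA
        if explicit_cuda or (cuda == Device.AUTO and cuda_available):
            if device_id is None:
                return ["CUDAExecutionProvider"]
            # Pin the session to one GPU, as done for data-parallel workers.
            return [("CUDAExecutionProvider", {"device_id": device_id})]
        return ["CPUExecutionProvider"]

One behavioral consequence worth noting: with Device.AUTO as the default, explicit_cuda stays False, so machines without the CUDA provider silently fall back to CPUExecutionProvider instead of failing.
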
From dfdf986d9285e2f6a89ab0dc97635989072d9897 Mon Sep 17 00:00:00 2001
From: George Panchuk
Date: Mon, 8 Dec 2025 17:56:43 +0700
Subject: [PATCH 2/3] fix: fix warning msg

---
 fastembed/common/onnx_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fastembed/common/onnx_model.py b/fastembed/common/onnx_model.py
index ea09d5b8..d465e870 100644
--- a/fastembed/common/onnx_model.py
+++ b/fastembed/common/onnx_model.py
@@ -74,7 +74,7 @@ def _load_onnx_model(
             warnings.warn(
                 f"`cuda` and `providers` are mutually exclusive parameters, "
                 f"cuda: {cuda}, providers: {providers}. If you'd like to use providers, cuda should be one of "
-                f"[True, Device.CPU, Device.AUTO].",
+                f"[False, Device.CPU, Device.AUTO].",
                 category=UserWarning,
                 stacklevel=6,
             )

From e69a58e48de8b4b708fa27ed04ce3fd3c2cfbef4 Mon Sep 17 00:00:00 2001
From: George Panchuk
Date: Mon, 8 Dec 2025 18:00:10 +0700
Subject: [PATCH 3/3] fix: add missing import

---
 fastembed/late_interaction_multimodal/colpali.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py
index 512ab746..85fbcd06 100644
--- a/fastembed/late_interaction_multimodal/colpali.py
+++ b/fastembed/late_interaction_multimodal/colpali.py
@@ -6,7 +6,7 @@
 from fastembed.common import OnnxProvider, ImageInput
 from fastembed.common.onnx_model import OnnxOutputContext
 from fastembed.common.types import NumpyArray, Device
-from fastembed.common.utils import define_cache_dir
+from fastembed.common.utils import define_cache_dir, iter_batch
 from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import (
     LateInteractionMultimodalEmbeddingBase,
 )
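
Reviewer note on the series as a whole: after these three commits, constructing any model without device arguments picks CUDA when onnxruntime reports CUDAExecutionProvider and falls back to CPU otherwise. A minimal way to try the branch, assuming onnxruntime (or onnxruntime-gpu) is installed; the model name is just an example, any supported model works:

    from fastembed import TextEmbedding
    from fastembed.common.types import Device

    # Device.AUTO is the new default: CUDA when available, CPU otherwise.
    model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")

    # Force CPU even on a GPU machine; cuda=True or Device.CUDA requests CUDA
    # unconditionally, which is what patch 2's corrected warning refers to.
    cpu_model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5", cuda=Device.CPU)

    embeddings = list(model.embed(["hello world"]))
    print(embeddings[0].shape)  # (384,) for bge-small-en-v1.5
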