qdrant · joein · Dec 10, 2025 · Jun 20, 2025 · Dec 8, 2025 · Dec 8, 2025
diff --git a/fastembed/common/onnx_model.py b/fastembed/common/onnx_model.py
@@ -9,7 +9,7 @@
 from numpy.typing import NDArray
 from tokenizers import Tokenizer
 
-from fastembed.common.types import OnnxProvider, NumpyArray
+from fastembed.common.types import OnnxProvider, NumpyArray, Device
 from fastembed.parallel_processor import Worker
 
 # Holds type of the embedding result
@@ -60,31 +60,35 @@ def _load_onnx_model(
         model_file: str,
         threads: int | None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_id: int | None = None,
         extra_session_options: dict[str, Any] | None = None,
     ) -> None:
         model_path = model_dir / model_file
         # List of Execution Providers: https://onnxruntime.ai/docs/execution-providers
+        available_providers = ort.get_available_providers()
+        cuda_available = "CUDAExecutionProvider" in available_providers
+        explicit_cuda = cuda is True or cuda == Device.CUDA
 
-        if cuda and providers is not None:
+        if explicit_cuda and providers is not None:
             warnings.warn(
-                f"`cuda` and `providers` are mutually exclusive parameters, cuda: {cuda}, providers: {providers}",
+                f"`cuda` and `providers` are mutually exclusive parameters, "
+                f"cuda: {cuda}, providers: {providers}. If you'd like to use providers, cuda should be one of "
+                f"[False, Device.CPU, Device.AUTO].",
                 category=UserWarning,
                 stacklevel=6,
             )
 
         if providers is not None:
             onnx_providers = list(providers)
-        elif cuda:
+        elif explicit_cuda or (cuda == Device.AUTO and cuda_available):
             if device_id is None:
                 onnx_providers = ["CUDAExecutionProvider"]
             else:
                 onnx_providers = [("CUDAExecutionProvider", {"device_id": device_id})]
         else:
             onnx_providers = ["CPUExecutionProvider"]
 
-        available_providers = ort.get_available_providers()
         requested_provider_names: list[str] = []
         for provider in onnx_providers:
             # check providers available

diff --git a/fastembed/common/types.py b/fastembed/common/types.py
@@ -1,11 +1,27 @@
+from enum import Enum
 from pathlib import Path
-
 from typing import Any, TypeAlias
+
 import numpy as np
 from numpy.typing import NDArray
 from PIL import Image
 
 
+class Device(str, Enum):
+    """Execution device selector accepted by the `cuda` parameter.
+
+    Handling in `_load_onnx_model` (fastembed/common/onnx_model.py) when `providers` is None:
+        CPU: use `CPUExecutionProvider`.
+        CUDA: request `CUDAExecutionProvider`.
+        AUTO: request `CUDAExecutionProvider` if onnxruntime lists it as available,
+            otherwise use `CPUExecutionProvider`.
+    """
+
+    CPU = "cpu"
+    CUDA = "cuda"
+    AUTO = "auto"
+
+
 PathInput: TypeAlias = str | Path
 ImageInput: TypeAlias = PathInput | Image.Image
 

diff --git a/fastembed/image/image_embedding.py b/fastembed/image/image_embedding.py
@@ -1,7 +1,7 @@
 from typing import Any, Iterable, Sequence, Type
 from dataclasses import asdict
 
-from fastembed.common.types import NumpyArray
+from fastembed.common.types import NumpyArray, Device
 from fastembed.common import ImageInput, OnnxProvider
 from fastembed.image.image_embedding_base import ImageEmbeddingBase
 from fastembed.image.onnx_embedding import OnnxImageEmbedding
@@ -51,7 +51,7 @@ def __init__(
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         **kwargs: Any,

diff --git a/fastembed/image/onnx_embedding.py b/fastembed/image/onnx_embedding.py
@@ -1,7 +1,7 @@
 from typing import Any, Iterable, Sequence, Type
 
 
-from fastembed.common.types import NumpyArray
+from fastembed.common.types import NumpyArray, Device
 from fastembed.common import ImageInput, OnnxProvider
 from fastembed.common.onnx_model import OnnxOutputContext
 from fastembed.common.utils import define_cache_dir, normalize
@@ -63,10 +63,10 @@ class OnnxImageEmbedding(ImageEmbeddingBase, OnnxImageModel[NumpyArray]):
     def __init__(
         self,
         model_name: str,
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         device_id: int | None = None,
@@ -82,10 +82,11 @@ def __init__(
             threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
             providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
                 Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
+            cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`.
+                Defaults to Device.AUTO.
             device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
+                workers. Should be used with `cuda` set to `True`, `Device.AUTO` or `Device.CUDA`; mutually exclusive
+                with `providers`. Defaults to None.
             lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
                 Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
             device_id (Optional[int], optional): The device id to use for loading the model in the worker process.

diff --git a/fastembed/image/onnx_image_model.py b/fastembed/image/onnx_image_model.py
@@ -8,7 +8,7 @@
 from PIL import Image
 
 from fastembed.image.transform.operators import Compose
-from fastembed.common.types import NumpyArray
+from fastembed.common.types import NumpyArray, Device
 from fastembed.common import ImageInput, OnnxProvider
 from fastembed.common.onnx_model import EmbeddingWorker, OnnxModel, OnnxOutputContext, T
 from fastembed.common.preprocessor_utils import load_preprocessor
@@ -53,7 +53,7 @@ def _load_onnx_model(
         model_file: str,
         threads: int | None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_id: int | None = None,
         extra_session_options: dict[str, Any] | None = None,
     ) -> None:
@@ -97,7 +97,7 @@ def _embed_images(
         batch_size: int = 256,
         parallel: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         local_files_only: bool = False,
         specific_model_path: str | None = None,

diff --git a/fastembed/late_interaction/colbert.py b/fastembed/late_interaction/colbert.py
@@ -5,7 +5,7 @@
 from tokenizers import Encoding, Tokenizer
 
 from fastembed.common.preprocessor_utils import load_tokenizer
-from fastembed.common.types import NumpyArray
+from fastembed.common.types import NumpyArray, Device
 from fastembed.common import OnnxProvider
 from fastembed.common.onnx_model import OnnxOutputContext
 from fastembed.common.utils import define_cache_dir, iter_batch
@@ -143,7 +143,7 @@ def __init__(
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         device_id: int | None = None,
@@ -159,10 +159,11 @@ def __init__(
             threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
             providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
                 Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
+            cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`.
+                Defaults to Device.AUTO.
             device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
+                workers. Should be used with `cuda` set to `True`, `Device.AUTO` or `Device.CUDA`; mutually exclusive
+                with `providers`. Defaults to None.
             lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
                 Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
             device_id (Optional[int], optional): The device id to use for loading the model in the worker process.

diff --git a/fastembed/late_interaction/late_interaction_text_embedding.py b/fastembed/late_interaction/late_interaction_text_embedding.py
@@ -2,7 +2,7 @@
 from dataclasses import asdict
 
 from fastembed.common.model_description import DenseModelDescription
-from fastembed.common.types import NumpyArray
+from fastembed.common.types import NumpyArray, Device
 from fastembed.common import OnnxProvider
 from fastembed.late_interaction.colbert import Colbert
 from fastembed.late_interaction.jina_colbert import JinaColbert
@@ -54,7 +54,7 @@ def __init__(
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         **kwargs: Any,

diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py
@@ -5,7 +5,7 @@
 
 from fastembed.common import OnnxProvider, ImageInput
 from fastembed.common.onnx_model import OnnxOutputContext
-from fastembed.common.types import NumpyArray
+from fastembed.common.types import NumpyArray, Device
 from fastembed.common.utils import define_cache_dir, iter_batch
 from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import (
     LateInteractionMultimodalEmbeddingBase,
@@ -49,7 +49,7 @@ def __init__(
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         device_id: int | None = None,
@@ -65,10 +65,11 @@ def __init__(
             threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
             providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
                 Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
+            cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`.
+                Defaults to Device.AUTO.
             device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
+                workers. Should be used with `cuda` set to `True`, `Device.AUTO` or `Device.CUDA`; mutually exclusive
+                with `providers`. Defaults to None.
             lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
                 Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
             device_id (Optional[int], optional): The device id to use for loading the model in the worker process.

diff --git a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py
@@ -2,7 +2,7 @@
 from dataclasses import asdict
 
 from fastembed.common import OnnxProvider, ImageInput
-from fastembed.common.types import NumpyArray
+from fastembed.common.types import NumpyArray, Device
 from fastembed.late_interaction_multimodal.colpali import ColPali
 
 from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import (
@@ -57,7 +57,7 @@ def __init__(
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         **kwargs: Any,

diff --git a/fastembed/late_interaction_multimodal/onnx_multimodal_model.py b/fastembed/late_interaction_multimodal/onnx_multimodal_model.py
@@ -11,7 +11,7 @@
 from fastembed.common import OnnxProvider, ImageInput
 from fastembed.common.onnx_model import EmbeddingWorker, OnnxModel, OnnxOutputContext, T
 from fastembed.common.preprocessor_utils import load_tokenizer, load_preprocessor
-from fastembed.common.types import NumpyArray
+from fastembed.common.types import NumpyArray, Device
 from fastembed.common.utils import iter_batch
 from fastembed.image.transform.operators import Compose
 from fastembed.parallel_processor import ParallelWorkerPool
@@ -62,7 +62,7 @@ def _load_onnx_model(
         model_file: str,
         threads: int | None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_id: int | None = None,
         extra_session_options: dict[str, Any] | None = None,
     ) -> None:
@@ -120,7 +120,7 @@ def _embed_documents(
         batch_size: int = 256,
         parallel: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         local_files_only: bool = False,
         specific_model_path: str | None = None,
@@ -191,7 +191,7 @@ def _embed_images(
         batch_size: int = 256,
         parallel: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         local_files_only: bool = False,
         specific_model_path: str | None = None,

diff --git a/fastembed/parallel_processor.py b/fastembed/parallel_processor.py
@@ -10,6 +10,7 @@
 from queue import Empty
 from typing import Any, Iterable, Type
 
+from fastembed.common.types import Device
 
 # Single item should be processed in less than:
 processing_timeout = 10 * 60  # seconds
@@ -95,7 +96,7 @@ def __init__(
         worker: Type[Worker],
         start_method: str | None = None,
         device_ids: list[int] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
     ):
         self.worker_class = worker
         self.num_workers = num_workers

diff --git a/fastembed/rerank/cross_encoder/custom_text_cross_encoder.py b/fastembed/rerank/cross_encoder/custom_text_cross_encoder.py
@@ -2,6 +2,7 @@
 
 from fastembed.common import OnnxProvider
 from fastembed.common.model_description import BaseModelDescription
+from fastembed.common.types import Device
 from fastembed.rerank.cross_encoder.onnx_text_cross_encoder import OnnxTextCrossEncoder
 
 
@@ -14,7 +15,7 @@ def __init__(
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         device_id: int | None = None,

diff --git a/fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py b/fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py
@@ -4,6 +4,7 @@
 
 from fastembed.common import OnnxProvider
 from fastembed.common.onnx_model import OnnxOutputContext
+from fastembed.common.types import Device
 from fastembed.common.utils import define_cache_dir
 from fastembed.rerank.cross_encoder.onnx_text_model import (
     OnnxCrossEncoderModel,
@@ -80,7 +81,7 @@ def __init__(
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         device_id: int | None = None,
@@ -96,10 +97,11 @@ def __init__(
             threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
             providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
                 Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
+            cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`.
+                Defaults to Device.AUTO.
             device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
+                workers. Should be used with `cuda` set to `True`, `Device.AUTO` or `Device.CUDA`; mutually exclusive
+                with `providers`. Defaults to None.
             lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
                 Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
             device_id (Optional[int], optional): The device id to use for loading the model in the worker process.

diff --git a/fastembed/rerank/cross_encoder/onnx_text_model.py b/fastembed/rerank/cross_encoder/onnx_text_model.py
@@ -12,7 +12,7 @@
     OnnxOutputContext,
     OnnxProvider,
 )
-from fastembed.common.types import NumpyArray
+from fastembed.common.types import NumpyArray, Device
 from fastembed.common.preprocessor_utils import load_tokenizer
 from fastembed.common.utils import iter_batch
 from fastembed.parallel_processor import ParallelWorkerPool
@@ -31,7 +31,7 @@ def _load_onnx_model(
         model_file: str,
         threads: int | None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_id: int | None = None,
         extra_session_options: dict[str, Any] | None = None,
     ) -> None:
@@ -94,7 +94,7 @@ def _rerank_pairs(
         batch_size: int,
         parallel: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         local_files_only: bool = False,
         specific_model_path: str | None = None,

diff --git a/fastembed/rerank/cross_encoder/text_cross_encoder.py b/fastembed/rerank/cross_encoder/text_cross_encoder.py
@@ -2,6 +2,7 @@
 from dataclasses import asdict
 
 from fastembed.common import OnnxProvider
+from fastembed.common.types import Device
 from fastembed.rerank.cross_encoder.onnx_text_cross_encoder import OnnxTextCrossEncoder
 from fastembed.rerank.cross_encoder.custom_text_cross_encoder import CustomTextCrossEncoder
 
@@ -56,7 +57,7 @@ def __init__(
         cache_dir: str | None = None,
         threads: int | None = None,
         providers: Sequence[OnnxProvider] | None = None,
-        cuda: bool = False,
+        cuda: bool | Device = Device.AUTO,
         device_ids: list[int] | None = None,
         lazy_load: bool = False,
         **kwargs: Any,