Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions fastembed/common/onnx_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from numpy.typing import NDArray
from tokenizers import Tokenizer

from fastembed.common.types import OnnxProvider, NumpyArray
from fastembed.common.types import OnnxProvider, NumpyArray, Device
from fastembed.parallel_processor import Worker

# Holds type of the embedding result
Expand Down Expand Up @@ -60,31 +60,35 @@ def _load_onnx_model(
model_file: str,
threads: int | None,
providers: Sequence[OnnxProvider] | None = None,
cuda: bool = False,
cuda: bool | Device = Device.AUTO,
device_id: int | None = None,
extra_session_options: dict[str, Any] | None = None,
) -> None:
model_path = model_dir / model_file
# List of Execution Providers: https://onnxruntime.ai/docs/execution-providers
available_providers = ort.get_available_providers()
cuda_available = "CUDAExecutionProvider" in available_providers
explicit_cuda = cuda is True or cuda == Device.CUDA

if cuda and providers is not None:
if explicit_cuda and providers is not None:
warnings.warn(
f"`cuda` and `providers` are mutually exclusive parameters, cuda: {cuda}, providers: {providers}",
f"`cuda` and `providers` are mutually exclusive parameters, "
f"cuda: {cuda}, providers: {providers}. If you'd like to use providers, cuda should be one of "
f"[False, Device.CPU, Device.AUTO].",
category=UserWarning,
stacklevel=6,
)

if providers is not None:
onnx_providers = list(providers)
elif cuda:
elif explicit_cuda or (cuda == Device.AUTO and cuda_available):
if device_id is None:
onnx_providers = ["CUDAExecutionProvider"]
else:
onnx_providers = [("CUDAExecutionProvider", {"device_id": device_id})]
else:
onnx_providers = ["CPUExecutionProvider"]

available_providers = ort.get_available_providers()
requested_provider_names: list[str] = []
for provider in onnx_providers:
# check providers available
Expand Down
9 changes: 8 additions & 1 deletion fastembed/common/types.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
from enum import Enum
from pathlib import Path

from typing import Any, TypeAlias

import numpy as np
from numpy.typing import NDArray
from PIL import Image


class Device(str, Enum):
    """Device selector accepted by the `cuda` parameter alongside plain booleans.

    Because the enum mixes in `str`, every member compares equal to its raw
    string value (for example ``Device.CPU == "cpu"``).
    """

    # Run on CPU: when no explicit `providers` are given, the ONNX loader
    # falls through to "CPUExecutionProvider".
    CPU = "cpu"
    # Explicitly request CUDA; treated the same as `cuda=True` by the loader.
    CUDA = "cuda"
    # Use CUDA only if onnxruntime lists "CUDAExecutionProvider" as available,
    # otherwise fall back to CPU.
    AUTO = "auto"


PathInput: TypeAlias = str | Path
ImageInput: TypeAlias = PathInput | Image.Image

Expand Down
4 changes: 2 additions & 2 deletions fastembed/image/image_embedding.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Any, Iterable, Sequence, Type
from dataclasses import asdict

from fastembed.common.types import NumpyArray
from fastembed.common.types import NumpyArray, Device
from fastembed.common import ImageInput, OnnxProvider
from fastembed.image.image_embedding_base import ImageEmbeddingBase
from fastembed.image.onnx_embedding import OnnxImageEmbedding
Expand Down Expand Up @@ -51,7 +51,7 @@ def __init__(
cache_dir: str | None = None,
threads: int | None = None,
providers: Sequence[OnnxProvider] | None = None,
cuda: bool = False,
cuda: bool | Device = Device.AUTO,
device_ids: list[int] | None = None,
lazy_load: bool = False,
**kwargs: Any,
Expand Down
12 changes: 7 additions & 5 deletions fastembed/image/onnx_embedding.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Any, Iterable, Sequence, Type


from fastembed.common.types import NumpyArray
from fastembed.common.types import NumpyArray, Device
from fastembed.common import ImageInput, OnnxProvider
from fastembed.common.onnx_model import OnnxOutputContext
from fastembed.common.utils import define_cache_dir, normalize
Expand Down Expand Up @@ -63,10 +63,11 @@ class OnnxImageEmbedding(ImageEmbeddingBase, OnnxImageModel[NumpyArray]):
def __init__(
self,
model_name: str,

cache_dir: str | None = None,
threads: int | None = None,
providers: Sequence[OnnxProvider] | None = None,
cuda: bool = False,
cuda: bool | Device = Device.AUTO,
device_ids: list[int] | None = None,
lazy_load: bool = False,
device_id: int | None = None,
Expand All @@ -82,10 +83,11 @@ def __init__(
threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
Defaults to False.
cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`.
Defaults to Device.AUTO.
device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
workers. Should be used with `cuda` set to `True`, `Device.AUTO` or `Device.CUDA`; mutually exclusive
with `providers`. Defaults to None.
lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
Expand Down
6 changes: 3 additions & 3 deletions fastembed/image/onnx_image_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from PIL import Image

from fastembed.image.transform.operators import Compose
from fastembed.common.types import NumpyArray
from fastembed.common.types import NumpyArray, Device
from fastembed.common import ImageInput, OnnxProvider
from fastembed.common.onnx_model import EmbeddingWorker, OnnxModel, OnnxOutputContext, T
from fastembed.common.preprocessor_utils import load_preprocessor
Expand Down Expand Up @@ -53,7 +53,7 @@ def _load_onnx_model(
model_file: str,
threads: int | None,
providers: Sequence[OnnxProvider] | None = None,
cuda: bool = False,
cuda: bool | Device = Device.AUTO,
device_id: int | None = None,
extra_session_options: dict[str, Any] | None = None,
) -> None:
Expand Down Expand Up @@ -97,7 +97,7 @@ def _embed_images(
batch_size: int = 256,
parallel: int | None = None,
providers: Sequence[OnnxProvider] | None = None,
cuda: bool = False,
cuda: bool | Device = Device.AUTO,
device_ids: list[int] | None = None,
local_files_only: bool = False,
specific_model_path: str | None = None,
Expand Down
11 changes: 6 additions & 5 deletions fastembed/late_interaction/colbert.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from tokenizers import Encoding, Tokenizer

from fastembed.common.preprocessor_utils import load_tokenizer
from fastembed.common.types import NumpyArray
from fastembed.common.types import NumpyArray, Device
from fastembed.common import OnnxProvider
from fastembed.common.onnx_model import OnnxOutputContext
from fastembed.common.utils import define_cache_dir, iter_batch
Expand Down Expand Up @@ -143,7 +143,7 @@ def __init__(
cache_dir: str | None = None,
threads: int | None = None,
providers: Sequence[OnnxProvider] | None = None,
cuda: bool = False,
cuda: bool | Device = Device.AUTO,
device_ids: list[int] | None = None,
lazy_load: bool = False,
device_id: int | None = None,
Expand All @@ -159,10 +159,11 @@ def __init__(
threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
Defaults to False.
cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`.
Defaults to Device.AUTO.
device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
workers. Should be used with `cuda` set to `True`, `Device.AUTO` or `Device.CUDA`; mutually exclusive
with `providers`. Defaults to None.
lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
Expand Down
4 changes: 2 additions & 2 deletions fastembed/late_interaction/late_interaction_text_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from dataclasses import asdict

from fastembed.common.model_description import DenseModelDescription
from fastembed.common.types import NumpyArray
from fastembed.common.types import NumpyArray, Device
from fastembed.common import OnnxProvider
from fastembed.late_interaction.colbert import Colbert
from fastembed.late_interaction.jina_colbert import JinaColbert
Expand Down Expand Up @@ -54,7 +54,7 @@ def __init__(
cache_dir: str | None = None,
threads: int | None = None,
providers: Sequence[OnnxProvider] | None = None,
cuda: bool = False,
cuda: bool | Device = Device.AUTO,
device_ids: list[int] | None = None,
lazy_load: bool = False,
**kwargs: Any,
Expand Down
11 changes: 6 additions & 5 deletions fastembed/late_interaction_multimodal/colpali.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from fastembed.common import OnnxProvider, ImageInput
from fastembed.common.onnx_model import OnnxOutputContext
from fastembed.common.types import NumpyArray
from fastembed.common.types import NumpyArray, Device
from fastembed.common.utils import define_cache_dir, iter_batch
from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import (
LateInteractionMultimodalEmbeddingBase,
Expand Down Expand Up @@ -49,7 +49,7 @@ def __init__(
cache_dir: str | None = None,
threads: int | None = None,
providers: Sequence[OnnxProvider] | None = None,
cuda: bool = False,
cuda: bool | Device = Device.AUTO,
device_ids: list[int] | None = None,
lazy_load: bool = False,
device_id: int | None = None,
Expand All @@ -65,10 +65,11 @@ def __init__(
threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
Defaults to False.
cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`.
Defaults to Device.AUTO.
device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
workers. Should be used with `cuda` set to `True`, `Device.AUTO` or `Device.CUDA`; mutually exclusive
with `providers`. Defaults to None.
lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from dataclasses import asdict

from fastembed.common import OnnxProvider, ImageInput
from fastembed.common.types import NumpyArray
from fastembed.common.types import NumpyArray, Device
from fastembed.late_interaction_multimodal.colpali import ColPali

from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import (
Expand Down Expand Up @@ -57,7 +57,7 @@ def __init__(
cache_dir: str | None = None,
threads: int | None = None,
providers: Sequence[OnnxProvider] | None = None,
cuda: bool = False,
cuda: bool | Device = Device.AUTO,
device_ids: list[int] | None = None,
lazy_load: bool = False,
**kwargs: Any,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from fastembed.common import OnnxProvider, ImageInput
from fastembed.common.onnx_model import EmbeddingWorker, OnnxModel, OnnxOutputContext, T
from fastembed.common.preprocessor_utils import load_tokenizer, load_preprocessor
from fastembed.common.types import NumpyArray
from fastembed.common.types import NumpyArray, Device
from fastembed.common.utils import iter_batch
from fastembed.image.transform.operators import Compose
from fastembed.parallel_processor import ParallelWorkerPool
Expand Down Expand Up @@ -62,7 +62,7 @@ def _load_onnx_model(
model_file: str,
threads: int | None,
providers: Sequence[OnnxProvider] | None = None,
cuda: bool = False,
cuda: bool | Device = Device.AUTO,
device_id: int | None = None,
extra_session_options: dict[str, Any] | None = None,
) -> None:
Expand Down Expand Up @@ -120,7 +120,7 @@ def _embed_documents(
batch_size: int = 256,
parallel: int | None = None,
providers: Sequence[OnnxProvider] | None = None,
cuda: bool = False,
cuda: bool | Device = Device.AUTO,
device_ids: list[int] | None = None,
local_files_only: bool = False,
specific_model_path: str | None = None,
Expand Down Expand Up @@ -191,7 +191,7 @@ def _embed_images(
batch_size: int = 256,
parallel: int | None = None,
providers: Sequence[OnnxProvider] | None = None,
cuda: bool = False,
cuda: bool | Device = Device.AUTO,
device_ids: list[int] | None = None,
local_files_only: bool = False,
specific_model_path: str | None = None,
Expand Down
3 changes: 2 additions & 1 deletion fastembed/parallel_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from queue import Empty
from typing import Any, Iterable, Type

from fastembed.common.types import Device

# Single item should be processed in less than:
processing_timeout = 10 * 60 # seconds
Expand Down Expand Up @@ -95,7 +96,7 @@ def __init__(
worker: Type[Worker],
start_method: str | None = None,
device_ids: list[int] | None = None,
cuda: bool = False,
cuda: bool | Device = Device.AUTO,
):
self.worker_class = worker
self.num_workers = num_workers
Expand Down
3 changes: 2 additions & 1 deletion fastembed/rerank/cross_encoder/custom_text_cross_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from fastembed.common import OnnxProvider
from fastembed.common.model_description import BaseModelDescription
from fastembed.common.types import Device
from fastembed.rerank.cross_encoder.onnx_text_cross_encoder import OnnxTextCrossEncoder


Expand All @@ -14,7 +15,7 @@ def __init__(
cache_dir: str | None = None,
threads: int | None = None,
providers: Sequence[OnnxProvider] | None = None,
cuda: bool = False,
cuda: bool | Device = Device.AUTO,
device_ids: list[int] | None = None,
lazy_load: bool = False,
device_id: int | None = None,
Expand Down
10 changes: 6 additions & 4 deletions fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from fastembed.common import OnnxProvider
from fastembed.common.onnx_model import OnnxOutputContext
from fastembed.common.types import Device
from fastembed.common.utils import define_cache_dir
from fastembed.rerank.cross_encoder.onnx_text_model import (
OnnxCrossEncoderModel,
Expand Down Expand Up @@ -80,7 +81,7 @@ def __init__(
cache_dir: str | None = None,
threads: int | None = None,
providers: Sequence[OnnxProvider] | None = None,
cuda: bool = False,
cuda: bool | Device = Device.AUTO,
device_ids: list[int] | None = None,
lazy_load: bool = False,
device_id: int | None = None,
Expand All @@ -96,10 +97,11 @@ def __init__(
threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
Defaults to False.
cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`.
Defaults to Device.AUTO.
device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
workers. Should be used with `cuda` set to `True`, `Device.AUTO` or `Device.CUDA`; mutually exclusive
with `providers`. Defaults to None.
lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
Expand Down
6 changes: 3 additions & 3 deletions fastembed/rerank/cross_encoder/onnx_text_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
OnnxOutputContext,
OnnxProvider,
)
from fastembed.common.types import NumpyArray
from fastembed.common.types import NumpyArray, Device
from fastembed.common.preprocessor_utils import load_tokenizer
from fastembed.common.utils import iter_batch
from fastembed.parallel_processor import ParallelWorkerPool
Expand All @@ -31,7 +31,7 @@ def _load_onnx_model(
model_file: str,
threads: int | None,
providers: Sequence[OnnxProvider] | None = None,
cuda: bool = False,
cuda: bool | Device = Device.AUTO,
device_id: int | None = None,
extra_session_options: dict[str, Any] | None = None,
) -> None:
Expand Down Expand Up @@ -94,7 +94,7 @@ def _rerank_pairs(
batch_size: int,
parallel: int | None = None,
providers: Sequence[OnnxProvider] | None = None,
cuda: bool = False,
cuda: bool | Device = Device.AUTO,
device_ids: list[int] | None = None,
local_files_only: bool = False,
specific_model_path: str | None = None,
Expand Down
3 changes: 2 additions & 1 deletion fastembed/rerank/cross_encoder/text_cross_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from dataclasses import asdict

from fastembed.common import OnnxProvider
from fastembed.common.types import Device
from fastembed.rerank.cross_encoder.onnx_text_cross_encoder import OnnxTextCrossEncoder
from fastembed.rerank.cross_encoder.custom_text_cross_encoder import CustomTextCrossEncoder

Expand Down Expand Up @@ -56,7 +57,7 @@ def __init__(
cache_dir: str | None = None,
threads: int | None = None,
providers: Sequence[OnnxProvider] | None = None,
cuda: bool = False,
cuda: bool | Device = Device.AUTO,
device_ids: list[int] | None = None,
lazy_load: bool = False,
**kwargs: Any,
Expand Down
Loading