Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/python-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.9.x'
python-version: '3.10.x'
- name: Install dependencies
run: |
python -m pip install poetry
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/python-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ jobs:
strategy:
matrix:
python-version:
- '3.9.x'
- '3.10.x'
- '3.11.x'
- '3.12.x'
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/type-checkers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:
strategy:
fail-fast: true
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
python-version: ["3.10", "3.11", "3.12", "3.13"]
os: [ubuntu-latest]

name: Python ${{ matrix.python-version }} test
Expand Down
14 changes: 7 additions & 7 deletions fastembed/common/model_description.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from dataclasses import dataclass, field
from enum import Enum
from typing import Optional, Any
from typing import Any


@dataclass(frozen=True)
class ModelSource:
hf: Optional[str] = None
url: Optional[str] = None
hf: str | None = None
url: str | None = None
_deprecated_tar_struct: bool = False

@property
Expand All @@ -33,17 +33,17 @@ class BaseModelDescription:

@dataclass(frozen=True)
class DenseModelDescription(BaseModelDescription):
dim: Optional[int] = None
tasks: Optional[dict[str, Any]] = field(default_factory=dict)
dim: int | None = None
tasks: dict[str, Any] | None = field(default_factory=dict)

def __post_init__(self) -> None:
assert self.dim is not None, "dim is required for dense model description"


@dataclass(frozen=True)
class SparseModelDescription(BaseModelDescription):
requires_idf: Optional[bool] = None
vocab_size: Optional[int] = None
requires_idf: bool | None = None
vocab_size: int | None = None


class PoolingType(str, Enum):
Expand Down
12 changes: 5 additions & 7 deletions fastembed/common/model_management.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import tarfile
from copy import deepcopy
from pathlib import Path
from typing import Any, Optional, Union, TypeVar, Generic
from typing import Any, TypeVar, Generic

import requests
from huggingface_hub import snapshot_download, model_info, list_repo_tree
Expand Down Expand Up @@ -180,8 +180,8 @@ def _verify_files_from_metadata(

def _collect_file_metadata(
model_dir: Path, repo_files: list[RepoFile]
) -> dict[str, dict[str, Union[int, str]]]:
meta: dict[str, dict[str, Union[int, str]]] = {}
) -> dict[str, dict[str, int | str]]:
meta: dict[str, dict[str, int | str]] = {}
file_info_map = {f.path: f for f in repo_files}
for file_path in model_dir.rglob("*"):
if file_path.is_file() and file_path.name != cls.METADATA_FILE:
Expand All @@ -193,9 +193,7 @@ def _collect_file_metadata(
}
return meta

def _save_file_metadata(
model_dir: Path, meta: dict[str, dict[str, Union[int, str]]]
) -> None:
def _save_file_metadata(model_dir: Path, meta: dict[str, dict[str, int | str]]) -> None:
try:
if not model_dir.exists():
model_dir.mkdir(parents=True, exist_ok=True)
Expand Down Expand Up @@ -397,7 +395,7 @@ def download_model(cls, model: T, cache_dir: str, retries: int = 3, **kwargs: An
Path: The path to the downloaded model directory.
"""
local_files_only = kwargs.get("local_files_only", False)
specific_model_path: Optional[str] = kwargs.pop("specific_model_path", None)
specific_model_path: str | None = kwargs.pop("specific_model_path", None)
if specific_model_path:
return Path(specific_model_path)
retries = 1 if local_files_only else retries
Expand Down
18 changes: 9 additions & 9 deletions fastembed/common/onnx_model.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Generic, Iterable, Optional, Sequence, Type, TypeVar
from typing import Any, Generic, Iterable, Sequence, Type, TypeVar

import numpy as np
import onnxruntime as ort
Expand All @@ -19,8 +19,8 @@
@dataclass
class OnnxOutputContext:
model_output: NumpyArray
attention_mask: Optional[NDArray[np.int64]] = None
input_ids: Optional[NDArray[np.int64]] = None
attention_mask: NDArray[np.int64] | None = None
input_ids: NDArray[np.int64] | None = None
Comment on lines +22 to +23
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Avoid NDArray[...] | None in dataclass fields; keep Union here.

For OnnxOutputContext, annotating attention_mask and input_ids as NDArray[np.int64] | None can trigger runtime issues: this module does not use `from __future__ import annotations`, so Python evaluates the class-body annotations eagerly when the class is created (before the dataclass decorator processes the fields), and NDArray[...] does not reliably support the | operator in this codebase. Based on prior experience, these fields should keep typing.Union[...] (or Optional[...]) instead of PEP 604 unions, with a short comment explaining why.

A possible adjustment:

 from numpy.typing import NDArray
+from typing import Union
@@
-    attention_mask: NDArray[np.int64] | None = None
-    input_ids: NDArray[np.int64] | None = None
+    # NOTE: Use Union instead of `|` with NDArray in dataclasses to avoid
+    # runtime annotation evaluation issues.
+    attention_mask: Union[NDArray[np.int64], None] = None
+    input_ids: Union[NDArray[np.int64], None] = None

Based on learnings, this keeps the modernisation elsewhere while avoiding the known NDArray/dataclass pitfall.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
-attention_mask: NDArray[np.int64] | None = None
-input_ids: NDArray[np.int64] | None = None
+# NOTE: Use Union instead of `|` with NDArray in dataclasses to avoid
+# runtime annotation evaluation issues.
+attention_mask: Union[NDArray[np.int64], None] = None
+input_ids: Union[NDArray[np.int64], None] = None
🤖 Prompt for AI Agents
In fastembed/common/onnx_model.py around lines 22 to 23, the dataclass fields
use PEP 604 style unions (NDArray[np.int64] | None) which can break at dataclass
creation time for NDArray; change these annotations back to
typing.Union[NDArray[np.int64], None] (or Optional[NDArray[np.int64]]) and add a
short comment noting this workaround (dataclass/NDArray incompatibility) so
other maintainers understand why Union is used instead of the | operator.



class OnnxModel(Generic[T]):
Expand All @@ -43,8 +43,8 @@ def _post_process_onnx_output(self, output: OnnxOutputContext, **kwargs: Any) ->
raise NotImplementedError("Subclasses must implement this method")

def __init__(self) -> None:
self.model: Optional[ort.InferenceSession] = None
self.tokenizer: Optional[Tokenizer] = None
self.model: ort.InferenceSession | None = None
self.tokenizer: Tokenizer | None = None

def _preprocess_onnx_input(
self, onnx_input: dict[str, NumpyArray], **kwargs: Any
Expand All @@ -58,11 +58,11 @@ def _load_onnx_model(
self,
model_dir: Path,
model_file: str,
threads: Optional[int],
providers: Optional[Sequence[OnnxProvider]] = None,
threads: int | None,
providers: Sequence[OnnxProvider] | None = None,
cuda: bool = False,
device_id: Optional[int] = None,
extra_session_options: Optional[dict[str, Any]] = None,
device_id: int | None = None,
extra_session_options: dict[str, Any] | None = None,
) -> None:
model_path = model_dir / model_file
# List of Execution Providers: https://onnxruntime.ai/docs/execution-providers
Expand Down
33 changes: 14 additions & 19 deletions fastembed/common/types.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,20 @@
from pathlib import Path
import sys
from PIL import Image
from typing import Any, Union

from typing import Any, TypeAlias
import numpy as np
from numpy.typing import NDArray

if sys.version_info >= (3, 10):
from typing import TypeAlias
else:
from typing_extensions import TypeAlias
from PIL import Image


PathInput: TypeAlias = Union[str, Path]
ImageInput: TypeAlias = Union[PathInput, Image.Image]
PathInput: TypeAlias = str | Path
ImageInput: TypeAlias = PathInput | Image.Image

OnnxProvider: TypeAlias = Union[str, tuple[str, dict[Any, Any]]]
NumpyArray = Union[
NDArray[np.float64],
NDArray[np.float32],
NDArray[np.float16],
NDArray[np.int8],
NDArray[np.int64],
NDArray[np.int32],
]
OnnxProvider: TypeAlias = str | tuple[str, dict[Any, Any]]
NumpyArray: TypeAlias = (
NDArray[np.float64]
| NDArray[np.float32]
| NDArray[np.float16]
| NDArray[np.int8]
| NDArray[np.int64]
| NDArray[np.int32]
)
Comment on lines +13 to +20
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

# First, find and examine the project's dependency specifications
find . -maxdepth 2 -type f \( -name "setup.py" -o -name "requirements.txt" -o -name "pyproject.toml" -o -name "setup.cfg" \) | head -20

Repository: qdrant/fastembed

Length of output: 76


🏁 Script executed:

# Examine the current state of fastembed/common/types.py
cat -n fastembed/common/types.py

Repository: qdrant/fastembed

Length of output: 652


🏁 Script executed:

# Search for usages of NumpyArray alias in the codebase
rg "NumpyArray" --type py -B 2 -A 2

Repository: qdrant/fastembed

Length of output: 50372


🏁 Script executed:

# Check if there are any recent changes or related issues with NDArray
git log --oneline --all --grep="NDArray\|NumpyArray" | head -20

Repository: qdrant/fastembed

Length of output: 42


🏁 Script executed:

# Check numpy version requirements
cat pyproject.toml | grep -A 10 -B 2 "numpy\|python"

Repository: qdrant/fastembed

Length of output: 146


🏁 Script executed:

# Try to import the types module to see if there are any immediate issues
python3 << 'EOF'
try:
    import sys
    sys.path.insert(0, '/repo')
    from fastembed.common.types import NumpyArray
    print("✓ NumpyArray imported successfully")
    print(f"NumpyArray = {NumpyArray}")
except Exception as e:
    print(f"✗ Import failed: {type(e).__name__}: {e}")
EOF

Repository: qdrant/fastembed

Length of output: 146


🏁 Script executed:

# Examine the sparse_embedding_base.py file where the prior issue occurred
head -30 fastembed/sparse/sparse_embedding_base.py

Repository: qdrant/fastembed

Length of output: 980


Use Union[...] instead of | operator for NumpyArray type alias

The current pipe-operator syntax NDArray[...] | NDArray[...] is evaluated eagerly when fastembed/common/types.py is imported, and the resulting alias is then reused in annotations throughout the package, including dataclass fields, which are evaluated at class definition time. NDArray types don't reliably support the __or__ operator across numpy versions, so a failure here would surface as a TypeError at import time rather than as a type-checker warning.

Switch to Union[...] syntax:

from typing import Union

NumpyArray: TypeAlias = Union[
    NDArray[np.float64],
    NDArray[np.float32],
    NDArray[np.float16],
    NDArray[np.int8],
    NDArray[np.int64],
    NDArray[np.int32],
]

Add a comment explaining why Union is preferred over the pipe operator for NDArray unions.

🤖 Prompt for AI Agents
In fastembed/common/types.py around lines 13 to 20, replace the pipe-operation
union of NDArray types with typing.Union: import Union from typing, change the
NumpyArray TypeAlias to use Union[...] with the listed NDArray[...] types, and
add a brief comment stating that Union is preferred because NDArray does not
reliably support the | operator across NumPy versions and dataclass annotation
evaluation; ensure imports are ordered and no runtime evaluation of the |
operator remains.

4 changes: 2 additions & 2 deletions fastembed/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import unicodedata
from pathlib import Path
from itertools import islice
from typing import Iterable, Optional, TypeVar
from typing import Iterable, TypeVar

import numpy as np
from numpy.typing import NDArray
Expand Down Expand Up @@ -45,7 +45,7 @@ def iter_batch(iterable: Iterable[T], size: int) -> Iterable[list[T]]:
yield b


def define_cache_dir(cache_dir: Optional[str] = None) -> Path:
def define_cache_dir(cache_dir: str | None = None) -> Path:
"""
Define the cache directory for fastembed
"""
Expand Down
6 changes: 3 additions & 3 deletions fastembed/embedding.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional, Any
from typing import Any

from loguru import logger

Expand All @@ -17,8 +17,8 @@ class JinaEmbedding(TextEmbedding):
def __init__(
self,
model_name: str = "jinaai/jina-embeddings-v2-base-en",
cache_dir: Optional[str] = None,
threads: Optional[int] = None,
cache_dir: str | None = None,
threads: int | None = None,
**kwargs: Any,
):
super().__init__(model_name, cache_dir, threads, **kwargs)
16 changes: 8 additions & 8 deletions fastembed/image/image_embedding.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, Iterable, Optional, Sequence, Type, Union
from typing import Any, Iterable, Sequence, Type
from dataclasses import asdict

from fastembed.common.types import NumpyArray
Expand Down Expand Up @@ -48,11 +48,11 @@ def _list_supported_models(cls) -> list[DenseModelDescription]:
def __init__(
self,
model_name: str,
cache_dir: Optional[str] = None,
threads: Optional[int] = None,
providers: Optional[Sequence[OnnxProvider]] = None,
cache_dir: str | None = None,
threads: int | None = None,
providers: Sequence[OnnxProvider] | None = None,
cuda: bool = False,
device_ids: Optional[list[int]] = None,
device_ids: list[int] | None = None,
lazy_load: bool = False,
**kwargs: Any,
):
Expand Down Expand Up @@ -98,7 +98,7 @@ def get_embedding_size(cls, model_name: str) -> int:
ValueError: If the model name is not found in the supported models.
"""
descriptions = cls._list_supported_models()
embedding_size: Optional[int] = None
embedding_size: int | None = None
for description in descriptions:
if description.model.lower() == model_name.lower():
embedding_size = description.dim
Expand All @@ -113,9 +113,9 @@ def get_embedding_size(cls, model_name: str) -> int:

def embed(
self,
images: Union[ImageInput, Iterable[ImageInput]],
images: ImageInput | Iterable[ImageInput],
batch_size: int = 16,
parallel: Optional[int] = None,
parallel: int | None = None,
**kwargs: Any,
) -> Iterable[NumpyArray]:
"""
Expand Down
12 changes: 6 additions & 6 deletions fastembed/image/image_embedding_base.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Iterable, Optional, Any, Union
from typing import Iterable, Any

from fastembed.common.model_description import DenseModelDescription
from fastembed.common.types import NumpyArray
Expand All @@ -10,21 +10,21 @@ class ImageEmbeddingBase(ModelManagement[DenseModelDescription]):
def __init__(
self,
model_name: str,
cache_dir: Optional[str] = None,
threads: Optional[int] = None,
cache_dir: str | None = None,
threads: int | None = None,
**kwargs: Any,
):
self.model_name = model_name
self.cache_dir = cache_dir
self.threads = threads
self._local_files_only = kwargs.pop("local_files_only", False)
self._embedding_size: Optional[int] = None
self._embedding_size: int | None = None

def embed(
self,
images: Union[ImageInput, Iterable[ImageInput]],
images: ImageInput | Iterable[ImageInput],
batch_size: int = 16,
parallel: Optional[int] = None,
parallel: int | None = None,
**kwargs: Any,
) -> Iterable[NumpyArray]:
"""
Expand Down
20 changes: 10 additions & 10 deletions fastembed/image/onnx_embedding.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, Iterable, Optional, Sequence, Type, Union
from typing import Any, Iterable, Sequence, Type


from fastembed.common.types import NumpyArray
Expand Down Expand Up @@ -63,14 +63,14 @@ class OnnxImageEmbedding(ImageEmbeddingBase, OnnxImageModel[NumpyArray]):
def __init__(
self,
model_name: str,
cache_dir: Optional[str] = None,
threads: Optional[int] = None,
providers: Optional[Sequence[OnnxProvider]] = None,
cache_dir: str | None = None,
threads: int | None = None,
providers: Sequence[OnnxProvider] | None = None,
cuda: bool = False,
device_ids: Optional[list[int]] = None,
device_ids: list[int] | None = None,
lazy_load: bool = False,
device_id: Optional[int] = None,
specific_model_path: Optional[str] = None,
device_id: int | None = None,
specific_model_path: str | None = None,
**kwargs: Any,
):
"""
Expand Down Expand Up @@ -105,7 +105,7 @@ def __init__(
self.cuda = cuda

# This device_id will be used if we need to load model in current process
self.device_id: Optional[int] = None
self.device_id: int | None = None
if device_id is not None:
self.device_id = device_id
elif self.device_ids is not None:
Expand Down Expand Up @@ -150,9 +150,9 @@ def _list_supported_models(cls) -> list[DenseModelDescription]:

def embed(
self,
images: Union[ImageInput, Iterable[ImageInput]],
images: ImageInput | Iterable[ImageInput],
batch_size: int = 16,
parallel: Optional[int] = None,
parallel: int | None = None,
**kwargs: Any,
) -> Iterable[NumpyArray]:
"""
Expand Down
Loading