From a38c30919ef537aae9a97ed38789a20049e59149 Mon Sep 17 00:00:00 2001
From: George Panchuk
Date: Tue, 2 Dec 2025 00:24:11 +0700
Subject: [PATCH 1/4] new: add token count method

---
 fastembed/late_interaction/colbert.py            | 64 ++++++++++++++++++-
 .../late_interaction_embedding_base.py           | 10 +++
 .../late_interaction_text_embedding.py           | 22 +++++++
 fastembed/late_interaction_multimodal/colpali.py | 15 ++++-
 .../late_interaction_multimodal_embedding.py     | 20 ++++++
 ...e_interaction_multimodal_embedding_base.py    |  9 +++
 .../cross_encoder/onnx_text_cross_encoder.py     | 12 ++++
 .../rerank/cross_encoder/onnx_text_model.py      |  9 +++
 .../cross_encoder/text_cross_encoder.py          | 12 ++++
 .../cross_encoder/text_cross_encoder_base.py     |  4 ++
 fastembed/sparse/bm25.py                         |  9 +++
 fastembed/sparse/bm42.py                         |  3 +
 fastembed/sparse/minicoil.py                     |  3 +
 fastembed/sparse/sparse_embedding_base.py        |  4 ++
 fastembed/sparse/sparse_text_embedding.py        | 12 ++++
 fastembed/sparse/splade_pp.py                    |  3 +
 fastembed/text/onnx_embedding.py                 |  3 +
 fastembed/text/onnx_text_model.py                |  9 +++
 fastembed/text/text_embedding.py                 | 12 ++++
 fastembed/text/text_embedding_base.py            |  4 ++
 tests/test_late_interaction_embeddings.py        | 24 +++++++
 tests/test_late_interaction_multimodal.py        | 19 ++++++
 tests/test_sparse_embeddings.py                  | 22 +++++++
 tests/test_text_cross_encoder.py                 | 17 +++++
 tests/test_text_onnx_embeddings.py               | 14 ++++
 25 files changed, 333 insertions(+), 2 deletions(-)

diff --git a/fastembed/late_interaction/colbert.py b/fastembed/late_interaction/colbert.py
index 4dfc2a05..27dd0653 100644
--- a/fastembed/late_interaction/colbert.py
+++ b/fastembed/late_interaction/colbert.py
@@ -8,7 +8,7 @@
 from fastembed.common.types import NumpyArray
 from fastembed.common import OnnxProvider
 from fastembed.common.onnx_model import OnnxOutputContext
-from fastembed.common.utils import define_cache_dir
+from fastembed.common.utils import define_cache_dir, iter_batch
 from fastembed.late_interaction.late_interaction_embedding_base import (
     LateInteractionTextEmbeddingBase,
 )
@@ -96,6 +96,34 @@ def _tokenize_documents(self, documents: list[str]) -> list[Encoding]:
         encoded = self.tokenizer.encode_batch(documents)  # type: ignore[union-attr]
         return encoded
 
+    def token_count(
+        self,
+        texts: Union[str, Iterable[str]],
+        batch_size: int = 1024,
+        is_doc: bool = True,
+        include_extension: bool = False,
+    ) -> int:
+        token_num = 0
+        texts = texts if isinstance(texts, list) else [texts]
+        tokenizer = self.tokenizer if is_doc else self.query_tokenizer
+        for batch in iter_batch(texts, batch_size):
+            for tokens in tokenizer.encode_batch(batch):
+                if is_doc:
+                    token_num += sum(tokens.attention_mask)
+                else:
+                    attend_count = sum(tokens.attention_mask)
+                    if include_extension:
+                        token_num += max(attend_count, self.MIN_QUERY_LENGTH)
+
+                    else:
+                        token_num += attend_count
+            if include_extension:
+                token_num += len(
+                    batch
+                )  # add 1 per text for the DOC_MARKER_TOKEN_ID / QUERY_MARKER_TOKEN_ID marker
+
+        return token_num
+
     @classmethod
     def _list_supported_models(cls) -> list[DenseModelDescription]:
         """Lists the supported models.
@@ -266,3 +294,37 @@ def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> Colb
             threads=1,
             **kwargs,
         )
+
+
+if __name__ == "__main__":
+    te = Colbert("answerdotai/answerai-colbert-small-v1")
+
+    print(
+        te.token_count(
+            texts=[
+                "qwe",
+                "adsda ads qwe dasd dsad cxc czx as qw er tr y fg s a x b h f d a s w e t y k,l; czcx asd ",
+            ]
+        )
+    )
+    print(
+        te.token_count(
+            texts=[
+                "qwe",
+                "adsda ads qwe dasd dsad cxc czx as qw er tr y fg s a x b h f d a s w e t y k,l; czcx asd ",
+            ],
+            is_doc=False,
+        )
+    )
+    # data = []
+    # with open('../../training_data.csv', 'r') as f:
+    #     for i, line in enumerate(f):
+    #
+    #         if i == 0:
+    #             continue
+    #
+    #         data.append(line.rsplit(',', maxsplit=1)[0][1:-1])
+    # import time
+    # a = time.perf_counter()
+    # te.token_count(data, batch_size=1024)
+    # print(time.perf_counter() - a)
diff --git a/fastembed/late_interaction/late_interaction_embedding_base.py b/fastembed/late_interaction/late_interaction_embedding_base.py
index f677ba98..bf1ec9ce 100644
--- a/fastembed/late_interaction/late_interaction_embedding_base.py
+++ b/fastembed/late_interaction/late_interaction_embedding_base.py
@@ -69,3 +69,13 @@ def get_embedding_size(cls, model_name: str) -> int:
     def embedding_size(self) -> int:
         """Returns embedding size for the current model"""
         raise NotImplementedError("Subclasses must implement this method")
+
+    def token_count(
+        self,
+        texts: Union[str, Iterable[str]],
+        batch_size: int = 1024,
+        is_doc: bool = True,
+        include_extension: bool = False,
+    ) -> int:
+        """Returns the number of tokens in the texts."""
+        raise NotImplementedError("Subclasses must implement this method")
diff --git a/fastembed/late_interaction/late_interaction_text_embedding.py b/fastembed/late_interaction/late_interaction_text_embedding.py
index 22833618..ea5b22e7 100644
--- a/fastembed/late_interaction/late_interaction_text_embedding.py
+++ b/fastembed/late_interaction/late_interaction_text_embedding.py
@@ -151,3 +151,25 @@ def query_embed(self, query: Union[str, Iterable[str]], **kwargs: Any) -> Iterab
 
         # This is model-specific, so that different models can have specialized implementations
         yield from self.model.query_embed(query, **kwargs)
+
+    def token_count(
+        self,
+        texts: Union[str, Iterable[str]],
+        batch_size: int = 1024,
+        is_doc: bool = True,
+        include_extension: bool = False,
+    ) -> int:
+        """Returns the number of tokens in the texts.
+
+        Args:
+            texts (str | Iterable[str]): The list of texts to embed.
+            batch_size (int): Batch size for encoding
+            is_doc (bool): Whether to tokenize the texts as documents (True) or as queries (False).
+            include_extension (bool): Turn on to count DOC / QUERY marker tokens, and [MASK] padding in query mode.
+
+        Returns:
+            int: Sum of number of tokens in the texts.
+ """ + return self.model.token_count( + texts, batch_size=batch_size, is_doc=is_doc, include_extension=include_extension + ) diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py index 059f7971..a676076c 100644 --- a/fastembed/late_interaction_multimodal/colpali.py +++ b/fastembed/late_interaction_multimodal/colpali.py @@ -6,7 +6,7 @@ from fastembed.common import OnnxProvider, ImageInput from fastembed.common.onnx_model import OnnxOutputContext from fastembed.common.types import NumpyArray -from fastembed.common.utils import define_cache_dir +from fastembed.common.utils import define_cache_dir, iter_batch from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import ( LateInteractionMultimodalEmbeddingBase, ) @@ -172,6 +172,19 @@ def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]: encoded = self.tokenizer.encode_batch(texts_query) # type: ignore[union-attr] return encoded + def token_count( + self, + texts: Union[str, Iterable[str]], + batch_size: int = 1024, + include_extension: bool = False, + ) -> int: + token_num = 0 + texts = texts if isinstance(texts, list) else [texts] + tokenize_func = self.tokenize if include_extension else self.tokenizer.encode_batch + for batch in iter_batch(texts, batch_size): + token_num += sum([sum(encoding.attention_mask) for encoding in tokenize_func(batch)]) + return token_num + def _preprocess_onnx_text_input( self, onnx_input: dict[str, NumpyArray], **kwargs: Any ) -> dict[str, NumpyArray]: diff --git a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py index 39c1763e..0e4c9263 100644 --- a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py +++ b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py @@ -162,3 +162,23 @@ def embed_image( List of embeddings, one per image """ yield from self.model.embed_image(images, batch_size, parallel, **kwargs) + + def token_count( + self, + texts: Union[str, Iterable[str]], + batch_size: int = 1024, + include_extension: bool = False, + ) -> int: + """Returns the number of tokens in the texts. + + Args: + texts (str | Iterable[str]): The list of texts to embed. + batch_size (int): Batch size for encoding + include_extension (bool): Whether to include tokens added by preprocessing + + Returns: + int: Sum of number of tokens in the texts. 
+ """ + return self.model.token_count( + texts, batch_size=batch_size, include_extension=include_extension + ) diff --git a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py index 12e3553c..f813cec6 100644 --- a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py +++ b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py @@ -76,3 +76,12 @@ def get_embedding_size(cls, model_name: str) -> int: def embedding_size(self) -> int: """Returns embedding size for the current model""" raise NotImplementedError("Subclasses must implement this method") + + def token_count( + self, + texts: Union[str, Iterable[str]], + batch_size: int = 1024, + include_extension: bool = False, + ) -> int: + """Returns the number of tokens in the texts.""" + raise NotImplementedError("Subclasses must implement this method") diff --git a/fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py b/fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py index 56f2b86c..6668ad4e 100644 --- a/fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py +++ b/fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py @@ -207,6 +207,18 @@ def _post_process_onnx_output( ) -> Iterable[float]: return (float(elem) for elem in output.model_output) + def token_count(self, pairs: Iterable[tuple[str, str]], batch_size: int = 1024) -> int: + """Returns the number of tokens in the pairs. + + Args: + pairs: Iterable of tuples, where each tuple contains a query and a document to be tokenized + batch_size: Batch size for tokenizing + + Returns: + token count: overall number of tokens in the pairs + """ + return self._token_count(pairs, batch_size=batch_size) + class TextCrossEncoderWorker(TextRerankerWorker): def init_embedding( diff --git a/fastembed/rerank/cross_encoder/onnx_text_model.py b/fastembed/rerank/cross_encoder/onnx_text_model.py index 5c85d27e..34489e6b 100644 --- a/fastembed/rerank/cross_encoder/onnx_text_model.py +++ b/fastembed/rerank/cross_encoder/onnx_text_model.py @@ -165,6 +165,15 @@ def _preprocess_onnx_input( """ return onnx_input + def _token_count(self, pairs: Iterable[tuple[str, str]], batch_size: int = 1024) -> int: + token_num = 0 + + for batch in iter_batch(pairs, batch_size): + for tokens in self.tokenizer.encode_batch(batch): + token_num += sum(tokens.attention_mask) + + return token_num + class TextRerankerWorker(EmbeddingWorker[float]): def __init__( diff --git a/fastembed/rerank/cross_encoder/text_cross_encoder.py b/fastembed/rerank/cross_encoder/text_cross_encoder.py index e269570d..ce2869db 100644 --- a/fastembed/rerank/cross_encoder/text_cross_encoder.py +++ b/fastembed/rerank/cross_encoder/text_cross_encoder.py @@ -161,3 +161,15 @@ def add_custom_model( additional_files=additional_files or [], ) ) + + def token_count(self, pairs: Iterable[tuple[str, str]], batch_size: int = 1024) -> int: + """Returns the number of tokens in the pairs. 
+
+        Args:
+            pairs: Iterable of tuples, where each tuple contains a query and a document to be tokenized
+            batch_size: Batch size for tokenizing
+
+        Returns:
+            token count: overall number of tokens in the pairs
+        """
+        return self.model.token_count(pairs, batch_size=batch_size)
diff --git a/fastembed/rerank/cross_encoder/text_cross_encoder_base.py b/fastembed/rerank/cross_encoder/text_cross_encoder_base.py
index 84b44e41..42fdea71 100644
--- a/fastembed/rerank/cross_encoder/text_cross_encoder_base.py
+++ b/fastembed/rerank/cross_encoder/text_cross_encoder_base.py
@@ -57,3 +57,7 @@ def rerank_pairs(
             Iterable[float]: Scores for each individual pair
         """
         raise NotImplementedError("This method should be overridden by subclasses")
+
+    def token_count(self, pairs: Iterable[tuple[str, str]], batch_size: int = 1024) -> int:
+        """Returns the number of tokens in the pairs."""
+        raise NotImplementedError("This method should be overridden by subclasses")
diff --git a/fastembed/sparse/bm25.py b/fastembed/sparse/bm25.py
index b6ac59fd..d2b405ad 100644
--- a/fastembed/sparse/bm25.py
+++ b/fastembed/sparse/bm25.py
@@ -268,6 +268,15 @@ def raw_embed(
         embeddings.append(SparseEmbedding.from_dict(token_id2value))
         return embeddings
 
+    def token_count(self, texts: Union[str, Iterable[str]], **kwargs: Any) -> int:
+        token_num = 0
+        texts = texts if isinstance(texts, list) else [texts]
+        for text in texts:
+            document = remove_non_alphanumeric(text)
+            tokens = self.tokenizer.tokenize(document)
+            token_num += len(tokens)
+        return token_num
+
     def _term_frequency(self, tokens: list[str]) -> dict[int, float]:
         """Calculate the term frequency part of the BM25 formula.
 
diff --git a/fastembed/sparse/bm42.py b/fastembed/sparse/bm42.py
index 848b1753..4a9103d7 100644
--- a/fastembed/sparse/bm42.py
+++ b/fastembed/sparse/bm42.py
@@ -352,6 +352,9 @@ def query_embed(
     def _get_worker_class(cls) -> Type[TextEmbeddingWorker[SparseEmbedding]]:
         return Bm42TextEmbeddingWorker
 
+    def token_count(self, texts: Union[str, Iterable[str]], batch_size: int = 1024) -> int:
+        return self._token_count(texts, batch_size=batch_size)
+
 
 class Bm42TextEmbeddingWorker(TextEmbeddingWorker[SparseEmbedding]):
     def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> Bm42:
diff --git a/fastembed/sparse/minicoil.py b/fastembed/sparse/minicoil.py
index dde52d90..be0c0638 100644
--- a/fastembed/sparse/minicoil.py
+++ b/fastembed/sparse/minicoil.py
@@ -187,6 +187,9 @@ def load_onnx_model(self) -> None:
             avg_len=self.avg_len,
         )
 
+    def token_count(self, texts: Union[str, Iterable[str]], batch_size: int = 1024) -> int:
+        return self._token_count(texts, batch_size=batch_size)
+
     def embed(
         self,
         documents: Union[str, Iterable[str]],
diff --git a/fastembed/sparse/sparse_embedding_base.py b/fastembed/sparse/sparse_embedding_base.py
index b153c814..ae8c3968 100644
--- a/fastembed/sparse/sparse_embedding_base.py
+++ b/fastembed/sparse/sparse_embedding_base.py
@@ -86,3 +86,7 @@ def query_embed(
             yield from self.embed([query], **kwargs)
         else:
             yield from self.embed(query, **kwargs)
+
+    def token_count(self, texts: Union[str, Iterable[str]], batch_size: int = 1024) -> int:
+        """Returns the number of tokens in the texts."""
+        raise NotImplementedError("Subclasses must implement this method")
diff --git a/fastembed/sparse/sparse_text_embedding.py b/fastembed/sparse/sparse_text_embedding.py
index 3cb14c3e..0d8a2153 100644
--- a/fastembed/sparse/sparse_text_embedding.py
+++ b/fastembed/sparse/sparse_text_embedding.py
@@ -128,3 +128,15 @@ def query_embed(
             Iterable[SparseEmbedding]: The sparse embeddings.
         """
         yield from self.model.query_embed(query, **kwargs)
+
+    def token_count(self, texts: Union[str, Iterable[str]], batch_size: int = 1024) -> int:
+        """Returns the number of tokens in the texts.
+
+        Args:
+            texts (str | Iterable[str]): The list of texts to embed.
+            batch_size (int): Batch size for encoding
+
+        Returns:
+            int: Sum of number of tokens in the texts.
+        """
+        return self.model.token_count(texts, batch_size=batch_size)
diff --git a/fastembed/sparse/splade_pp.py b/fastembed/sparse/splade_pp.py
index 95e43bb2..32497d65 100644
--- a/fastembed/sparse/splade_pp.py
+++ b/fastembed/sparse/splade_pp.py
@@ -53,6 +53,9 @@ def _post_process_onnx_output(
         scores = row_scores[indices]
         yield SparseEmbedding(values=scores, indices=indices)
 
+    def token_count(self, texts: Union[str, Iterable[str]], batch_size: int = 1024) -> int:
+        return self._token_count(texts, batch_size=batch_size)
+
     @classmethod
     def _list_supported_models(cls) -> list[SparseModelDescription]:
         """Lists the supported models.
diff --git a/fastembed/text/onnx_embedding.py b/fastembed/text/onnx_embedding.py
index d76db8bf..a9c26736 100644
--- a/fastembed/text/onnx_embedding.py
+++ b/fastembed/text/onnx_embedding.py
@@ -331,6 +331,9 @@ def load_onnx_model(self) -> None:
             extra_session_options=self._extra_session_options,
         )
 
+    def token_count(self, texts: Union[str, Iterable[str]], batch_size: int = 1024) -> int:
+        return self._token_count(texts, batch_size=batch_size)
+
 
 class OnnxTextEmbeddingWorker(TextEmbeddingWorker[NumpyArray]):
     def init_embedding(
diff --git a/fastembed/text/onnx_text_model.py b/fastembed/text/onnx_text_model.py
index 6cb49178..4a49656e 100644
--- a/fastembed/text/onnx_text_model.py
+++ b/fastembed/text/onnx_text_model.py
@@ -159,6 +159,15 @@ def _embed_documents(
         for batch in pool.ordered_map(iter_batch(documents, batch_size), **params):
             yield from self._post_process_onnx_output(batch, **kwargs)  # type: ignore
 
+    def _token_count(self, texts: Union[str, Iterable[str]], batch_size: int = 1024) -> int:
+        token_num = 0
+        texts = texts if isinstance(texts, list) else [texts]
+        for batch in iter_batch(texts, batch_size):
+            for tokens in self.tokenizer.encode_batch(batch):
+                token_num += sum(tokens.attention_mask)
+
+        return token_num
+
 
 class TextEmbeddingWorker(EmbeddingWorker[T]):
     def process(self, items: Iterable[tuple[int, Any]]) -> Iterable[tuple[int, OnnxOutputContext]]:
diff --git a/fastembed/text/text_embedding.py b/fastembed/text/text_embedding.py
index 117f5af7..576b6eb5 100644
--- a/fastembed/text/text_embedding.py
+++ b/fastembed/text/text_embedding.py
@@ -212,3 +212,15 @@ def passage_embed(self, texts: Iterable[str], **kwargs: Any) -> Iterable[NumpyAr
         """
         # This is model-specific, so that different models can have specialized implementations
         yield from self.model.passage_embed(texts, **kwargs)
+
+    def token_count(self, texts: Union[str, Iterable[str]], batch_size: int = 1024) -> int:
+        """Returns the number of tokens in the texts.
+
+        Args:
+            texts (str | Iterable[str]): The list of texts to embed.
+            batch_size (int): Batch size for encoding
+
+        Returns:
+            int: Sum of number of tokens in the texts.
+ """ + return self.model.token_count(texts, batch_size=batch_size) diff --git a/fastembed/text/text_embedding_base.py b/fastembed/text/text_embedding_base.py index 75df9ac5..f0a1d135 100644 --- a/fastembed/text/text_embedding_base.py +++ b/fastembed/text/text_embedding_base.py @@ -69,3 +69,7 @@ def get_embedding_size(cls, model_name: str) -> int: def embedding_size(self) -> int: """Returns embedding size for the current model""" raise NotImplementedError("Subclasses must implement this method") + + def token_count(self, texts: Union[str, Iterable[str]], batch_size: int = 1024) -> int: + """Returns the number of tokens in the texts.""" + raise NotImplementedError("Subclasses must implement this method") diff --git a/tests/test_late_interaction_embeddings.py b/tests/test_late_interaction_embeddings.py index f2499db8..ea83e76a 100644 --- a/tests/test_late_interaction_embeddings.py +++ b/tests/test_late_interaction_embeddings.py @@ -318,3 +318,27 @@ def test_session_options(model_cache, model_name) -> None: model = LateInteractionTextEmbedding(model_name=model_name, enable_cpu_mem_arena=False) session_options = model.model.model.get_session_options() assert session_options.enable_cpu_mem_arena is False + + +@pytest.mark.parametrize("model_name", ["answerdotai/answerai-colbert-small-v1"]) +def test_token_count(model_cache, model_name) -> None: + with model_cache(model_name) as model: + documents = ["short doc", "it is a long document to check attention mask for paddings"] + short_doc_token_count = model.token_count(documents[0]) + long_doc_token_count = model.token_count(documents[1]) + documents_token_count = model.token_count(documents) + assert short_doc_token_count + long_doc_token_count == documents_token_count + # 2 is 2*DOC_MARKER_TOKEN_ID for each document + assert short_doc_token_count + long_doc_token_count + 2 == model.token_count( + documents, include_extension=True + ) + assert short_doc_token_count + long_doc_token_count == model.token_count( + documents, batch_size=1 + ) + assert short_doc_token_count + long_doc_token_count == model.token_count( + documents, is_doc=False + ) + # query min length is 32 + assert model.token_count(documents, is_doc=False, include_extension=True) == 64 + very_long_query = "It's a very long query which definitely contains more than 32 tokens and we're using it to check whether the method can handle large query properly without cutting it to 32 tokens" + assert model.token_count(very_long_query, is_doc=False, include_extension=True) > 32 diff --git a/tests/test_late_interaction_multimodal.py b/tests/test_late_interaction_multimodal.py index 80135f3b..8a102ace 100644 --- a/tests/test_late_interaction_multimodal.py +++ b/tests/test_late_interaction_multimodal.py @@ -101,3 +101,22 @@ def test_embedding_size(): model_name = "Qdrant/ColPali-v1.3-fp16" model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True) assert model.embedding_size == 128 + + +def test_token_count() -> None: + if os.getenv("CI"): + pytest.skip("Colpali is too large to test in CI") + model_name = "Qdrant/colpali-v1.3-fp16" + model = LateInteractionMultimodalEmbedding(model_name=model_name, lazy_load=True) + + documents = ["short doc", "it is a long document to check attention mask for paddings"] + short_doc_token_count = model.token_count(documents[0]) + long_doc_token_count = model.token_count(documents[1]) + documents_token_count = model.token_count(documents) + assert short_doc_token_count + long_doc_token_count == documents_token_count + assert 
short_doc_token_count + long_doc_token_count == model.token_count( + documents, batch_size=1 + ) + assert short_doc_token_count + long_doc_token_count < model.token_count( + documents, include_extension=True + ) diff --git a/tests/test_sparse_embeddings.py b/tests/test_sparse_embeddings.py index 4c02a683..c2a7e2ff 100644 --- a/tests/test_sparse_embeddings.py +++ b/tests/test_sparse_embeddings.py @@ -298,3 +298,25 @@ def test_session_options(model_cache, model_name) -> None: model = SparseTextEmbedding(model_name=model_name, enable_cpu_mem_arena=False) session_options = model.model.model.get_session_options() assert session_options.enable_cpu_mem_arena is False + + +@pytest.mark.parametrize( + "model_name", + [ + "prithivida/Splade_PP_en_v1", + "Qdrant/minicoil-v1", + "Qdrant/bm42-all-minilm-l6-v2-attentions", + "Qdrant/bm25", + ], +) +def test_token_count(model_cache, model_name) -> None: + with model_cache(model_name) as model: + documents = [ + "Name me a couple of cities were the capitals of Germany?", + "Berlin is the current capital of Germany, Bonn is a former capital of Germany.", + ] + first_doc_token_count = model.token_count(documents[0]) + second_doc_token_count = model.token_count(documents[1]) + doc_token_count = model.token_count(documents) + assert first_doc_token_count + second_doc_token_count == doc_token_count + assert doc_token_count == model.token_count(documents, batch_size=1) diff --git a/tests/test_text_cross_encoder.py b/tests/test_text_cross_encoder.py index d23ee8ef..4d0d5b7d 100644 --- a/tests/test_text_cross_encoder.py +++ b/tests/test_text_cross_encoder.py @@ -124,6 +124,23 @@ def test_rerank_pairs_parallel(model_cache, model_name: str) -> None: ), f"Model: {model_name}, Scores (Parallel): {scores_parallel}, Expected: {canonical_scores}" +@pytest.mark.parametrize("model_name", ["Xenova/ms-marco-MiniLM-L-6-v2"]) +def test_token_count(model_cache, model_name: str) -> None: + with model_cache(model_name) as model: + pairs = [ + ("What is the capital of France?", "Paris is the capital of France."), + ( + "Name me a couple of cities were the capitals of Germany?", + "Berlin is the current capital of Germany, Bonn is a former capital of Germany.", + ), + ] + first_pair_token_count = model.token_count([pairs[0]]) + second_pair_token_count = model.token_count([pairs[1]]) + pairs_token_count = model.token_count(pairs) + assert first_pair_token_count + second_pair_token_count == pairs_token_count + assert pairs_token_count == model.token_count(pairs, batch_size=1) + + @pytest.mark.parametrize("model_name", ["Xenova/ms-marco-MiniLM-L-6-v2"]) def test_session_options(model_cache, model_name) -> None: with model_cache(model_name) as default_model: diff --git a/tests/test_text_onnx_embeddings.py b/tests/test_text_onnx_embeddings.py index 43e88ca8..e919faf9 100644 --- a/tests/test_text_onnx_embeddings.py +++ b/tests/test_text_onnx_embeddings.py @@ -203,3 +203,17 @@ def test_session_options(model_cache, model_name) -> None: model = TextEmbedding(model_name=model_name, enable_cpu_mem_arena=False) session_options = model.model.model.get_session_options() assert session_options.enable_cpu_mem_arena is False + + +@pytest.mark.parametrize("model_name", ["sentence-transformers/all-MiniLM-L6-v2"]) +def test_token_count(model_cache, model_name) -> None: + with model_cache(model_name) as model: + documents = [ + "Name me a couple of cities were the capitals of Germany?", + "Berlin is the current capital of Germany, Bonn is a former capital of Germany.", + ] + first_doc_token_count = 
model.token_count(documents[0]) + second_doc_token_count = model.token_count(documents[1]) + doc_token_count = model.token_count(documents) + assert first_doc_token_count + second_doc_token_count == doc_token_count + assert doc_token_count == model.token_count(documents, batch_size=1) From 21d4f457709383d9f7f47ed75f896d5ea97b75ed Mon Sep 17 00:00:00 2001 From: George Panchuk Date: Tue, 2 Dec 2025 00:36:57 +0700 Subject: [PATCH 2/4] fix: fix mypy --- fastembed/late_interaction/colbert.py | 4 +++- .../late_interaction/late_interaction_embedding_base.py | 3 +-- .../late_interaction/late_interaction_text_embedding.py | 7 ++++++- fastembed/late_interaction_multimodal/colpali.py | 4 +++- .../late_interaction_multimodal_embedding.py | 3 ++- .../late_interaction_multimodal_embedding_base.py | 3 +-- fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py | 6 ++++-- fastembed/rerank/cross_encoder/onnx_text_model.py | 6 ++++-- fastembed/rerank/cross_encoder/text_cross_encoder.py | 6 ++++-- fastembed/rerank/cross_encoder/text_cross_encoder_base.py | 2 +- fastembed/sparse/bm25.py | 2 +- fastembed/sparse/bm42.py | 6 ++++-- fastembed/sparse/minicoil.py | 6 ++++-- fastembed/sparse/sparse_embedding_base.py | 2 +- fastembed/sparse/sparse_text_embedding.py | 6 ++++-- fastembed/sparse/splade_pp.py | 6 ++++-- fastembed/text/onnx_embedding.py | 6 ++++-- fastembed/text/onnx_text_model.py | 7 +++++-- fastembed/text/text_embedding.py | 6 ++++-- fastembed/text/text_embedding_base.py | 2 +- 20 files changed, 61 insertions(+), 32 deletions(-) diff --git a/fastembed/late_interaction/colbert.py b/fastembed/late_interaction/colbert.py index 27dd0653..45bb4437 100644 --- a/fastembed/late_interaction/colbert.py +++ b/fastembed/late_interaction/colbert.py @@ -102,10 +102,12 @@ def token_count( batch_size: int = 1024, is_doc: bool = True, include_extension: bool = False, + **kwargs: Any, ) -> int: token_num = 0 - texts = texts if isinstance(texts, list) else [texts] + texts = [texts] if isinstance(texts, str) else texts tokenizer = self.tokenizer if is_doc else self.query_tokenizer + assert tokenizer is not None for batch in iter_batch(texts, batch_size): for tokens in tokenizer.encode_batch(batch): if is_doc: diff --git a/fastembed/late_interaction/late_interaction_embedding_base.py b/fastembed/late_interaction/late_interaction_embedding_base.py index bf1ec9ce..ec37c79b 100644 --- a/fastembed/late_interaction/late_interaction_embedding_base.py +++ b/fastembed/late_interaction/late_interaction_embedding_base.py @@ -74,8 +74,7 @@ def token_count( self, texts: Union[str, Iterable[str]], batch_size: int = 1024, - is_doc: bool = True, - include_extension: bool = False, + **kwargs: Any, ) -> int: """Returns the number of tokens in the texts.""" raise NotImplementedError("Subclasses must implement this method") diff --git a/fastembed/late_interaction/late_interaction_text_embedding.py b/fastembed/late_interaction/late_interaction_text_embedding.py index ea5b22e7..482a4331 100644 --- a/fastembed/late_interaction/late_interaction_text_embedding.py +++ b/fastembed/late_interaction/late_interaction_text_embedding.py @@ -158,6 +158,7 @@ def token_count( batch_size: int = 1024, is_doc: bool = True, include_extension: bool = False, + **kwargs: Any, ) -> int: """Returns the number of tokens in the texts. @@ -171,5 +172,9 @@ def token_count( int: Sum of number of tokens in the texts. 
""" return self.model.token_count( - texts, batch_size=batch_size, is_doc=is_doc, include_extension=include_extension + texts, + batch_size=batch_size, + is_doc=is_doc, + include_extension=include_extension, + **kwargs, ) diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py index a676076c..d5ce1983 100644 --- a/fastembed/late_interaction_multimodal/colpali.py +++ b/fastembed/late_interaction_multimodal/colpali.py @@ -177,9 +177,11 @@ def token_count( texts: Union[str, Iterable[str]], batch_size: int = 1024, include_extension: bool = False, + **kwargs: Any, ) -> int: token_num = 0 - texts = texts if isinstance(texts, list) else [texts] + texts = [texts] if isinstance(texts, str) else texts + assert self.tokenizer is not None tokenize_func = self.tokenize if include_extension else self.tokenizer.encode_batch for batch in iter_batch(texts, batch_size): token_num += sum([sum(encoding.attention_mask) for encoding in tokenize_func(batch)]) diff --git a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py index 0e4c9263..01a57294 100644 --- a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py +++ b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py @@ -168,6 +168,7 @@ def token_count( texts: Union[str, Iterable[str]], batch_size: int = 1024, include_extension: bool = False, + **kwargs: Any, ) -> int: """Returns the number of tokens in the texts. @@ -180,5 +181,5 @@ def token_count( int: Sum of number of tokens in the texts. """ return self.model.token_count( - texts, batch_size=batch_size, include_extension=include_extension + texts, batch_size=batch_size, include_extension=include_extension, **kwargs ) diff --git a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py index f813cec6..0d148ce4 100644 --- a/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py +++ b/fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py @@ -80,8 +80,7 @@ def embedding_size(self) -> int: def token_count( self, texts: Union[str, Iterable[str]], - batch_size: int = 1024, - include_extension: bool = False, + **kwargs: Any, ) -> int: """Returns the number of tokens in the texts.""" raise NotImplementedError("Subclasses must implement this method") diff --git a/fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py b/fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py index 6668ad4e..4a91a010 100644 --- a/fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py +++ b/fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py @@ -207,7 +207,9 @@ def _post_process_onnx_output( ) -> Iterable[float]: return (float(elem) for elem in output.model_output) - def token_count(self, pairs: Iterable[tuple[str, str]], batch_size: int = 1024) -> int: + def token_count( + self, pairs: Iterable[tuple[str, str]], batch_size: int = 1024, **kwargs: Any + ) -> int: """Returns the number of tokens in the pairs. 
 
         Args:
@@ -217,7 +219,7 @@ def token_count(self, pairs: Iterable[tuple[str, str]], batch_size: int = 1024)
         Returns:
             token count: overall number of tokens in the pairs
         """
-        return self._token_count(pairs, batch_size=batch_size)
+        return self._token_count(pairs, batch_size=batch_size, **kwargs)
 
 
 class TextCrossEncoderWorker(TextRerankerWorker):
diff --git a/fastembed/rerank/cross_encoder/onnx_text_model.py b/fastembed/rerank/cross_encoder/onnx_text_model.py
index 34489e6b..6253185c 100644
--- a/fastembed/rerank/cross_encoder/onnx_text_model.py
+++ b/fastembed/rerank/cross_encoder/onnx_text_model.py
@@ -165,9 +165,11 @@ def _preprocess_onnx_input(
         """
         return onnx_input
 
-    def _token_count(self, pairs: Iterable[tuple[str, str]], batch_size: int = 1024) -> int:
+    def _token_count(
+        self, pairs: Iterable[tuple[str, str]], batch_size: int = 1024, **_: Any
+    ) -> int:
         token_num = 0
-
+        assert self.tokenizer is not None
         for batch in iter_batch(pairs, batch_size):
             for tokens in self.tokenizer.encode_batch(batch):
                 token_num += sum(tokens.attention_mask)
diff --git a/fastembed/rerank/cross_encoder/text_cross_encoder.py b/fastembed/rerank/cross_encoder/text_cross_encoder.py
index ce2869db..c6182084 100644
--- a/fastembed/rerank/cross_encoder/text_cross_encoder.py
+++ b/fastembed/rerank/cross_encoder/text_cross_encoder.py
@@ -162,7 +162,9 @@ def add_custom_model(
             )
         )
 
-    def token_count(self, pairs: Iterable[tuple[str, str]], batch_size: int = 1024) -> int:
+    def token_count(
+        self, pairs: Iterable[tuple[str, str]], batch_size: int = 1024, **kwargs: Any
+    ) -> int:
         """Returns the number of tokens in the pairs.
 
         Args:
@@ -172,4 +174,4 @@ def token_count(self, pairs: Iterable[tuple[str, str]], batch_size: int = 1024)
         Returns:
             token count: overall number of tokens in the pairs
         """
-        return self.model.token_count(pairs, batch_size=batch_size)
+        return self.model.token_count(pairs, batch_size=batch_size, **kwargs)
diff --git a/fastembed/rerank/cross_encoder/text_cross_encoder_base.py b/fastembed/rerank/cross_encoder/text_cross_encoder_base.py
index 42fdea71..7baffd0e 100644
--- a/fastembed/rerank/cross_encoder/text_cross_encoder_base.py
+++ b/fastembed/rerank/cross_encoder/text_cross_encoder_base.py
@@ -58,6 +58,6 @@ def rerank_pairs(
         """
         raise NotImplementedError("This method should be overridden by subclasses")
 
-    def token_count(self, pairs: Iterable[tuple[str, str]], batch_size: int = 1024) -> int:
+    def token_count(self, pairs: Iterable[tuple[str, str]], **kwargs: Any) -> int:
         """Returns the number of tokens in the pairs."""
         raise NotImplementedError("This method should be overridden by subclasses")
diff --git a/fastembed/sparse/bm25.py b/fastembed/sparse/bm25.py
index d2b405ad..8265a621 100644
--- a/fastembed/sparse/bm25.py
+++ b/fastembed/sparse/bm25.py
@@ -270,7 +270,7 @@ def raw_embed(
 
     def token_count(self, texts: Union[str, Iterable[str]], **kwargs: Any) -> int:
         token_num = 0
-        texts = texts if isinstance(texts, list) else [texts]
+        texts = [texts] if isinstance(texts, str) else texts
         for text in texts:
             document = remove_non_alphanumeric(text)
             tokens = self.tokenizer.tokenize(document)
diff --git a/fastembed/sparse/bm42.py b/fastembed/sparse/bm42.py
index 4a9103d7..9e1a089e 100644
--- a/fastembed/sparse/bm42.py
+++ b/fastembed/sparse/bm42.py
@@ -352,8 +352,10 @@ def query_embed(
     def _get_worker_class(cls) -> Type[TextEmbeddingWorker[SparseEmbedding]]:
         return Bm42TextEmbeddingWorker
 
-    def token_count(self, texts: Union[str, Iterable[str]], batch_size: int = 1024) -> int:
-        return self._token_count(texts, batch_size=batch_size)
+    def token_count(
+        self, texts: Union[str, Iterable[str]], batch_size: int = 1024, **kwargs: Any
+    ) -> int:
+        return self._token_count(texts, batch_size=batch_size, **kwargs)
 
 
 class Bm42TextEmbeddingWorker(TextEmbeddingWorker[SparseEmbedding]):
diff --git a/fastembed/sparse/minicoil.py b/fastembed/sparse/minicoil.py
index be0c0638..04d74793 100644
--- a/fastembed/sparse/minicoil.py
+++ b/fastembed/sparse/minicoil.py
@@ -187,8 +187,10 @@ def load_onnx_model(self) -> None:
             avg_len=self.avg_len,
         )
 
-    def token_count(self, texts: Union[str, Iterable[str]], batch_size: int = 1024) -> int:
-        return self._token_count(texts, batch_size=batch_size)
+    def token_count(
+        self, texts: Union[str, Iterable[str]], batch_size: int = 1024, **kwargs: Any
+    ) -> int:
+        return self._token_count(texts, batch_size=batch_size, **kwargs)
 
     def embed(
         self,
diff --git a/fastembed/sparse/sparse_embedding_base.py b/fastembed/sparse/sparse_embedding_base.py
index ae8c3968..47026f65 100644
--- a/fastembed/sparse/sparse_embedding_base.py
+++ b/fastembed/sparse/sparse_embedding_base.py
@@ -87,6 +87,6 @@ def query_embed(
         else:
             yield from self.embed(query, **kwargs)
 
-    def token_count(self, texts: Union[str, Iterable[str]], batch_size: int = 1024) -> int:
+    def token_count(self, texts: Union[str, Iterable[str]], **kwargs: Any) -> int:
         """Returns the number of tokens in the texts."""
         raise NotImplementedError("Subclasses must implement this method")
diff --git a/fastembed/sparse/sparse_text_embedding.py b/fastembed/sparse/sparse_text_embedding.py
index 0d8a2153..6f51f69e 100644
--- a/fastembed/sparse/sparse_text_embedding.py
+++ b/fastembed/sparse/sparse_text_embedding.py
@@ -129,7 +129,9 @@ def query_embed(
         """
         yield from self.model.query_embed(query, **kwargs)
 
-    def token_count(self, texts: Union[str, Iterable[str]], batch_size: int = 1024) -> int:
+    def token_count(
+        self, texts: Union[str, Iterable[str]], batch_size: int = 1024, **kwargs: Any
+    ) -> int:
         """Returns the number of tokens in the texts.
 
         Args:
@@ -139,4 +141,4 @@ def token_count(self, texts: Union[str, Iterable[str]], batch_size: int = 1024)
         Returns:
             int: Sum of number of tokens in the texts.
""" - return self.model.token_count(texts, batch_size=batch_size) + return self.model.token_count(texts, batch_size=batch_size, **kwargs) diff --git a/fastembed/sparse/splade_pp.py b/fastembed/sparse/splade_pp.py index 32497d65..8480cb10 100644 --- a/fastembed/sparse/splade_pp.py +++ b/fastembed/sparse/splade_pp.py @@ -53,8 +53,10 @@ def _post_process_onnx_output( scores = row_scores[indices] yield SparseEmbedding(values=scores, indices=indices) - def token_count(self, texts: Union[str, Iterable[str]], batch_size: int = 1024) -> int: - return self._token_count(texts, batch_size=batch_size) + def token_count( + self, texts: Union[str, Iterable[str]], batch_size: int = 1024, **kwargs: Any + ) -> int: + return self._token_count(texts, batch_size=batch_size, **kwargs) @classmethod def _list_supported_models(cls) -> list[SparseModelDescription]: diff --git a/fastembed/text/onnx_embedding.py b/fastembed/text/onnx_embedding.py index a9c26736..2e3fc7d2 100644 --- a/fastembed/text/onnx_embedding.py +++ b/fastembed/text/onnx_embedding.py @@ -331,8 +331,10 @@ def load_onnx_model(self) -> None: extra_session_options=self._extra_session_options, ) - def token_count(self, texts: Union[str, Iterable[str]], batch_size: int = 1024) -> int: - return self._token_count(texts, batch_size=batch_size) + def token_count( + self, texts: Union[str, Iterable[str]], batch_size: int = 1024, **kwargs: Any + ) -> int: + return self._token_count(texts, batch_size=batch_size, **kwargs) class OnnxTextEmbeddingWorker(TextEmbeddingWorker[NumpyArray]): diff --git a/fastembed/text/onnx_text_model.py b/fastembed/text/onnx_text_model.py index 4a49656e..c07951bc 100644 --- a/fastembed/text/onnx_text_model.py +++ b/fastembed/text/onnx_text_model.py @@ -159,9 +159,12 @@ def _embed_documents( for batch in pool.ordered_map(iter_batch(documents, batch_size), **params): yield from self._post_process_onnx_output(batch, **kwargs) # type: ignore - def _token_count(self, texts: Union[str, Iterable[str]], batch_size: int = 1024) -> int: + def _token_count( + self, texts: Union[str, Iterable[str]], batch_size: int = 1024, **_: Any + ) -> int: token_num = 0 - texts = texts if isinstance(texts, list) else [texts] + assert self.tokenizer is not None + texts = [texts] if isinstance(texts, str) else texts for batch in iter_batch(texts, batch_size): for tokens in self.tokenizer.encode_batch(batch): token_num += sum(tokens.attention_mask) diff --git a/fastembed/text/text_embedding.py b/fastembed/text/text_embedding.py index 576b6eb5..0c58a7f3 100644 --- a/fastembed/text/text_embedding.py +++ b/fastembed/text/text_embedding.py @@ -213,7 +213,9 @@ def passage_embed(self, texts: Iterable[str], **kwargs: Any) -> Iterable[NumpyAr # This is model-specific, so that different models can have specialized implementations yield from self.model.passage_embed(texts, **kwargs) - def token_count(self, texts: Union[str, Iterable[str]], batch_size: int = 1024) -> int: + def token_count( + self, texts: Union[str, Iterable[str]], batch_size: int = 1024, **kwargs: Any + ) -> int: """Returns the number of tokens in the texts. Args: @@ -223,4 +225,4 @@ def token_count(self, texts: Union[str, Iterable[str]], batch_size: int = 1024) Returns: int: Sum of number of tokens in the texts. 
""" - return self.model.token_count(texts, batch_size=batch_size) + return self.model.token_count(texts, batch_size=batch_size, **kwargs) diff --git a/fastembed/text/text_embedding_base.py b/fastembed/text/text_embedding_base.py index f0a1d135..a11ecae0 100644 --- a/fastembed/text/text_embedding_base.py +++ b/fastembed/text/text_embedding_base.py @@ -70,6 +70,6 @@ def embedding_size(self) -> int: """Returns embedding size for the current model""" raise NotImplementedError("Subclasses must implement this method") - def token_count(self, texts: Union[str, Iterable[str]], batch_size: int = 1024) -> int: + def token_count(self, texts: Union[str, Iterable[str]], **kwargs: Any) -> int: """Returns the number of tokens in the texts.""" raise NotImplementedError("Subclasses must implement this method") From 9e3b3192a5ab17b50980b9d3ff423ffa369800bf Mon Sep 17 00:00:00 2001 From: George Panchuk Date: Tue, 2 Dec 2025 00:45:05 +0700 Subject: [PATCH 3/4] fix: load model in token_count --- fastembed/late_interaction/colbert.py | 2 ++ fastembed/late_interaction_multimodal/colpali.py | 2 ++ fastembed/rerank/cross_encoder/onnx_text_model.py | 3 +++ fastembed/sparse/bm42.py | 2 ++ fastembed/text/onnx_text_model.py | 3 +++ 5 files changed, 12 insertions(+) diff --git a/fastembed/late_interaction/colbert.py b/fastembed/late_interaction/colbert.py index 45bb4437..4ea5212e 100644 --- a/fastembed/late_interaction/colbert.py +++ b/fastembed/late_interaction/colbert.py @@ -104,6 +104,8 @@ def token_count( include_extension: bool = False, **kwargs: Any, ) -> int: + if not hasattr(self, "model") or self.model is None: + self.load_onnx_model() # loads the tokenizer as well token_num = 0 texts = [texts] if isinstance(texts, str) else texts tokenizer = self.tokenizer if is_doc else self.query_tokenizer diff --git a/fastembed/late_interaction_multimodal/colpali.py b/fastembed/late_interaction_multimodal/colpali.py index d5ce1983..7d0218fe 100644 --- a/fastembed/late_interaction_multimodal/colpali.py +++ b/fastembed/late_interaction_multimodal/colpali.py @@ -179,6 +179,8 @@ def token_count( include_extension: bool = False, **kwargs: Any, ) -> int: + if not hasattr(self, "model") or self.model is None: + self.load_onnx_model() # loads the tokenizer as well token_num = 0 texts = [texts] if isinstance(texts, str) else texts assert self.tokenizer is not None diff --git a/fastembed/rerank/cross_encoder/onnx_text_model.py b/fastembed/rerank/cross_encoder/onnx_text_model.py index 6253185c..801c60dc 100644 --- a/fastembed/rerank/cross_encoder/onnx_text_model.py +++ b/fastembed/rerank/cross_encoder/onnx_text_model.py @@ -168,6 +168,9 @@ def _preprocess_onnx_input( def _token_count( self, pairs: Iterable[tuple[str, str]], batch_size: int = 1024, **_: Any ) -> int: + if not hasattr(self, "model") or self.model is None: + self.load_onnx_model() # loads the tokenizer as well + token_num = 0 assert self.tokenizer is not None for batch in iter_batch(pairs, batch_size): diff --git a/fastembed/sparse/bm42.py b/fastembed/sparse/bm42.py index 9e1a089e..536ba61e 100644 --- a/fastembed/sparse/bm42.py +++ b/fastembed/sparse/bm42.py @@ -355,6 +355,8 @@ def _get_worker_class(cls) -> Type[TextEmbeddingWorker[SparseEmbedding]]: def token_count( self, texts: Union[str, Iterable[str]], batch_size: int = 1024, **kwargs: Any ) -> int: + if not hasattr(self, "model") or self.model is None: + self.load_onnx_model() # loads the tokenizer as well return self._token_count(texts, batch_size=batch_size, **kwargs) diff --git 
a/fastembed/text/onnx_text_model.py b/fastembed/text/onnx_text_model.py index c07951bc..16dd6946 100644 --- a/fastembed/text/onnx_text_model.py +++ b/fastembed/text/onnx_text_model.py @@ -162,6 +162,9 @@ def _embed_documents( def _token_count( self, texts: Union[str, Iterable[str]], batch_size: int = 1024, **_: Any ) -> int: + if not hasattr(self, "model") or self.model is None: + self.load_onnx_model() # loads the tokenizer as well + token_num = 0 assert self.tokenizer is not None texts = [texts] if isinstance(texts, str) else texts From 5e9a4260ad7fe4f695b9c39a3fe5572fe000f618 Mon Sep 17 00:00:00 2001 From: George Panchuk Date: Tue, 2 Dec 2025 00:45:37 +0700 Subject: [PATCH 4/4] fix: remove debug code --- fastembed/late_interaction/colbert.py | 34 --------------------------- 1 file changed, 34 deletions(-) diff --git a/fastembed/late_interaction/colbert.py b/fastembed/late_interaction/colbert.py index 4ea5212e..6545a5ab 100644 --- a/fastembed/late_interaction/colbert.py +++ b/fastembed/late_interaction/colbert.py @@ -298,37 +298,3 @@ def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> Colb threads=1, **kwargs, ) - - -if __name__ == "__main__": - te = Colbert("answerdotai/answerai-colbert-small-v1") - - print( - te.token_count( - texts=[ - "qwe", - "adsda ads qwe dasd dsad cxc czx as qw er tr y fg s a x b h f d a s w e t y k,l; czcx asd ", - ] - ) - ) - print( - te.token_count( - texts=[ - "qwe", - "adsda ads qwe dasd dsad cxc czx as qw er tr y fg s a x b h f d a s w e t y k,l; czcx asd ", - ], - is_doc=False, - ) - ) - # data = [] - # with open('../../training_data.csv', 'r') as f: - # for i, line in enumerate(f): - # - # if i == 0: - # continue - # - # data.append(line.rsplit(',', maxsplit=1)[0][1:-1]) - # import time - # a = time.perf_counter() - # te.token_count(data, batch_size=1024) - # print(time.perf_counter() - a)
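
A short usage sketch of the token_count API this series introduces. This is illustrative only and not part of the patches: the import paths follow fastembed's public package layout, the model names are the ones used in the tests above, and the first call downloads each model.

    from fastembed import LateInteractionTextEmbedding, TextEmbedding
    from fastembed.rerank.cross_encoder import TextCrossEncoder

    # Dense text model: token_count sums attention-mask tokens over all texts.
    dense = TextEmbedding("sentence-transformers/all-MiniLM-L6-v2")
    print(dense.token_count(["short doc", "a slightly longer document"]))

    # ColBERT-style model: is_doc=False switches to the query tokenizer;
    # include_extension=True additionally counts the marker token per text
    # (and, for queries, the [MASK] padding up to the minimum query length).
    colbert = LateInteractionTextEmbedding("answerdotai/answerai-colbert-small-v1")
    print(colbert.token_count("what is fastembed?", is_doc=False, include_extension=True))

    # Cross-encoder: counts tokens over (query, document) pairs.
    reranker = TextCrossEncoder("Xenova/ms-marco-MiniLM-L-6-v2")
    print(reranker.token_count([("What is the capital of France?", "Paris is the capital of France.")]))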