
Commit 7271863

Authored by rmitsch (Raphael Mitsch) and a co-author
chore: Change classification multi_label bool to mode: Literal["multi", "single"]. (#244)
Co-authored-by: Raphael Mitsch <raphael@climatiq.com>
1 parent 3fd21fe · commit 7271863

File tree

11 files changed: +64 / -61 lines

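For orientation, the call-site change looks like this. A minimal sketch, assuming `tasks` is imported from sieves as in the demo scripts and `model` is an already-configured model object (both placeholders, not part of this commit):

```python
from sieves import tasks  # assumed import path, as used in the demos

# `model` is assumed to be configured elsewhere.

# Before this commit:
# classifier = tasks.Classification(labels=["spam", "not spam"], multi_label=False, model=model)

# After this commit, the boolean flag becomes an explicit mode literal:
classifier = tasks.Classification(
    labels=["spam", "not spam"],
    mode="single",  # "multi" (the default) returns confidence scores for all labels
    model=model,
)
```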

demos/crisis_tweets/case_study.py

Lines changed: 2 additions & 2 deletions
@@ -220,7 +220,7 @@ def _(batch_size, data_sampled, model):
     crisis_label_classifier = tasks.Classification(
         task_id="crisis_label_classifier",
         labels=data_sampled.label.unique(),
-        multi_label=False,
+        mode='single',
         model=model,
         batch_size=batch_size,
     )
@@ -249,7 +249,7 @@ def related_to_crisis(doc: Doc) -> bool:
     crisis_type_classifier = tasks.Classification(
         task_id="crisis_type_classifier",
         labels=data_sampled.crisis_type.unique(),
-        multi_label=False,
+        mode='single',
         model=model,
         condition=related_to_crisis,
         batch_size=batch_size,

demos/demo_spam.py

Lines changed: 1 addition & 1 deletion
@@ -141,7 +141,7 @@ def _(model):

     classifier = tasks.Classification(
         labels=["spam", "not spam"],
-        multi_label=False,
+        mode='single',
         model=model,
     )
     summarizer = tasks.Summarization(n_words=10, model=model)

docs/guides/distillation.md

Lines changed: 1 addition & 1 deletion
@@ -291,7 +291,7 @@ The distillation process automatically handles both classification modes:
 task = Classification(
     labels=["technology", "politics", "sports"],
     model=model,
-    multi_label=False,
+    mode='single',
 )
 ```

docs/tasks/predictive/classification.md

Lines changed: 2 additions & 2 deletions
@@ -18,8 +18,8 @@ The `Classification` task returns a unified result schema regardless of the mode
 --8<-- "sieves/tasks/predictive/schemas/classification.py:Result"
 ```

-- When `multi_label=True` (default): results are of type `ResultMultiLabel`, containing a list of `(label, score)` tuples.
-- When `multi_label=False`: results are of type `ResultSingleLabel`, containing a single `label` and `score`.
+- When `mode == 'multi'` (default): results are of type `ResultMultiLabel`, containing a list of `(label, score)` tuples.
+- When `mode == 'single'`: results are of type `ResultSingleLabel`, containing a single `label` and `score`.

 ---
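
To make the result-type distinction concrete, here is a hedged sketch of reading a processed document under each mode. It assumes `doc` has already been run through a `Classification` task registered under the hypothetical task id "classifier", and uses only the fields named in the bullet points above:

```python
# Assumed: `doc` was processed by a Classification task with task_id="classifier".
result = doc.results["classifier"]

if hasattr(result, "label_scores"):
    # mode == 'multi' (default): ResultMultiLabel holds (label, score) tuples for every label.
    for label, score in result.label_scores:
        print(label, score)
else:
    # mode == 'single': ResultSingleLabel holds a single label and its score.
    print(result.label, result.score)
```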

sieves/model_wrappers/huggingface_.py

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@ def execute(values: Sequence[dict[str, Any]]) -> Sequence[tuple[Result | None, A
     sequences=[doc_values["text"] for doc_values in values],
     candidate_labels=prompt_signature,
     hypothesis_template=template,
-    multi_label=True,
+    mode="multi",
     **self._inference_kwargs,
 )

sieves/tasks/predictive/classification/bridges.py

Lines changed: 21 additions & 21 deletions
@@ -34,15 +34,15 @@ def __init__(
     task_id: str,
     prompt_instructions: str | None,
     labels: list[str] | dict[str, str],
-    multi_label: bool,
+    mode: Literal["single", "multi"],
     model_settings: ModelSettings,
 ):
     """Initialize ClassificationBridge.

     :param task_id: Task ID.
     :param prompt_instructions: Custom prompt instructions. If None, default instructions are used.
     :param labels: Labels to classify. Can be a list of label strings, or a dict mapping labels to descriptions.
-    :param multi_label: If True, task returns confidence scores for all specified labels. If False, task returns
+    :param mode: If 'multi'', task returns confidence scores for all specified labels. If 'single', task returns
         most likely class label. In the latter case label forcing mechanisms are utilized, which can lead to higher
         accuracy.
     :param model_settings: Model settings.
@@ -59,7 +59,7 @@ def __init__(
     else:
         self._labels = labels
         self._label_descriptions = {}
-    self._multi_label = multi_label
+    self._mode = mode

 def _get_label_descriptions(self) -> str:
     """Return a string with the label descriptions.
@@ -87,7 +87,7 @@ class DSPyClassification(ClassificationBridge[dspy_.PromptSignature, dspy_.Resul
 @override
 @property
 def _default_prompt_instructions(self) -> str:
-    if self._multi_label:
+    if self._mode == "multi":
         return f"""
         Multi-label classification of the provided text given the labels {self._labels}.
         For each label, provide the confidence with which you believe that the provided text should be assigned
@@ -121,7 +121,7 @@ def prompt_signature(self) -> type[dspy_.PromptSignature]:
 labels = self._labels
 LabelType = Literal[*labels] # type: ignore[valid-type]

-if self._multi_label:
+if self._mode == "multi":

     class MultiLabelTextClassification(dspy.Signature): # type: ignore[misc]
         text: str = dspy.InputField(description="Text to classify.")
@@ -164,7 +164,7 @@ def integrate(self, results: Sequence[dspy_.Result], docs: list[Doc]) -> list[Do
     reverse=True,
 )

-if self._multi_label:
+if self._mode == "multi":
     doc.results[self._task_id] = ResultMultiLabel(label_scores=sorted_preds)
 else:
     if isinstance(sorted_preds, list) and len(sorted_preds) > 0:
@@ -188,7 +188,7 @@ def consolidate(

 # Clamp score to range between 0 and 1. Alternatively we could force this in the prompt signature,
 # but this fails occasionally with some models and feels too strict.
-if self._multi_label:
+if self._mode == "multi":
     for label, score in res.confidence_per_label.items():
         label_scores[label] += max(0, min(score, 1))
 else:
@@ -228,7 +228,7 @@ def _default_prompt_instructions(self) -> str:
 @override
 @property
 def _prompt_example_template(self) -> str | None:
-    if self._multi_label:
+    if self._mode == "multi":
         return """
         {% if examples|length > 0 -%}
@@ -285,7 +285,7 @@ def inference_mode(self) -> huggingface_.InferenceMode:
 def integrate(self, results: Sequence[huggingface_.Result], docs: list[Doc]) -> list[Doc]:
     for doc, result in zip(docs, results):
         label_scores = [(label, score) for label, score in zip(result["labels"], result["scores"])]
-        if self._multi_label:
+        if self._mode == "multi":
             doc.results[self._task_id] = ResultMultiLabel(label_scores=label_scores)
         else:
             if len(label_scores) > 0:
@@ -333,7 +333,7 @@ class PydanticBasedClassification(
 @override
 @property
 def _default_prompt_instructions(self) -> str:
-    if self._multi_label:
+    if self._mode == "multi":
         return (
             f"""
             Perform multi-label classification of the provided text given the provided labels: {",".join(self._labels)}.
@@ -369,7 +369,7 @@ def _default_prompt_instructions(self) -> str:
 @override
 @property
 def _prompt_example_template(self) -> str | None:
-    if self._multi_label:
+    if self._mode == "multi":
         return """
         {% if examples|length > 0 -%}
         Examples:
@@ -417,7 +417,7 @@ def _prompt_conclusion(self) -> str | None:
 @override
 @cached_property
 def prompt_signature(self) -> type[pydantic.BaseModel] | list[str]:
-    if self._multi_label:
+    if self._mode == "multi":
         prompt_sig = pydantic.create_model( # type: ignore[no-matching-overload]
             "MultilabelClassification",
             __base__=pydantic.BaseModel,
@@ -442,7 +442,7 @@ class SingleLabelClassification(pydantic.BaseModel):
 @override
 def integrate(self, results: Sequence[pydantic.BaseModel | str], docs: list[Doc]) -> list[Doc]:
     for doc, result in zip(docs, results):
-        if self._multi_label:
+        if self._mode == "multi":
             assert isinstance(result, pydantic.BaseModel)
             label_scores = result.model_dump()
             sorted_label_scores = sorted(
@@ -471,7 +471,7 @@ def consolidate(

 # We clamp the score to 0 <= x <= 1. Alternatively we could force this in the prompt signature, but
 # this fails occasionally with some models and feels too strict.
-if self._multi_label:
+if self._mode == "multi":
     for label in self._labels:
         label_scores[label] += max(0, min(getattr(res, label), 1))
 else:
@@ -482,7 +482,7 @@ def consolidate(
 assert issubclass(prompt_signature, pydantic.BaseModel) # type: ignore[arg-type]
 assert callable(prompt_signature)

-if self._multi_label:
+if self._mode == "multi":
     consolidated_results.append(prompt_signature(**avg_label_scores))
 else:
     max_score_label = max(avg_label_scores, key=avg_label_scores.__getitem__)
@@ -510,12 +510,12 @@ class PydanticBasedClassificationWithLabelForcing(PydanticBasedClassification[Mo
 @override
 @cached_property
 def prompt_signature(self) -> type[pydantic.BaseModel] | list[str]:
-    return super().prompt_signature if self._multi_label else self._labels
+    return super().prompt_signature if self._mode == "multi" else self._labels

 @override
 @property
 def _default_prompt_instructions(self) -> str:
-    if self._multi_label:
+    if self._mode == "multi":
         return super()._default_prompt_instructions

     return f"""
@@ -534,7 +534,7 @@ def _default_prompt_instructions(self) -> str:
 @override
 @property
 def _prompt_example_template(self) -> str | None:
-    if self._multi_label:
+    if self._mode == "multi":
         return super()._prompt_example_template

     return """
@@ -555,7 +555,7 @@ def _prompt_example_template(self) -> str | None:

 @override
 def integrate(self, results: Sequence[pydantic.BaseModel | str], docs: list[Doc]) -> list[Doc]:
-    if self._multi_label:
+    if self._mode == "multi":
         return super().integrate(results, docs)

     for doc, result in zip(docs, results):
@@ -572,7 +572,7 @@ def integrate(self, results: Sequence[pydantic.BaseModel | str], docs: list[Doc]
 def consolidate(
     self, results: Sequence[pydantic.BaseModel | str], docs_offsets: list[tuple[int, int]]
 ) -> Sequence[pydantic.BaseModel | str]:
-    if self._multi_label:
+    if self._mode == "multi":
         return super().consolidate(results, docs_offsets)

     else:
@@ -592,5 +592,5 @@ class OutlinesClassification(PydanticBasedClassificationWithLabelForcing[outline
 @property
 def inference_mode(self) -> outlines_.InferenceMode:
     return self._model_settings.inference_mode or (
-        outlines_.InferenceMode.json if self._multi_label else outlines_.InferenceMode.choice
+        outlines_.InferenceMode.json if self._mode == "multi" else outlines_.InferenceMode.choice
     )

sieves/tasks/predictive/classification/core.py

Lines changed: 15 additions & 17 deletions
@@ -5,7 +5,7 @@
 import json
 from collections.abc import Callable, Iterable, Sequence
 from pathlib import Path
-from typing import Any, override
+from typing import Any, Literal, override

 import datasets
 import dspy
@@ -74,7 +74,7 @@ def __init__(
     batch_size: int = -1,
     prompt_instructions: str | None = None,
     fewshot_examples: Sequence[FewshotExample] = (),
-    multi_label: bool = True,
+    mode: Literal["single", "multi"] = "multi",
     model_settings: ModelSettings = ModelSettings(),
     condition: Callable[[Doc], bool] | None = None,
 ) -> None:
@@ -90,7 +90,7 @@ def __init__(
 :param batch_size: Batch size to use for inference. Use -1 to process all documents at once.
 :param prompt_instructions: Custom prompt instructions. If None, default instructions are used.
 :param fewshot_examples: Few-shot examples.
-:param multi_label: If True, task returns confidence scores for all specified labels. If False, task returns
+:param mode: If 'multi', task returns confidence scores for all specified labels. If 'single', task returns
     most likely class label. In the latter case label forcing mechanisms are utilized, which can lead to higher
     accuracy.
 :param model_settings: Model settings.
@@ -102,7 +102,7 @@ def __init__(
 else:
     self._labels = list(labels)
     self._label_descriptions = {}
-self._multi_label = multi_label
+self._mode = mode

 super().__init__(
     model=model,
@@ -137,7 +137,7 @@ def _init_bridge(self, model_type: ModelType) -> _TaskBridge:
 prompt_signature=gliner2.inference.engine.Schema().classification(
     task="classification",
     labels=labels,
-    multi_label=self._multi_label,
+    mode=self._mode,
 ),
 model_settings=self._model_settings,
 inference_mode=gliner_.InferenceMode.classification,
@@ -158,7 +158,7 @@ def _init_bridge(self, model_type: ModelType) -> _TaskBridge:
     task_id=self._task_id,
     prompt_instructions=self._custom_prompt_instructions,
     labels=labels,
-    multi_label=self._multi_label,
+    mode=self._mode,
     model_settings=self._model_settings,
 )
 except KeyError as err:
@@ -179,12 +179,12 @@ def _validate_fewshot_examples(self) -> None:
 label_error_text = (
     "Label mismatch: {task_id} has labels {labels}. Few-shot examples have labels {example_labels}."
 )
-example_type_error_text = "Fewshot example type mismatch: multi_label = {multi_label} requires {example_type}."
+example_type_error_text = "Fewshot example type mismatch: mode = {mode} requires {example_type}."

 for fs_example in self._fewshot_examples or []:
-    if self._multi_label:
+    if self._mode == "multi":
         assert isinstance(fs_example, FewshotExampleMultiLabel), TypeError(
-            example_type_error_text.format(example_type=FewshotExampleMultiLabel, multi_label=self._multi_label)
+            example_type_error_text.format(example_type=FewshotExampleMultiLabel, mode=self._mode)
         )
         if any([label not in self._labels for label in fs_example.confidence_per_label]) or not all(
             [label in fs_example.confidence_per_label for label in self._labels]
@@ -196,9 +196,7 @@ def _validate_fewshot_examples(self) -> None:
     )
 else:
     assert isinstance(fs_example, FewshotExampleSingleLabel), TypeError(
-        example_type_error_text.format(
-            example_type=FewshotExampleSingleLabel, multi_label=self._multi_label
-        )
+        example_type_error_text.format(example_type=FewshotExampleSingleLabel, mode=self._mode)
     )
     if fs_example.label not in self._labels:
         raise ValueError(
@@ -283,7 +281,7 @@ def distill(
 default_init_kwargs: dict[str, Any] = {}
 metric_kwargs: dict[str, Any] = {}

-if self._multi_label:
+if self._mode == "multi":
     default_init_kwargs["multi_target_strategy"] = "multi-output"
     metric_kwargs = {"average": "macro"}

@@ -369,7 +367,7 @@ def to_hf_dataset(self, docs: Iterable[Doc], threshold: float = 0.5) -> datasets
 data: list[dict[str, str | list[bool]]] = []

 # Define metadata and features (multi-hot across declared labels for multi-label).
-if self._multi_label:
+if self._mode == "multi":
     features = datasets.Features(
         {"text": datasets.Value("string"), "labels": datasets.Sequence(datasets.Value("bool"))}
     )
@@ -380,7 +378,7 @@ def to_hf_dataset(self, docs: Iterable[Doc], threshold: float = 0.5) -> datasets

 info = datasets.DatasetInfo(
     description=(
-        f"{'Multi-label' if self._multi_label else 'Single-label'} classification dataset with labels "
+        f"{'Multi-label' if self._mode == 'multi' else 'Single-label'} classification dataset with labels "
         f"{self._labels}. Generated with sieves v{Config.get_version()}."
     ),
     features=features,
@@ -391,7 +389,7 @@ def to_hf_dataset(self, docs: Iterable[Doc], threshold: float = 0.5) -> datasets
 scores = Classification._result_to_scores(doc.results[self._task_id])

 # If multi-label: store one-hot representation.
-if self._multi_label:
+if self._mode == "multi":
     result_normalized = [int(scores.get(label, 0.0) >= threshold) for label in self._labels] # type: ignore[no-matching-overload]
 # If single-label: get single-label result as is.
 else:
@@ -410,7 +408,7 @@ def to_hf_dataset(self, docs: Iterable[Doc], threshold: float = 0.5) -> datasets
 def _evaluate_optimization_example(
     self, truth: dspy.Example, pred: dspy.Prediction, trace: Any, model: dspy.LM
 ) -> float:
-    if not self._multi_label:
+    if self._mode == "single":
         return 1 - abs(truth["confidence"] - pred["confidence"]) if truth["label"] == pred["label"] else 0

     # For multi-label: compute label-wise accuracy as
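
Because `mode` defaults to `"multi"` (mirroring the old `multi_label=True` default), only call sites that previously passed `multi_label=False` need updating. A hedged migration sketch, with `model` and the label list as placeholders:

```python
# `model` is assumed to be configured elsewhere; labels are illustrative.

# Unchanged behaviour: multi-label remains the default, so this call needs no edit.
task = Classification(labels=["technology", "politics", "sports"], model=model)

# Was: Classification(..., multi_label=False)
# Now:
task = Classification(labels=["technology", "politics", "sports"], model=model, mode="single")
```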

sieves/tasks/predictive/gliner_bridge.py

Lines changed: 2 additions & 2 deletions
@@ -155,9 +155,9 @@ def integrate(self, results: Sequence[gliner_.Result], docs: list[Doc]) -> list[
 # Used by: Classification
 case gliner_.InferenceMode.classification:
     assert hasattr(self._prompt_signature.schema, "__getitem__")
-    is_multilabel = self._prompt_signature.schema["classifications"][0]["multi_label"]
+    mode = self._prompt_signature.schema["classifications"][0]["mode"]

-    if is_multilabel:
+    if mode == "multi":
         label_scores: list[tuple[str, float]] = []
         for res in sorted(result, key=lambda x: x["score"], reverse=True):
             assert isinstance(res, dict)

sieves/tests/docs/test_optimization.py

Lines changed: 1 addition & 1 deletion
@@ -61,7 +61,7 @@ def test_basic_optimization_example(small_dspy_model):
     },
     model=model,
     fewshot_examples=examples,
-    multi_label=False,
+    mode='single',
     model_settings=ModelSettings(),
 )
 # --8<-- [end:optimization-task-setup]

0 commit comments
