From d97ce50dec04c85a0c474d4a1afccbeefe4cc20b Mon Sep 17 00:00:00 2001
From: franuluc
Date: Fri, 24 Nov 2023 16:23:56 +0100
Subject: [PATCH 1/2] fix(pre-launch science review): changing config, default
 behaviour for aggregation strategy, introducing unknown ratio measure,
 splitting also considers \n

BREAKING CHANGES: ClassificationAccuracyConfig changing fields, introducing 2 more fields
---
 .../classification_accuracy.py      | 69 ++++++++++++++-----
 .../test_classification_accuracy.py | 48 +++++++++++--
 2 files changed, 95 insertions(+), 22 deletions(-)

diff --git a/src/fmeval/eval_algorithms/classification_accuracy.py b/src/fmeval/eval_algorithms/classification_accuracy.py
index 62080fc0..8354fb4b 100644
--- a/src/fmeval/eval_algorithms/classification_accuracy.py
+++ b/src/fmeval/eval_algorithms/classification_accuracy.py
@@ -1,4 +1,5 @@
 import logging
+import re
 import warnings
 from dataclasses import dataclass
 from typing import Callable, Dict, List, Optional, Any
@@ -28,7 +29,6 @@
     DATASET_CONFIGS,
     CategoryScore,
     get_default_prompt_template,
-    WOMENS_CLOTHING_ECOMMERCE_REVIEWS,
 )
 from fmeval.eval_algorithms.util import (
     generate_prompt_column_for_dataset,
@@ -46,13 +46,27 @@
 BALANCED_ACCURACY_SCORE = "balanced_accuracy_score"
 PRECISION_SCORE = "precision_score"
 RECALL_SCORE = "recall_score"
+UNKNOWN_RATIO = "unknown_ratio"

 UNKNOWN_LABEL = "unknown"
 PROMPT_COLUMN_NAME = "prompt"
-CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME = "classified_model_output"
+PREDICTED_CLASS_COLUMN_NAME = "predicted_class"
+
+
+def unknown_ratio(y_pred) -> float:
+    """
+    Computes the ratio of predictions that do not contain any valid class label.
+
+    :param y_pred: Predicted class labels, as a pandas Series.
+    :return: The fraction of predicted labels that do not match any known class.
+    """
+    return (y_pred == UNKNOWN_LABEL).sum() / y_pred.count()
+
+
 CLASSIFICATION_ACCURACY_SCORES_TO_FUNCS: Dict[str, Callable[..., float]] = {
     BALANCED_ACCURACY_SCORE: balanced_accuracy_score,
     PRECISION_SCORE: precision_score,
     RECALL_SCORE: recall_score,
+    UNKNOWN_RATIO: unknown_ratio,
 }

 UNIQUENESS_FACTOR = 0.05
@@ -73,7 +87,7 @@ def convert_model_output_to_label(model_output: str, valid_labels: List[str]) ->
     # normalise to lowercase & strip
     valid_labels = [label.lower().strip() for label in valid_labels]
-    response_words = model_output.split(" ")
+    response_words = re.split(r"\s+", model_output)
     predicted_labels = [word.lower().strip() for word in response_words if word.lower().strip() in valid_labels]
     # if there is more than one label in the model output we pick the first
     string_label = predicted_labels[0] if predicted_labels else UNKNOWN_LABEL
@@ -88,15 +102,24 @@ class ClassificationAccuracyConfig(EvalAlgorithmConfig):
     :param valid_labels: The labels of the classes predicted from the model.
     :param converter_fn: Function to process model output to labels, defaults to simple integer conversion.
+    :param binary_average_strategy: `average` to be passed to sklearn's precision and recall scores.
+        This determines how scores are aggregated in binary classification settings
+        (see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html).
+        Options are {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary'.
+    :param positive_label: The label that is considered to be the positive class when computing precision and recall
+        for binary classification problems and the averaging strategy is `binary`. This parameter has no effect
+        in the multiclass case. Default='1'.
     :param multiclass_average_strategy: `average` to be passed to sklearn's precision and recall scores.
-        This determines how scores are aggregated in the multiclass classification setting
+        This determines how scores are aggregated in multiclass classification settings
         (see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html).
-        Options are {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='micro'.
+        Options are {'micro', 'macro', 'samples', 'weighted'} or None, default='weighted'.
     """

     valid_labels: Optional[List[str]] = None
     converter_fn: Callable[[str, List[str]], str] = convert_model_output_to_label
-    multiclass_average_strategy: Optional[str] = "micro"
+    binary_average_strategy: Optional[str] = "binary"
+    positive_label: str = "1"
+    multiclass_average_strategy: Optional[str] = "weighted"

     def __post_init__(self):
         if self.valid_labels:
@@ -138,7 +161,7 @@ def evaluate(
             EvalAlgorithmInterface.EVAL_RESULTS_PATH
         :param num_records: The number of records to be sampled randomly from the input dataset to perform the
                             evaluation
-        :returns: List of EvalOutput objects. Current implementation returns only one score.
+        :returns: List of EvalOutput objects.
         """
         if dataset_config:
             dataset_configs = [dataset_config]
@@ -185,11 +208,11 @@ def _generate_columns(row: Dict[str, Any]) -> Dict[str, Any]:  # pragma: no cove
             Map function for generating classified model output and classification accuracy columns for dataset.
             """
-            row[CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME] = self._eval_algorithm_config.converter_fn(
+            row[PREDICTED_CLASS_COLUMN_NAME] = self._eval_algorithm_config.converter_fn(
                 row[MODEL_OUTPUT_COLUMN_NAME], self._valid_labels  # type: ignore
             )
             row[CLASSIFICATION_ACCURACY_SCORE] = int(
-                row[CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME] == str(row[TARGET_OUTPUT_COLUMN_NAME])
+                row[PREDICTED_CLASS_COLUMN_NAME] == str(row[TARGET_OUTPUT_COLUMN_NAME])
             )
             return row
@@ -208,7 +231,7 @@ def _generate_columns(row: Dict[str, Any]) -> Dict[str, Any]:  # pragma: no cove
                     value=self._get_score(
                         # TODO dataloader should ensure target output is string
                         y_true=df[TARGET_OUTPUT_COLUMN_NAME],
-                        y_pred=df[CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME],
+                        y_pred=df[PREDICTED_CLASS_COLUMN_NAME],
                         eval_fn=eval_fn,
                     ),
                 )
@@ -232,7 +255,7 @@ def _generate_columns(row: Dict[str, Any]) -> Dict[str, Any]:  # pragma: no cove
                     df[CATEGORY_COLUMN_NAME] == row[CATEGORY_COLUMN_NAME], TARGET_OUTPUT_COLUMN_NAME
                 ]
                 categorical_y_pred = df.loc[
-                    df[CATEGORY_COLUMN_NAME] == row[CATEGORY_COLUMN_NAME], CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME
+                    df[CATEGORY_COLUMN_NAME] == row[CATEGORY_COLUMN_NAME], PREDICTED_CLASS_COLUMN_NAME
                 ]
                 for eval_score, eval_fn in CLASSIFICATION_ACCURACY_SCORES_TO_FUNCS.items():
                     category_scores[row[CATEGORY_COLUMN_NAME]].scores.append(
@@ -274,14 +297,28 @@ def _generate_columns(row: Dict[str, Any]) -> Dict[str, Any]:  # pragma: no cove
     def _get_score(self, y_true, y_pred, eval_fn: Callable[..., float]) -> float:
         """
-        Method to generate accuracy score
-        :param y_true: Ground truth (correct) target values.
-        :param y_pred: Estimated targets as returned by a classifier.
-        :param eval_fn: Score evaluate function.
+        Method to compute accuracy scores.
+        :param y_true: Ground truth (correct) labels.
+        :param y_pred: Predicted labels.
+        :param eval_fn: Function used to compute the score.
         :returns: Computed score
         """
         if eval_fn == recall_score or eval_fn == precision_score:
-            return eval_fn(y_true, y_pred, average=self._eval_algorithm_config.multiclass_average_strategy)
+            # compute these metrics only on the subset of records that have a known predicted label
+            y_true_sub = y_true[y_pred != UNKNOWN_LABEL]
+            y_pred_sub = y_pred[y_pred != UNKNOWN_LABEL]
+
+            # picks the averaging strategy according to the number of classes
+            avg_strategy = (
+                self._eval_algorithm_config.binary_average_strategy
+                if len(self._valid_labels) == 2
+                else self._eval_algorithm_config.multiclass_average_strategy
+            )
+            return eval_fn(
+                y_true_sub, y_pred_sub, average=avg_strategy, pos_label=self._eval_algorithm_config.positive_label
+            )
+        elif eval_fn == unknown_ratio:
+            return eval_fn(y_pred)
         return eval_fn(y_true, y_pred)

     def evaluate_sample(self, target_output: str, model_output: str) -> List[EvalScore]:  # type: ignore[override]
diff --git a/test/unit/eval_algorithms/test_classification_accuracy.py b/test/unit/eval_algorithms/test_classification_accuracy.py
index d0d5128e..f455409d 100644
--- a/test/unit/eval_algorithms/test_classification_accuracy.py
+++ b/test/unit/eval_algorithms/test_classification_accuracy.py
@@ -31,6 +31,7 @@
     BALANCED_ACCURACY_SCORE,
     PRECISION_SCORE,
     RECALL_SCORE,
+    UNKNOWN_RATIO,
 )
 from ray.data import Dataset
@@ -77,6 +78,15 @@
     EvalScore(name=BALANCED_ACCURACY_SCORE, value=1 / 2),
     EvalScore(name=PRECISION_SCORE, value=2 / 3),
     EvalScore(name=RECALL_SCORE, value=2 / 3),
+    EvalScore(name=UNKNOWN_RATIO, value=0.0),
+]
+
+DATASET_SCORES_WO_CONFIG = [
+    EvalScore(name=CLASSIFICATION_ACCURACY_SCORE, value=2 / 3),
+    EvalScore(name=BALANCED_ACCURACY_SCORE, value=1 / 2),
+    EvalScore(name=PRECISION_SCORE, value=0),
+    EvalScore(name=RECALL_SCORE, value=0),
+    EvalScore(name=UNKNOWN_RATIO, value=1 / 3),
 ]

 CATEGORY_SCORES = [
@@ -87,6 +97,7 @@
             EvalScore(name=BALANCED_ACCURACY_SCORE, value=1.0),
             EvalScore(name=PRECISION_SCORE, value=1.0),
             EvalScore(name=RECALL_SCORE, value=1.0),
+            EvalScore(name=UNKNOWN_RATIO, value=0.0),
         ],
     ),
     CategoryScore(
@@ -96,6 +107,31 @@
             EvalScore(name=BALANCED_ACCURACY_SCORE, value=1 / 2),
             EvalScore(name=PRECISION_SCORE, value=1 / 2),
             EvalScore(name=RECALL_SCORE, value=1 / 2),
+            EvalScore(name=UNKNOWN_RATIO, value=0.0),
         ],
     ),
 ]
+
+
+CATEGORY_SCORES_WO_CONFIG = [
+    CategoryScore(
+        name="brownie",
+        scores=[
+            EvalScore(name=CLASSIFICATION_ACCURACY_SCORE, value=1.0),
+            EvalScore(name=BALANCED_ACCURACY_SCORE, value=1.0),
+            EvalScore(name=PRECISION_SCORE, value=0),
+            EvalScore(name=RECALL_SCORE, value=0),
+            EvalScore(name=UNKNOWN_RATIO, value=0),
+        ],
+    ),
+    CategoryScore(
+        name="vanilla cake",
+        scores=[
+            EvalScore(name=CLASSIFICATION_ACCURACY_SCORE, value=1 / 2),
+            EvalScore(name=BALANCED_ACCURACY_SCORE, value=1 / 2),
+            EvalScore(name=PRECISION_SCORE, value=0),
+            EvalScore(name=RECALL_SCORE, value=0),
+            EvalScore(name=UNKNOWN_RATIO, value=1 / 2),
+        ],
+    ),
+]
@@ -185,8 +221,8 @@ def test_classification_accuracy_evaluate_without_model(
                     eval_name="classification_accuracy",
                     dataset_name=WOMENS_CLOTHING_ECOMMERCE_REVIEWS,
                     prompt_template=BUILT_IN_DATASET_DEFAULT_PROMPT_TEMPLATES[WOMENS_CLOTHING_ECOMMERCE_REVIEWS],
-                    dataset_scores=DATASET_SCORES,
-                    category_scores=CATEGORY_SCORES,
+                    dataset_scores=DATASET_SCORES_WO_CONFIG,
+                    category_scores=CATEGORY_SCORES_WO_CONFIG,
                     output_path="/tmp/eval_results/classification_accuracy_womens_clothing_ecommerce_reviews.jsonl",
                 ),
             ],
@@ -209,8 +245,8 @@ def test_classification_accuracy_evaluate_without_model(
                     eval_name="classification_accuracy",
                     dataset_name="my_custom_dataset",
                     prompt_template="Classify: $feature",
-                    dataset_scores=DATASET_SCORES,
-                    category_scores=CATEGORY_SCORES,
+                    dataset_scores=DATASET_SCORES_WO_CONFIG,
+                    category_scores=CATEGORY_SCORES_WO_CONFIG,
                     output_path="/tmp/eval_results/classification_accuracy_my_custom_dataset.jsonl",
                 )
             ],
@@ -233,7 +269,7 @@ def test_classification_accuracy_evaluate_without_model(
                     eval_name="classification_accuracy",
                     dataset_name="my_custom_dataset",
                     prompt_template="Classify: $feature",
-                    dataset_scores=DATASET_SCORES,
+                    dataset_scores=DATASET_SCORES_WO_CONFIG,
                     category_scores=None,
                     output_path="/tmp/eval_results/classification_accuracy_my_custom_dataset.jsonl",
                 )
@@ -257,7 +293,7 @@ def test_classification_accuracy_evaluate_without_model(
                     eval_name="classification_accuracy",
                     dataset_name="my_custom_dataset",
                     prompt_template=DEFAULT_PROMPT_TEMPLATE,
-                    dataset_scores=DATASET_SCORES,
+                    dataset_scores=DATASET_SCORES_WO_CONFIG,
                     category_scores=None,
                     output_path="/tmp/eval_results/classification_accuracy_my_custom_dataset.jsonl",
                )

From a037b04c50f68b50d7efff50dc68933332db0aeb Mon Sep 17 00:00:00 2001
From: franuluc
Date: Fri, 24 Nov 2023 16:23:56 +0100
Subject: [PATCH 2/2] fix(pre-launch science review): changing config, default
 behaviour for aggregation strategy, introducing unknown ratio measure,
 splitting also considers \n

BREAKING CHANGES: ClassificationAccuracyConfig changing fields, introducing 2 more fields
---
 src/fmeval/eval_algorithms/classification_accuracy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fmeval/eval_algorithms/classification_accuracy.py b/src/fmeval/eval_algorithms/classification_accuracy.py
index 8354fb4b..7103e7a1 100644
--- a/src/fmeval/eval_algorithms/classification_accuracy.py
+++ b/src/fmeval/eval_algorithms/classification_accuracy.py
@@ -311,7 +311,7 @@ def _get_score(self, y_true, y_pred, eval_fn: Callable[..., float]) -> float:
             # picks the averaging strategy according to the number of classes
             avg_strategy = (
                 self._eval_algorithm_config.binary_average_strategy
-                if len(self._valid_labels) == 2
+                if len(self._valid_labels) == 2  # type: ignore
                 else self._eval_algorithm_config.multiclass_average_strategy
             )
             return eval_fn(
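
For illustration, a minimal standalone sketch of the two behaviour changes this series introduces: whitespace- and newline-aware label extraction, and the unknown-ratio metric computed over the resulting predictions. The helper bodies mirror the diff above; pandas is assumed only because unknown_ratio relies on Series operations, and the labels and model outputs are made-up examples, not fixtures from the tests.

import re

import pandas as pd

UNKNOWN_LABEL = "unknown"


def convert_model_output_to_label(model_output, valid_labels):
    # Mirrors the converter above: split on any whitespace run (including \n),
    # normalise tokens, and return the first token matching a valid label.
    valid_labels = [label.lower().strip() for label in valid_labels]
    words = re.split(r"\s+", model_output)
    matches = [w.lower().strip() for w in words if w.lower().strip() in valid_labels]
    return matches[0] if matches else UNKNOWN_LABEL


def unknown_ratio(y_pred: pd.Series) -> float:
    # Fraction of predictions that matched no valid label.
    return (y_pred == UNKNOWN_LABEL).sum() / y_pred.count()


labels = ["1", "0"]
outputs = ["The review is positive:\n1", "0", "cannot classify this"]
preds = pd.Series([convert_model_output_to_label(o, labels) for o in outputs])
print(preds.tolist())        # ['1', '0', 'unknown']
print(unknown_ratio(preds))  # 0.3333...

As in _get_score above, precision and recall would then be computed only over the rows whose predicted label is not "unknown", with the averaging strategy chosen by the number of valid labels.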