From d97ce50dec04c85a0c474d4a1afccbeefe4cc20b Mon Sep 17 00:00:00 2001
From: franuluc
Date: Fri, 24 Nov 2023 16:23:56 +0100
Subject: [PATCH 1/2] fix(pre-launch science review): changing config, default
 behaviour for aggregation strategy, introducing unknown ratio measure,
 splitting also considers \n

BREAKING CHANGES: ClassificationAccuracyConfig changing fields, introducing 2 more fields
---
 .../classification_accuracy.py      | 69 ++++++++++++++-----
 .../test_classification_accuracy.py | 48 +++++++++++--
 2 files changed, 95 insertions(+), 22 deletions(-)

diff --git a/src/fmeval/eval_algorithms/classification_accuracy.py b/src/fmeval/eval_algorithms/classification_accuracy.py
index 62080fc0..8354fb4b 100644
--- a/src/fmeval/eval_algorithms/classification_accuracy.py
+++ b/src/fmeval/eval_algorithms/classification_accuracy.py
@@ -1,4 +1,5 @@
 import logging
+import re
 import warnings
 from dataclasses import dataclass
 from typing import Callable, Dict, List, Optional, Any
@@ -28,7 +29,6 @@
     DATASET_CONFIGS,
     CategoryScore,
     get_default_prompt_template,
-    WOMENS_CLOTHING_ECOMMERCE_REVIEWS,
 )
 from fmeval.eval_algorithms.util import (
     generate_prompt_column_for_dataset,
@@ -46,13 +46,27 @@
 BALANCED_ACCURACY_SCORE = "balanced_accuracy_score"
 PRECISION_SCORE = "precision_score"
 RECALL_SCORE = "recall_score"
+UNKNOWN_RATIO = "unknown_ratio"

 UNKNOWN_LABEL = "unknown"
 PROMPT_COLUMN_NAME = "prompt"
-CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME = "classified_model_output"
+PREDICTED_CLASS_COLUMN_NAME = "predicted_class"
+
+
+def unknown_ratio(y_pred) -> float:
+    """
+    Computes the ratio of predictions that do not contain any valid class label.
+
+    :param y_pred: Predicted class labels, as a pandas Series.
+    :return: The fraction of predicted labels that do not match any known class.
+    """
+    return (y_pred == UNKNOWN_LABEL).sum() / y_pred.count()
+
+
 CLASSIFICATION_ACCURACY_SCORES_TO_FUNCS: Dict[str, Callable[..., float]] = {
     BALANCED_ACCURACY_SCORE: balanced_accuracy_score,
     PRECISION_SCORE: precision_score,
     RECALL_SCORE: recall_score,
+    UNKNOWN_RATIO: unknown_ratio,
 }

 UNIQUENESS_FACTOR = 0.05
@@ -73,7 +87,7 @@ def convert_model_output_to_label(model_output: str, valid_labels: List[str]) ->
     # normalise to lowercase & strip
     valid_labels = [label.lower().strip() for label in valid_labels]
-    response_words = model_output.split(" ")
+    response_words = re.split(r"\s+", model_output)
     predicted_labels = [word.lower().strip() for word in response_words if word.lower().strip() in valid_labels]
     # if there is more than one label in the model output we pick the first
     string_label = predicted_labels[0] if predicted_labels else UNKNOWN_LABEL
@@ -88,15 +102,24 @@ class ClassificationAccuracyConfig(EvalAlgorithmConfig):
     :param valid_labels: The labels of the classes predicted from the model.
     :param converter_fn: Function to process model output to labels, defaults to simple integer conversion.
+    :param binary_average_strategy: `average` to be passed to sklearn's precision and recall scores.
+        This determines how scores are aggregated in binary classification settings
+        (see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html).
+        Options are {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary'.
+    :param positive_label: The label that is considered to be the positive class when computing precision and recall
+        for binary classification problems and the averaging strategy is `binary`. This parameter has no effect
+        in the multiclass case. Default='1'.
     :param multiclass_average_strategy: `average` to be passed to sklearn's precision and recall scores.
-        This determines how scores are aggregated in the multiclass classification setting
+        This determines how scores are aggregated in multiclass classification settings
         (see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html).
-        Options are {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='micro'.
+        Options are {'micro', 'macro', 'samples', 'weighted'} or None, default='weighted'.
     """

     valid_labels: Optional[List[str]] = None
     converter_fn: Callable[[str, List[str]], str] = convert_model_output_to_label
-    multiclass_average_strategy: Optional[str] = "micro"
+    binary_average_strategy: Optional[str] = "binary"
+    positive_label: str = "1"
+    multiclass_average_strategy: Optional[str] = "weighted"

     def __post_init__(self):
         if self.valid_labels:
@@ -138,7 +161,7 @@ def evaluate(
             EvalAlgorithmInterface.EVAL_RESULTS_PATH
         :param num_records: The number of records to be sampled randomly from the input dataset to perform the
                             evaluation
-        :returns: List of EvalOutput objects. Current implementation returns only one score.
+        :returns: List of EvalOutput objects.
         """
         if dataset_config:
             dataset_configs = [dataset_config]
@@ -185,11 +208,11 @@ def _generate_columns(row: Dict[str, Any]) -> Dict[str, Any]:  # pragma: no cove
             Map function for generating classified model output and classification accuracy columns for dataset.
             """
-            row[CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME] = self._eval_algorithm_config.converter_fn(
+            row[PREDICTED_CLASS_COLUMN_NAME] = self._eval_algorithm_config.converter_fn(
                 row[MODEL_OUTPUT_COLUMN_NAME], self._valid_labels  # type: ignore
             )
             row[CLASSIFICATION_ACCURACY_SCORE] = int(
-                row[CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME] == str(row[TARGET_OUTPUT_COLUMN_NAME])
+                row[PREDICTED_CLASS_COLUMN_NAME] == str(row[TARGET_OUTPUT_COLUMN_NAME])
             )
             return row
@@ -208,7 +231,7 @@ def _generate_columns(row: Dict[str, Any]) -> Dict[str, Any]:  # pragma: no cove
                     value=self._get_score(
                         # TODO dataloader should ensure target output is string
                         y_true=df[TARGET_OUTPUT_COLUMN_NAME],
-                        y_pred=df[CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME],
+                        y_pred=df[PREDICTED_CLASS_COLUMN_NAME],
                         eval_fn=eval_fn,
                     ),
                 )
@@ -232,7 +255,7 @@ def _generate_columns(row: Dict[str, Any]) -> Dict[str, Any]:  # pragma: no cove
                     df[CATEGORY_COLUMN_NAME] == row[CATEGORY_COLUMN_NAME], TARGET_OUTPUT_COLUMN_NAME
                 ]
                 categorical_y_pred = df.loc[
-                    df[CATEGORY_COLUMN_NAME] == row[CATEGORY_COLUMN_NAME], CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME
+                    df[CATEGORY_COLUMN_NAME] == row[CATEGORY_COLUMN_NAME], PREDICTED_CLASS_COLUMN_NAME
                 ]
                 for eval_score, eval_fn in CLASSIFICATION_ACCURACY_SCORES_TO_FUNCS.items():
                     category_scores[row[CATEGORY_COLUMN_NAME]].scores.append(
@@ -274,14 +297,28 @@ def _generate_columns(row: Dict[str, Any]) -> Dict[str, Any]:  # pragma: no cove
     def _get_score(self, y_true, y_pred, eval_fn: Callable[..., float]) -> float:
         """
-        Method to generate accuracy score
-        :param y_true: Ground truth (correct) target values.
-        :param y_pred: Estimated targets as returned by a classifier.
-        :param eval_fn: Score evaluate function.
+        Method to compute accuracy scores.
+        :param y_true: Ground truth (correct) labels.
+        :param y_pred: Predicted labels.
+        :param eval_fn: Function used to compute the score.
         :returns: Computed score
         """
         if eval_fn == recall_score or eval_fn == precision_score:
-            return eval_fn(y_true, y_pred, average=self._eval_algorithm_config.multiclass_average_strategy)
+            # compute these metrics only on the subset of records that have a known predicted label
+            y_true_sub = y_true[y_pred != UNKNOWN_LABEL]
+            y_pred_sub = y_pred[y_pred != UNKNOWN_LABEL]
+
+            # picks the averaging strategy according to the number of classes
+            avg_strategy = (
+                self._eval_algorithm_config.binary_average_strategy
+                if len(self._valid_labels) == 2
+                else self._eval_algorithm_config.multiclass_average_strategy
+            )
+            return eval_fn(
+                y_true_sub, y_pred_sub, average=avg_strategy, pos_label=self._eval_algorithm_config.positive_label
+            )
+        elif eval_fn == unknown_ratio:
+            return eval_fn(y_pred)
         return eval_fn(y_true, y_pred)

     def evaluate_sample(self, target_output: str, model_output: str) -> List[EvalScore]:  # type: ignore[override]
diff --git a/test/unit/eval_algorithms/test_classification_accuracy.py b/test/unit/eval_algorithms/test_classification_accuracy.py
index d0d5128e..f455409d 100644
--- a/test/unit/eval_algorithms/test_classification_accuracy.py
+++ b/test/unit/eval_algorithms/test_classification_accuracy.py
@@ -31,6 +31,7 @@
     BALANCED_ACCURACY_SCORE,
     PRECISION_SCORE,
     RECALL_SCORE,
+    UNKNOWN_RATIO,
 )
 from ray.data import Dataset
@@ -77,6 +78,15 @@
     EvalScore(name=BALANCED_ACCURACY_SCORE, value=1 / 2),
     EvalScore(name=PRECISION_SCORE, value=2 / 3),
     EvalScore(name=RECALL_SCORE, value=2 / 3),
+    EvalScore(name=UNKNOWN_RATIO, value=0.0),
+]
+
+DATASET_SCORES_WO_CONFIG = [
+    EvalScore(name=CLASSIFICATION_ACCURACY_SCORE, value=2 / 3),
+    EvalScore(name=BALANCED_ACCURACY_SCORE, value=1 / 2),
+    EvalScore(name=PRECISION_SCORE, value=0),
+    EvalScore(name=RECALL_SCORE, value=0),
+    EvalScore(name=UNKNOWN_RATIO, value=1 / 3),
 ]

 CATEGORY_SCORES = [
@@ -87,6 +97,7 @@
             EvalScore(name=BALANCED_ACCURACY_SCORE, value=1.0),
             EvalScore(name=PRECISION_SCORE, value=1.0),
             EvalScore(name=RECALL_SCORE, value=1.0),
+            EvalScore(name=UNKNOWN_RATIO, value=0.0),
         ],
     ),
     CategoryScore(
@@ -96,6 +107,31 @@
             EvalScore(name=BALANCED_ACCURACY_SCORE, value=1 / 2),
             EvalScore(name=PRECISION_SCORE, value=1 / 2),
             EvalScore(name=RECALL_SCORE, value=1 / 2),
+            EvalScore(name=UNKNOWN_RATIO, value=0.0),
         ],
     ),
 ]
+
+
+CATEGORY_SCORES_WO_CONFIG = [
+    CategoryScore(
+        name="brownie",
+        scores=[
+            EvalScore(name=CLASSIFICATION_ACCURACY_SCORE, value=1.0),
+            EvalScore(name=BALANCED_ACCURACY_SCORE, value=1.0),
+            EvalScore(name=PRECISION_SCORE, value=0),
+            EvalScore(name=RECALL_SCORE, value=0),
+            EvalScore(name=UNKNOWN_RATIO, value=0),
+        ],
+    ),
+    CategoryScore(
+        name="vanilla cake",
+        scores=[
+            EvalScore(name=CLASSIFICATION_ACCURACY_SCORE, value=1 / 2),
+            EvalScore(name=BALANCED_ACCURACY_SCORE, value=1 / 2),
+            EvalScore(name=PRECISION_SCORE, value=0),
+            EvalScore(name=RECALL_SCORE, value=0),
+            EvalScore(name=UNKNOWN_RATIO, value=1 / 2),
+        ],
+    ),
+]
@@ -185,8 +221,8 @@ def test_classification_accuracy_evaluate_without_model(
                     eval_name="classification_accuracy",
                     dataset_name=WOMENS_CLOTHING_ECOMMERCE_REVIEWS,
                     prompt_template=BUILT_IN_DATASET_DEFAULT_PROMPT_TEMPLATES[WOMENS_CLOTHING_ECOMMERCE_REVIEWS],
-                    dataset_scores=DATASET_SCORES,
-                    category_scores=CATEGORY_SCORES,
+                    dataset_scores=DATASET_SCORES_WO_CONFIG,
+                    category_scores=CATEGORY_SCORES_WO_CONFIG,
                     output_path="/tmp/eval_results/classification_accuracy_womens_clothing_ecommerce_reviews.jsonl",
                 ),
             ],
@@ -209,8 +245,8 @@ def test_classification_accuracy_evaluate_without_model(
                     eval_name="classification_accuracy",
                     dataset_name="my_custom_dataset",
                     prompt_template="Classify: $feature",
-                    dataset_scores=DATASET_SCORES,
-                    category_scores=CATEGORY_SCORES,
+                    dataset_scores=DATASET_SCORES_WO_CONFIG,
+                    category_scores=CATEGORY_SCORES_WO_CONFIG,
                     output_path="/tmp/eval_results/classification_accuracy_my_custom_dataset.jsonl",
                 )
             ],
@@ -233,7 +269,7 @@ def test_classification_accuracy_evaluate_without_model(
                     eval_name="classification_accuracy",
                     dataset_name="my_custom_dataset",
                     prompt_template="Classify: $feature",
-                    dataset_scores=DATASET_SCORES,
+                    dataset_scores=DATASET_SCORES_WO_CONFIG,
                     category_scores=None,
                     output_path="/tmp/eval_results/classification_accuracy_my_custom_dataset.jsonl",
                 )
@@ -257,7 +293,7 @@ def test_classification_accuracy_evaluate_without_model(
                     eval_name="classification_accuracy",
                     dataset_name="my_custom_dataset",
                     prompt_template=DEFAULT_PROMPT_TEMPLATE,
-                    dataset_scores=DATASET_SCORES,
+                    dataset_scores=DATASET_SCORES_WO_CONFIG,
                     category_scores=None,
                     output_path="/tmp/eval_results/classification_accuracy_my_custom_dataset.jsonl",
                )

From a037b04c50f68b50d7efff50dc68933332db0aeb Mon Sep 17 00:00:00 2001
From: franuluc
Date: Fri, 24 Nov 2023 16:23:56 +0100
Subject: [PATCH 2/2] fix(pre-launch science review): changing config, default
 behaviour for aggregation strategy, introducing unknown ratio measure,
 splitting also considers \n

BREAKING CHANGES: ClassificationAccuracyConfig changing fields, introducing 2 more fields
---
 src/fmeval/eval_algorithms/classification_accuracy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fmeval/eval_algorithms/classification_accuracy.py b/src/fmeval/eval_algorithms/classification_accuracy.py
index 8354fb4b..7103e7a1 100644
--- a/src/fmeval/eval_algorithms/classification_accuracy.py
+++ b/src/fmeval/eval_algorithms/classification_accuracy.py
@@ -311,7 +311,7 @@ def _get_score(self, y_true, y_pred, eval_fn: Callable[..., float]) -> float:
             # picks the averaging strategy according to the number of classes
             avg_strategy = (
                 self._eval_algorithm_config.binary_average_strategy
-                if len(self._valid_labels) == 2
+                if len(self._valid_labels) == 2  # type: ignore
                 else self._eval_algorithm_config.multiclass_average_strategy
             )
             return eval_fn(
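
For illustration, a minimal standalone sketch of the two behaviour changes this series introduces: whitespace- and newline-aware label extraction, and the unknown-ratio metric computed over the resulting predictions. The helper bodies mirror the diff above; pandas is assumed only because unknown_ratio relies on Series operations, and the labels and model outputs are made-up examples, not fixtures from the tests.

import re

import pandas as pd

UNKNOWN_LABEL = "unknown"


def convert_model_output_to_label(model_output, valid_labels):
    # Mirrors the converter above: split on any whitespace run (including \n),
    # normalise tokens, and return the first token matching a valid label.
    valid_labels = [label.lower().strip() for label in valid_labels]
    words = re.split(r"\s+", model_output)
    matches = [w.lower().strip() for w in words if w.lower().strip() in valid_labels]
    return matches[0] if matches else UNKNOWN_LABEL


def unknown_ratio(y_pred: pd.Series) -> float:
    # Fraction of predictions that matched no valid label.
    return (y_pred == UNKNOWN_LABEL).sum() / y_pred.count()


labels = ["1", "0"]
outputs = ["The review is positive:\n1", "0", "cannot classify this"]
preds = pd.Series([convert_model_output_to_label(o, labels) for o in outputs])
print(preds.tolist())        # ['1', '0', 'unknown']
print(unknown_ratio(preds))  # 0.3333...

As in _get_score above, precision and recall would then be computed only over the rows whose predicted label is not "unknown", with the averaging strategy chosen by the number of valid labels.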