69 changes: 53 additions & 16 deletions src/fmeval/eval_algorithms/classification_accuracy.py
@@ -1,4 +1,5 @@
import logging
import re
import warnings
from dataclasses import dataclass
from typing import Callable, Dict, List, Optional, Any
@@ -28,7 +29,6 @@
DATASET_CONFIGS,
CategoryScore,
get_default_prompt_template,
WOMENS_CLOTHING_ECOMMERCE_REVIEWS,
)
from fmeval.eval_algorithms.util import (
generate_prompt_column_for_dataset,
@@ -46,13 +46,27 @@
BALANCED_ACCURACY_SCORE = "balanced_accuracy_score"
PRECISION_SCORE = "precision_score"
RECALL_SCORE = "recall_score"
UNKNOWN_RATIO = "unknown_ratio"
Contributor

We can't add new scores right now. We are very close to launch; this can break integration tests for us and clients.

Can we please stick to bug fixes and refactoring only? New scores should be added post launch.

Contributor Author

Sure, we can put this one later on.

UNKNOWN_LABEL = "unknown"
PROMPT_COLUMN_NAME = "prompt"
CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME = "classified_model_output"
PREDICTED_CLASS_COLUMN_NAME = "predicted_class"


def unknown_ratio(y_pred) -> float:
Contributor

Please add type annotations.

Contributor (@danielezhu, Jan 12, 2024)

I think type annotations are tricky in these kinds of use cases where the argument is best described as a "1D array-like" (see the sklearn docs for example).

I suppose we could do something like Union[pandas.DataFrame, pandas.Series, numpy.array, List[Any]], but that would be quite clunky, and not even necessarily exhaustive.

Contributor

Perhaps rename this to unknown_fraction instead?

"""
Computes the ratio of predictions that do not contain any valid class label.

:param y_pred: Predicted class labels (1D array-like, e.g. a pandas Series).
:return: The fraction of predicted labels that do not match any known class.
"""
return (y_pred == UNKNOWN_LABEL).sum() / y_pred.count()
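
On the type-annotation thread above, a minimal sketch under stated assumptions: a Union-based alias along the lines danielezhu mentions, plus a Series-typed variant of the function, since .count() as used here is a pandas Series method. ArrayLike1D and unknown_fraction are illustrative names, not part of this PR.

from typing import Any, List, Union

import numpy as np
import pandas as pd

# Illustrative alias along the lines discussed above; deliberately not exhaustive.
ArrayLike1D = Union[pd.Series, np.ndarray, List[Any]]


def unknown_fraction(y_pred: pd.Series) -> float:
    # Same computation as unknown_ratio, typed for a Series because
    # .count() (number of non-NA entries) is a pandas method.
    return float((y_pred == "unknown").sum() / y_pred.count())


print(unknown_fraction(pd.Series(["1", "unknown", "0"])))  # ~0.33

Either way, the computation itself assumes pandas semantics, which may be the simpler thing to annotate.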


CLASSIFICATION_ACCURACY_SCORES_TO_FUNCS: Dict[str, Callable[..., float]] = {
BALANCED_ACCURACY_SCORE: balanced_accuracy_score,
PRECISION_SCORE: precision_score,
RECALL_SCORE: recall_score,
UNKNOWN_RATIO: unknown_ratio,
}
UNIQUENESS_FACTOR = 0.05

@@ -73,7 +87,7 @@ def convert_model_output_to_label(model_output: str, valid_labels: List[str]) ->
# normalise to lowercase & strip
valid_labels = [label.lower().strip() for label in valid_labels]

response_words = model_output.split(" ")
response_words = re.split(r"[\n\s+]", model_output)
Contributor

This is not tested as far as I can tell, and I am in doubt about what it does and how it relates to batching. Can you add a unit test please?

Contributor Author

This is a standard regular expression. It splits on newlines \n and other whitespace \s; the [...] indicates set matching (which is the same as a logical OR among characters).

Contributor

So we are anticipating new lines within a single answer? Are you imagining a case where the model replies something like this?

The answer is
2

I still think a unit test could be a good way to show what we are matching here; it could just be one more test case in test_convert_model_output_to_label.

predicted_labels = [word.lower().strip() for word in response_words if word.lower().strip() in valid_labels]
# if there is more than one label in the model output we pick the first
string_label = predicted_labels[0] if predicted_labels else UNKNOWN_LABEL
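
To make the reviewer's suggestion concrete, a sketch of one extra case for test_convert_model_output_to_label, assuming the same import as the existing test module; the labels and expected values are made up for illustration.

from fmeval.eval_algorithms.classification_accuracy import convert_model_output_to_label


def test_convert_model_output_to_label_with_newline():
    # A reply where the label sits on its own line, as in the example above.
    assert convert_model_output_to_label(model_output="The answer is \n2", valid_labels=["0", "1", "2"]) == "2"
    # A reply with no valid label at all falls back to "unknown".
    assert convert_model_output_to_label(model_output="no idea\nat all", valid_labels=["0", "1", "2"]) == "unknown"
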
@@ -88,15 +102,24 @@ class ClassificationAccuracyConfig(EvalAlgorithmConfig):

:param valid_labels: The labels of the classes predicted from the model.
:param converter_fn: Function to process model output to labels, defaults to simple integer conversion.
:param binary_average_strategy: `average` to be passed to sklearn's precision and recall scores.
This determines how scores are aggregated in the binary classification settings
(see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html).
Options are {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary'.
:param positive_label: The label that is considered to be the positive class when computing precision and recall
for binary classification problems and the averaging strategy is `binary`. This parameter has no effect
for the multiclass case. Default='1'.
:param multiclass_average_strategy: `average` to be passed to sklearn's precision and recall scores.
This determines how scores are aggregated in the multiclass classification setting
This determines how scores are aggregated in multiclass classification settings
(see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html).
Options are {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='micro'.
Options are {'micro', 'macro', 'samples', 'weighted'} or None, default='weighted'.
"""

valid_labels: Optional[List[str]] = None
converter_fn: Callable[[str, List[str]], str] = convert_model_output_to_label
multiclass_average_strategy: Optional[str] = "micro"
binary_average_strategy: Optional[str] = "binary"
Contributor

If you update this, you have to update ClassificationAccuracySemanticRobustnessConfig as well.

Contributor

Do we need a variable for binary_average_strategy="binary"? It can be hardcoded, since there are no other options for the binary case.

Contributor Author

@keerthanvasist OK, will do.
@polaschwoebel No, all the other options are valid for the binary case as well and result in different computations, so they still make sense.

Contributor

I see. I doubt that the others will be used much -- "binary" is very much the standard behavior, as we discussed offline -- but if we can add this new parameter to the config without too much trouble, it's of course more flexible.

Contributor

I very much agree with Pola; "binary" is pretty much the only option that makes sense here. I suppose giving the user the freedom to choose other options is fine, since we've configured the default appropriately. If they choose something like "micro" and are confused why precision and recall are always the same, that's a self-inflicted customer error.

positive_label: str = "1"
Contributor

I am also concerned with this default. I am worried this won't play well with valid_labels. Let's discuss on Monday.

Contributor Author

OK. What are you concerned about?

Contributor

I agree with Keerthan: positive_label should be one of the valid_labels.

Contributor Author

Yes, true, but the problem is that if we autodetect the valid labels, I don't think the order of the resulting list is going to be deterministic (especially when we do downsampling). So if we pick, say, the first one, we might pick a label that depends on the random seed, which is also not that good.

What I can do is leave positive_label here, and if positive_label is not in valid_labels, pick the first of valid_labels and print a warning. Would that work?

Contributor

I think so!

multiclass_average_strategy: Optional[str] = "weighted"

def __post_init__(self):
if self.valid_labels:
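
As background for the averaging discussion above, a small self-contained check of why average="micro" makes precision and recall collapse to the same number while average="binary" does not; the labels and values are made up for illustration.

from sklearn.metrics import precision_score, recall_score

y_true = ["1", "1", "0", "0"]
y_pred = ["1", "0", "0", "0"]

# "binary" scores only the positive class: precision 1.0, recall 0.5.
print(precision_score(y_true, y_pred, average="binary", pos_label="1"))
print(recall_score(y_true, y_pred, average="binary", pos_label="1"))

# "micro" pools all classes, so precision == recall == accuracy (0.75 here).
print(precision_score(y_true, y_pred, average="micro"))
print(recall_score(y_true, y_pred, average="micro"))
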
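A rough sketch of the fallback agreed on in the positive_label thread, assuming it would live in __post_init__; the standalone dataclass below only mirrors the relevant fields and is not the PR's implementation.

import warnings
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class _ConfigSketch:
    valid_labels: Optional[List[str]] = None
    positive_label: str = "1"

    def __post_init__(self):
        # If the configured positive label is not among the valid labels,
        # fall back to the first valid label and warn.
        if self.valid_labels and self.positive_label not in self.valid_labels:
            warnings.warn(
                f"positive_label '{self.positive_label}' is not in valid_labels; "
                f"using '{self.valid_labels[0]}' instead."
            )
            self.positive_label = self.valid_labels[0]


_ConfigSketch(valid_labels=["entailment", "contradiction"])  # warns and uses "entailment"
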
@@ -138,7 +161,7 @@ def evaluate(
EvalAlgorithmInterface.EVAL_RESULTS_PATH
:param num_records: The number of records to be sampled randomly from the input dataset to perform the
evaluation
:returns: List of EvalOutput objects. Current implementation returns only one score.
:returns: List of EvalOutput objects.
"""
if dataset_config:
dataset_configs = [dataset_config]
@@ -185,11 +208,11 @@ def _generate_columns(row: Dict[str, Any]) -> Dict[str, Any]:  # pragma: no cover
Map function for generating classified model output and classification accuracy
columns for dataset.
"""
row[CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME] = self._eval_algorithm_config.converter_fn(
row[PREDICTED_CLASS_COLUMN_NAME] = self._eval_algorithm_config.converter_fn(
row[MODEL_OUTPUT_COLUMN_NAME], self._valid_labels # type: ignore
)
row[CLASSIFICATION_ACCURACY_SCORE] = int(
row[CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME] == str(row[TARGET_OUTPUT_COLUMN_NAME])
row[PREDICTED_CLASS_COLUMN_NAME] == str(row[TARGET_OUTPUT_COLUMN_NAME])
)
return row

@@ -208,7 +231,7 @@ def _generate_columns(row: Dict[str, Any]) -> Dict[str, Any]:  # pragma: no cover
value=self._get_score(
# TODO dataloader should ensure target output is string
y_true=df[TARGET_OUTPUT_COLUMN_NAME],
y_pred=df[CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME],
y_pred=df[PREDICTED_CLASS_COLUMN_NAME],
eval_fn=eval_fn,
),
)
@@ -232,7 +255,7 @@ def _generate_columns(row: Dict[str, Any]) -> Dict[str, Any]:  # pragma: no cover
df[CATEGORY_COLUMN_NAME] == row[CATEGORY_COLUMN_NAME], TARGET_OUTPUT_COLUMN_NAME
]
categorical_y_pred = df.loc[
df[CATEGORY_COLUMN_NAME] == row[CATEGORY_COLUMN_NAME], CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME
df[CATEGORY_COLUMN_NAME] == row[CATEGORY_COLUMN_NAME], PREDICTED_CLASS_COLUMN_NAME
]
for eval_score, eval_fn in CLASSIFICATION_ACCURACY_SCORES_TO_FUNCS.items():
category_scores[row[CATEGORY_COLUMN_NAME]].scores.append(
@@ -274,14 +297,28 @@ def _generate_columns(row: Dict[str, Any]) -> Dict[str, Any]:  # pragma: no cover

def _get_score(self, y_true, y_pred, eval_fn: Callable[..., float]) -> float:
Contributor

Please add unit tests for _get_score covering the various cases (binary recall, binary precision, multiclass recall, multiclass precision, etc.).

"""
Method to generate accuracy score
:param y_true: Ground truth (correct) target values.
:param y_pred: Estimated targets as returned by a classifier.
:param eval_fn: Score evaluate function.
Method to compute accuracy scores.
:param y_true: Ground truth (correct) labels.
:param y_pred: Predicted labels.
:param eval_fn: Method to compute the score.
:returns: Computed score.
"""
if eval_fn == recall_score or eval_fn == precision_score:
return eval_fn(y_true, y_pred, average=self._eval_algorithm_config.multiclass_average_strategy)
# compute these metrics only on the subset of records that have a known predicted label
y_true_sub = y_true[y_pred != UNKNOWN_LABEL]
y_pred_sub = y_pred[y_pred != UNKNOWN_LABEL]

# picks the averaging strategy according to the number of classes
avg_strategy = (
self._eval_algorithm_config.binary_average_strategy
if len(self._valid_labels) == 2 # type: ignore
else self._eval_algorithm_config.multiclass_average_strategy
)
return eval_fn(
y_true_sub, y_pred_sub, average=avg_strategy, pos_label=self._eval_algorithm_config.positive_label
)
elif eval_fn == unknown_ratio:
return eval_fn(y_pred)
return eval_fn(y_true, y_pred)
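
In lieu of the requested unit tests, a quick standalone check of the masking that _get_score applies before calling sklearn, assuming pandas Series inputs and the defaults above (binary averaging, positive_label "1"); the values are made up for illustration.

import pandas as pd
from sklearn.metrics import precision_score, recall_score

y_true = pd.Series(["1", "1", "0", "0"])
y_pred = pd.Series(["1", "0", "0", "unknown"])

# Drop rows whose prediction could not be mapped to a valid label.
mask = y_pred != "unknown"
print(precision_score(y_true[mask], y_pred[mask], average="binary", pos_label="1"))  # 1.0
print(recall_score(y_true[mask], y_pred[mask], average="binary", pos_label="1"))     # 0.5

# The corresponding unknown_ratio is computed on the full predictions: 1/4 here.
print((y_pred == "unknown").sum() / y_pred.count())  # 0.25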

def evaluate_sample(self, target_output: str, model_output: str) -> List[EvalScore]: # type: ignore[override]
48 changes: 42 additions & 6 deletions test/unit/eval_algorithms/test_classification_accuracy.py
@@ -31,6 +31,7 @@
BALANCED_ACCURACY_SCORE,
PRECISION_SCORE,
RECALL_SCORE,
UNKNOWN_RATIO,
)
from ray.data import Dataset

@@ -77,6 +78,15 @@
EvalScore(name=BALANCED_ACCURACY_SCORE, value=1 / 2),
EvalScore(name=PRECISION_SCORE, value=2 / 3),
EvalScore(name=RECALL_SCORE, value=2 / 3),
EvalScore(name=UNKNOWN_RATIO, value=0.0),
]

DATASET_SCORES_WO_CONFIG = [
Contributor

Can you explain what this represents? I see that you replaced all instances of DATASET_SCORES with DATASET_SCORES_WO_CONFIG. Shouldn't we still have some tests that use DATASET_SCORES? Also, could you please come up with a case where the expected precision and recall scores are different?

EvalScore(name=CLASSIFICATION_ACCURACY_SCORE, value=2 / 3),
EvalScore(name=BALANCED_ACCURACY_SCORE, value=1 / 2),
EvalScore(name=PRECISION_SCORE, value=0),
EvalScore(name=RECALL_SCORE, value=0),
EvalScore(name=UNKNOWN_RATIO, value=1 / 3),
]

CATEGORY_SCORES = [
@@ -87,6 +97,7 @@
EvalScore(name=BALANCED_ACCURACY_SCORE, value=1.0),
EvalScore(name=PRECISION_SCORE, value=1.0),
EvalScore(name=RECALL_SCORE, value=1.0),
EvalScore(name=UNKNOWN_RATIO, value=0.0),
],
),
CategoryScore(
@@ -96,6 +107,31 @@
EvalScore(name=BALANCED_ACCURACY_SCORE, value=1 / 2),
EvalScore(name=PRECISION_SCORE, value=1 / 2),
EvalScore(name=RECALL_SCORE, value=1 / 2),
EvalScore(name=UNKNOWN_RATIO, value=0.0),
],
),
]


CATEGORY_SCORES_WO_CONFIG = [
CategoryScore(
name="brownie",
scores=[
EvalScore(name=CLASSIFICATION_ACCURACY_SCORE, value=1.0),
EvalScore(name=BALANCED_ACCURACY_SCORE, value=1.0),
EvalScore(name=PRECISION_SCORE, value=0),
EvalScore(name=RECALL_SCORE, value=0),
EvalScore(name=UNKNOWN_RATIO, value=0),
],
),
CategoryScore(
name="vanilla cake",
scores=[
EvalScore(name=CLASSIFICATION_ACCURACY_SCORE, value=1 / 2),
EvalScore(name=BALANCED_ACCURACY_SCORE, value=1 / 2),
EvalScore(name=PRECISION_SCORE, value=0),
EvalScore(name=RECALL_SCORE, value=0),
EvalScore(name=UNKNOWN_RATIO, value=1 / 2),
],
),
]
@@ -185,8 +221,8 @@ def test_classification_accuracy_evaluate_without_model(
eval_name="classification_accuracy",
dataset_name=WOMENS_CLOTHING_ECOMMERCE_REVIEWS,
prompt_template=BUILT_IN_DATASET_DEFAULT_PROMPT_TEMPLATES[WOMENS_CLOTHING_ECOMMERCE_REVIEWS],
dataset_scores=DATASET_SCORES,
category_scores=CATEGORY_SCORES,
dataset_scores=DATASET_SCORES_WO_CONFIG,
category_scores=CATEGORY_SCORES_WO_CONFIG,
output_path="/tmp/eval_results/classification_accuracy_womens_clothing_ecommerce_reviews.jsonl",
),
],
@@ -209,8 +245,8 @@ def test_classification_accuracy_evaluate_without_model(
eval_name="classification_accuracy",
dataset_name="my_custom_dataset",
prompt_template="Classify: $feature",
dataset_scores=DATASET_SCORES,
category_scores=CATEGORY_SCORES,
dataset_scores=DATASET_SCORES_WO_CONFIG,
category_scores=CATEGORY_SCORES_WO_CONFIG,
output_path="/tmp/eval_results/classification_accuracy_my_custom_dataset.jsonl",
)
],
@@ -233,7 +269,7 @@ def test_classification_accuracy_evaluate_without_model(
eval_name="classification_accuracy",
dataset_name="my_custom_dataset",
prompt_template="Classify: $feature",
dataset_scores=DATASET_SCORES,
dataset_scores=DATASET_SCORES_WO_CONFIG,
category_scores=None,
output_path="/tmp/eval_results/classification_accuracy_my_custom_dataset.jsonl",
)
@@ -257,7 +293,7 @@ def test_classification_accuracy_evaluate_without_model(
eval_name="classification_accuracy",
dataset_name="my_custom_dataset",
prompt_template=DEFAULT_PROMPT_TEMPLATE,
dataset_scores=DATASET_SCORES,
dataset_scores=DATASET_SCORES_WO_CONFIG,
category_scores=None,
output_path="/tmp/eval_results/classification_accuracy_my_custom_dataset.jsonl",
)