fix(pre-launch science review): changing config, default behaviour fo… #138
base: main
@@ -1,4 +1,5 @@
 import logging
+import re
 import warnings
 from dataclasses import dataclass
 from typing import Callable, Dict, List, Optional, Any
@@ -28,7 +29,6 @@
     DATASET_CONFIGS,
     CategoryScore,
     get_default_prompt_template,
-    WOMENS_CLOTHING_ECOMMERCE_REVIEWS,
 )
 from fmeval.eval_algorithms.util import (
     generate_prompt_column_for_dataset,
@@ -46,13 +46,27 @@
 BALANCED_ACCURACY_SCORE = "balanced_accuracy_score"
 PRECISION_SCORE = "precision_score"
 RECALL_SCORE = "recall_score"
+UNKNOWN_RATIO = "unknown_ratio"
+UNKNOWN_LABEL = "unknown"
 PROMPT_COLUMN_NAME = "prompt"
-CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME = "classified_model_output"
+PREDICTED_CLASS_COLUMN_NAME = "predicted_class"


+def unknown_ratio(y_pred) -> float:
Contributor
Please add type annotations.

Contributor
I think type annotations are tricky in these kinds of use cases where the argument is best described as a "1D array-like" (see sklearn docs for example). I suppose we could do something like …

Contributor
Perhaps rename this to …
+    """
+    Computes the ratio of predictions that do not contain any valid class label.
+
+    :param y_pred: Predicted class labels, array or pandas dataframe.
+    :return: The fraction of predicted labels that do not match any known class.
+    """
+    return (y_pred == UNKNOWN_LABEL).sum() / y_pred.count()


 CLASSIFICATION_ACCURACY_SCORES_TO_FUNCS: Dict[str, Callable[..., float]] = {
     BALANCED_ACCURACY_SCORE: balanced_accuracy_score,
     PRECISION_SCORE: precision_score,
     RECALL_SCORE: recall_score,
+    UNKNOWN_RATIO: unknown_ratio,
 }
 UNIQUENESS_FACTOR = 0.05
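For intuition, a minimal runnable sketch of `unknown_ratio` on a pandas `Series` (pandas as the input type is an assumption on our part; as the thread above notes, the PR leaves the parameter untyped):

```python
import pandas as pd

UNKNOWN_LABEL = "unknown"

def unknown_ratio(y_pred) -> float:
    # Fraction of predictions that could not be mapped to any valid label.
    return (y_pred == UNKNOWN_LABEL).sum() / y_pred.count()

preds = pd.Series(["positive", "unknown", "negative", "unknown"])
print(unknown_ratio(preds))  # 0.5
```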
@@ -73,7 +87,7 @@ def convert_model_output_to_label(model_output: str, valid_labels: List[str]) -> str:
     # normalise to lowercase & strip
     valid_labels = [label.lower().strip() for label in valid_labels]

-    response_words = model_output.split(" ")
+    response_words = re.split(r"[\n\s+]", model_output)
Contributor
This is not tested as far as I can tell, and I am in doubt about what it does and how it relates to batching. Can you add a unit test please?

Contributor (Author)
This is a standard regular expression. It matches newlines and whitespace.

Contributor
So we are anticipating new lines within a single answer? Are you imagining a case where the model replies something like this? I still think a unit test could be a good way to show what we are matching here, could just be one more test case in …
     predicted_labels = [word.lower().strip() for word in response_words if word.lower().strip() in valid_labels]
     # if there is more than one label in the model output we pick the first
     string_label = predicted_labels[0] if predicted_labels else UNKNOWN_LABEL
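In lieu of the unit test the reviewer asks for, here is a hedged sketch of what such a test case could look like, with the function reconstructed from the diff above rather than imported from the package:

```python
import re
from typing import List

UNKNOWN_LABEL = "unknown"

# Reconstructed from the diff for illustration; not the packaged function.
def convert_model_output_to_label(model_output: str, valid_labels: List[str]) -> str:
    valid_labels = [label.lower().strip() for label in valid_labels]
    response_words = re.split(r"[\n\s+]", model_output)
    predicted_labels = [w.lower().strip() for w in response_words if w.lower().strip() in valid_labels]
    return predicted_labels[0] if predicted_labels else UNKNOWN_LABEL

# Newlines now separate words, so a label on its own line is found:
assert convert_model_output_to_label("The answer is:\npositive", ["positive", "negative"]) == "positive"
# With str.split(" ") alone, "is:\npositive" stays one token and the label is missed:
assert "The answer is:\npositive".split(" ") == ["The", "answer", "is:\npositive"]
# Caveat: inside a character class, '+' is a literal, so the pattern also splits on plus signs.
assert re.split(r"[\n\s+]", "a+b") == ["a", "b"]
```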
@@ -88,15 +102,24 @@ class ClassificationAccuracyConfig(EvalAlgorithmConfig):

     :param valid_labels: The labels of the classes predicted from the model.
     :param converter_fn: Function to process model output to labels, defaults to simple integer conversion.
+    :param binary_average_strategy: `average` to be passed to sklearn's precision and recall scores.
+        This determines how scores are aggregated in the binary classification setting
+        (see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html).
+        Options are {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary'.
+    :param positive_label: The label that is considered to be the positive class when computing precision and recall
+        for binary classification problems when the averaging strategy is `binary`. This parameter has no effect
+        in the multiclass case. Default='1'.
     :param multiclass_average_strategy: `average` to be passed to sklearn's precision and recall scores.
-        This determines how scores are aggregated in the multiclass classification setting
+        This determines how scores are aggregated in multiclass classification settings
         (see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html).
-        Options are {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='micro'.
+        Options are {'micro', 'macro', 'samples', 'weighted'} or None, default='weighted'.
     """

     valid_labels: Optional[List[str]] = None
     converter_fn: Callable[[str, List[str]], str] = convert_model_output_to_label
-    multiclass_average_strategy: Optional[str] = "micro"
+    binary_average_strategy: Optional[str] = "binary"
Contributor
If you update this, you have to update …

Contributor
Do we need a variable for …

Contributor (Author)
@keerthanvasist ok, will do.

Contributor
I see. I doubt that others will be used much -- "binary" is very much the standard behavior as we discussed offline -- but if we can add this new parameter to the config without too much trouble it's of course more flexible.

Contributor
I very much agree with Pola; "binary" is pretty much the only option that makes sense here. I suppose giving the user freedom to choose other options is fine, since we've configured the default appropriately. If they choose something like "micro" and are confused why precision and recall are always the same, that's a self-inflicted customer error.
+    positive_label: str = "1"
Contributor
I am also concerned with this default. I am worried this won't play well with …

Contributor (Author)
Ok. What are you concerned about?

Contributor
I agree with Keerthan, …

Contributor (Author)
Yes, true, but the problem is that if we autodetect the valid labels, I don't think the order of the resulting list is going to be deterministic (especially when we do downsampling). So if we pick, say, the first, then we might pick a label depending on the random seed, which is also not that good. What I can do is to leave …

Contributor
I think so!
+    multiclass_average_strategy: Optional[str] = "weighted"

     def __post_init__(self):
         if self.valid_labels:
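To illustrate why the choice of averaging strategy and `pos_label` matters, a small hand-worked sklearn example (the data is invented for illustration, not taken from the PR's tests):

```python
from sklearn.metrics import precision_score

y_true = ["1", "0", "1", "1"]
y_pred = ["1", "1", "1", "0"]

# 'binary' scores only the positive class: 2 of the 3 predicted "1"s are correct.
print(precision_score(y_true, y_pred, average="binary", pos_label="1"))  # 0.666...
# 'weighted' averages per-class precision weighted by class support, mixing in class "0".
print(precision_score(y_true, y_pred, average="weighted"))  # 0.5
# 'micro' aggregates global TP/FP, so micro precision == micro recall == accuracy --
# the "precision and recall are always the same" confusion the last reviewer mentions.
print(precision_score(y_true, y_pred, average="micro"))  # 0.5
```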
@@ -138,7 +161,7 @@ def evaluate(
         EvalAlgorithmInterface.EVAL_RESULTS_PATH
     :param num_records: The number of records to be sampled randomly from the input dataset to perform the
         evaluation
-    :returns: List of EvalOutput objects. Current implementation returns only one score.
+    :returns: List of EvalOutput objects.
     """
     if dataset_config:
         dataset_configs = [dataset_config]
@@ -185,11 +208,11 @@ def _generate_columns(row: Dict[str, Any]) -> Dict[str, Any]:  # pragma: no cover
     Map function for generating classified model output and classification accuracy
     columns for dataset.
     """
-    row[CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME] = self._eval_algorithm_config.converter_fn(
+    row[PREDICTED_CLASS_COLUMN_NAME] = self._eval_algorithm_config.converter_fn(
         row[MODEL_OUTPUT_COLUMN_NAME], self._valid_labels  # type: ignore
     )
     row[CLASSIFICATION_ACCURACY_SCORE] = int(
-        row[CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME] == str(row[TARGET_OUTPUT_COLUMN_NAME])
+        row[PREDICTED_CLASS_COLUMN_NAME] == str(row[TARGET_OUTPUT_COLUMN_NAME])
     )
     return row
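For concreteness, a standalone sketch of the per-row mapping this hunk implements (the column-name constants and the toy converter below are stand-ins for the fmeval internals, not the library's actual API):

```python
from typing import Any, Callable, Dict, List

# Stand-in column names mirroring the constants in the diff above.
MODEL_OUTPUT_COLUMN_NAME = "model_output"
TARGET_OUTPUT_COLUMN_NAME = "target_output"
PREDICTED_CLASS_COLUMN_NAME = "predicted_class"
CLASSIFICATION_ACCURACY_SCORE = "classification_accuracy"

def generate_columns(
    row: Dict[str, Any], converter_fn: Callable[[str, List[str]], str], valid_labels: List[str]
) -> Dict[str, Any]:
    # Derive the predicted class from the raw model output, then score the row
    # with an exact string match against the target, as _generate_columns does.
    row[PREDICTED_CLASS_COLUMN_NAME] = converter_fn(row[MODEL_OUTPUT_COLUMN_NAME], valid_labels)
    row[CLASSIFICATION_ACCURACY_SCORE] = int(
        row[PREDICTED_CLASS_COLUMN_NAME] == str(row[TARGET_OUTPUT_COLUMN_NAME])
    )
    return row

# Toy converter: accept the output verbatim if it is a valid label.
toy_converter = lambda out, labels: out if out in labels else "unknown"
print(generate_columns({"model_output": "positive", "target_output": "positive"},
                       toy_converter, ["positive", "negative"]))
# {'model_output': 'positive', 'target_output': 'positive',
#  'predicted_class': 'positive', 'classification_accuracy': 1}
```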
@@ -208,7 +231,7 @@ def _generate_columns(row: Dict[str, Any]) -> Dict[str, Any]:  # pragma: no cover
                 value=self._get_score(
                     # TODO dataloader should ensure target output is string
                     y_true=df[TARGET_OUTPUT_COLUMN_NAME],
-                    y_pred=df[CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME],
+                    y_pred=df[PREDICTED_CLASS_COLUMN_NAME],
                     eval_fn=eval_fn,
                 ),
             )
@@ -232,7 +255,7 @@ def _generate_columns(row: Dict[str, Any]) -> Dict[str, Any]:  # pragma: no cover
             df[CATEGORY_COLUMN_NAME] == row[CATEGORY_COLUMN_NAME], TARGET_OUTPUT_COLUMN_NAME
         ]
         categorical_y_pred = df.loc[
-            df[CATEGORY_COLUMN_NAME] == row[CATEGORY_COLUMN_NAME], CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME
+            df[CATEGORY_COLUMN_NAME] == row[CATEGORY_COLUMN_NAME], PREDICTED_CLASS_COLUMN_NAME
         ]
         for eval_score, eval_fn in CLASSIFICATION_ACCURACY_SCORES_TO_FUNCS.items():
             category_scores[row[CATEGORY_COLUMN_NAME]].scores.append(
@@ -274,14 +297,28 @@ def _generate_columns(row: Dict[str, Any]) -> Dict[str, Any]:  # pragma: no cover

     def _get_score(self, y_true, y_pred, eval_fn: Callable[..., float]) -> float:
Contributor
Please add unit tests for …
         """
-        Method to generate accuracy score
-        :param y_true: Ground truth (correct) target values.
-        :param y_pred: Estimated targets as returned by a classifier.
-        :param eval_fn: Score evaluate function.
+        Method to compute accuracy scores
+        :param y_true: Ground truth (correct) labels.
+        :param y_pred: Predicted labels.
+        :param eval_fn: Method to compute the score.
         :returns: Computed score
         """
         if eval_fn == recall_score or eval_fn == precision_score:
-            return eval_fn(y_true, y_pred, average=self._eval_algorithm_config.multiclass_average_strategy)
+            # compute these metrics only on the subset of records that have a known predicted label
+            y_true_sub = y_true[y_pred != UNKNOWN_LABEL]
+            y_pred_sub = y_pred[y_pred != UNKNOWN_LABEL]
+
+            # picks the averaging strategy according to the number of classes
+            avg_strategy = (
+                self._eval_algorithm_config.binary_average_strategy
+                if len(self._valid_labels) == 2  # type: ignore
+                else self._eval_algorithm_config.multiclass_average_strategy
+            )
+            return eval_fn(
+                y_true_sub, y_pred_sub, average=avg_strategy, pos_label=self._eval_algorithm_config.positive_label
+            )
+        elif eval_fn == unknown_ratio:
+            return eval_fn(y_pred)
         return eval_fn(y_true, y_pred)
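Since the reviewer asks for unit tests here, a hedged sketch of what such a test could assert about the unknown-filtering path (standalone; the subsetting logic is inlined rather than calling the private method):

```python
import pandas as pd
from sklearn.metrics import precision_score, recall_score

UNKNOWN_LABEL = "unknown"

y_true = pd.Series(["1", "0", "1"])
y_pred = pd.Series(["1", "unknown", "0"])

# Mirror the subsetting above: drop rows whose prediction could not be mapped.
known = y_pred != UNKNOWN_LABEL
y_true_sub, y_pred_sub = y_true[known], y_pred[known]

# Two valid labels -> binary averaging with pos_label="1", as in the new default config.
assert precision_score(y_true_sub, y_pred_sub, average="binary", pos_label="1") == 1.0
assert recall_score(y_true_sub, y_pred_sub, average="binary", pos_label="1") == 0.5
# unknown_ratio is computed on the unfiltered predictions.
assert (y_pred == UNKNOWN_LABEL).sum() / y_pred.count() == 1 / 3
```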
     def evaluate_sample(self, target_output: str, model_output: str) -> List[EvalScore]:  # type: ignore[override]
@@ -31,6 +31,7 @@
     BALANCED_ACCURACY_SCORE,
     PRECISION_SCORE,
     RECALL_SCORE,
+    UNKNOWN_RATIO,
 )
 from ray.data import Dataset
@@ -77,6 +78,15 @@
     EvalScore(name=BALANCED_ACCURACY_SCORE, value=1 / 2),
     EvalScore(name=PRECISION_SCORE, value=2 / 3),
     EvalScore(name=RECALL_SCORE, value=2 / 3),
+    EvalScore(name=UNKNOWN_RATIO, value=0.0),
 ]

+DATASET_SCORES_WO_CONFIG = [
Contributor
Can you explain what this represents? I see that you replaced all instances of …
+    EvalScore(name=CLASSIFICATION_ACCURACY_SCORE, value=2 / 3),
+    EvalScore(name=BALANCED_ACCURACY_SCORE, value=1 / 2),
+    EvalScore(name=PRECISION_SCORE, value=0),
+    EvalScore(name=RECALL_SCORE, value=0),
+    EvalScore(name=UNKNOWN_RATIO, value=1 / 3),
+]

 CATEGORY_SCORES = [
@@ -87,6 +97,7 @@
             EvalScore(name=BALANCED_ACCURACY_SCORE, value=1.0),
             EvalScore(name=PRECISION_SCORE, value=1.0),
             EvalScore(name=RECALL_SCORE, value=1.0),
+            EvalScore(name=UNKNOWN_RATIO, value=0.0),
         ],
     ),
     CategoryScore(
@@ -96,6 +107,31 @@
             EvalScore(name=BALANCED_ACCURACY_SCORE, value=1 / 2),
             EvalScore(name=PRECISION_SCORE, value=1 / 2),
             EvalScore(name=RECALL_SCORE, value=1 / 2),
+            EvalScore(name=UNKNOWN_RATIO, value=0.0),
         ],
     ),
 ]

+CATEGORY_SCORES_WO_CONFIG = [
+    CategoryScore(
+        name="brownie",
+        scores=[
+            EvalScore(name=CLASSIFICATION_ACCURACY_SCORE, value=1.0),
+            EvalScore(name=BALANCED_ACCURACY_SCORE, value=1.0),
+            EvalScore(name=PRECISION_SCORE, value=0),
+            EvalScore(name=RECALL_SCORE, value=0),
+            EvalScore(name=UNKNOWN_RATIO, value=0),
+        ],
+    ),
+    CategoryScore(
+        name="vanilla cake",
+        scores=[
+            EvalScore(name=CLASSIFICATION_ACCURACY_SCORE, value=1 / 2),
+            EvalScore(name=BALANCED_ACCURACY_SCORE, value=1 / 2),
+            EvalScore(name=PRECISION_SCORE, value=0),
+            EvalScore(name=RECALL_SCORE, value=0),
+            EvalScore(name=UNKNOWN_RATIO, value=1 / 2),
+        ],
+    ),
+]
@@ -185,8 +221,8 @@ def test_classification_accuracy_evaluate_without_model(
             eval_name="classification_accuracy",
             dataset_name=WOMENS_CLOTHING_ECOMMERCE_REVIEWS,
             prompt_template=BUILT_IN_DATASET_DEFAULT_PROMPT_TEMPLATES[WOMENS_CLOTHING_ECOMMERCE_REVIEWS],
-            dataset_scores=DATASET_SCORES,
-            category_scores=CATEGORY_SCORES,
+            dataset_scores=DATASET_SCORES_WO_CONFIG,
+            category_scores=CATEGORY_SCORES_WO_CONFIG,
             output_path="/tmp/eval_results/classification_accuracy_womens_clothing_ecommerce_reviews.jsonl",
         ),
     ],
@@ -209,8 +245,8 @@ def test_classification_accuracy_evaluate_without_model(
             eval_name="classification_accuracy",
             dataset_name="my_custom_dataset",
             prompt_template="Classify: $feature",
-            dataset_scores=DATASET_SCORES,
-            category_scores=CATEGORY_SCORES,
+            dataset_scores=DATASET_SCORES_WO_CONFIG,
+            category_scores=CATEGORY_SCORES_WO_CONFIG,
             output_path="/tmp/eval_results/classification_accuracy_my_custom_dataset.jsonl",
         )
     ],
@@ -233,7 +269,7 @@ def test_classification_accuracy_evaluate_without_model(
             eval_name="classification_accuracy",
             dataset_name="my_custom_dataset",
             prompt_template="Classify: $feature",
-            dataset_scores=DATASET_SCORES,
+            dataset_scores=DATASET_SCORES_WO_CONFIG,
             category_scores=None,
             output_path="/tmp/eval_results/classification_accuracy_my_custom_dataset.jsonl",
         )
@@ -257,7 +293,7 @@ def test_classification_accuracy_evaluate_without_model(
             eval_name="classification_accuracy",
             dataset_name="my_custom_dataset",
             prompt_template=DEFAULT_PROMPT_TEMPLATE,
-            dataset_scores=DATASET_SCORES,
+            dataset_scores=DATASET_SCORES_WO_CONFIG,
             category_scores=None,
             output_path="/tmp/eval_results/classification_accuracy_my_custom_dataset.jsonl",
         )
We can't add new scores right now. We are very close to launch; this can break integration tests for us and clients. Can we please stick to bug fixes and refactoring only? New scores should be added post-launch.

Sure, we can add this one later on.