diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py
index 2db4abb..7f2ae94 100644
--- a/cobra/model_building/univariate_selection.py
+++ b/cobra/model_building/univariate_selection.py
@@ -1,19 +1,20 @@
 import pandas as pd
 from sklearn.metrics import roc_auc_score, mean_squared_error
-from numpy import sqrt
 
 import cobra.utils as utils
 
 
-def compute_univariate_preselection(target_enc_train_data: pd.DataFrame,
-                                    target_enc_selection_data: pd.DataFrame,
-                                    predictors: list,
-                                    target_column: str,
-                                    model_type: str = "classification",
-                                    preselect_auc_threshold: float = 0.053,
-                                    preselect_rmse_threshold: float = 5,
-                                    preselect_overtrain_threshold: float = 0.05
-                                    ) -> pd.DataFrame:
+
+def compute_univariate_preselection(
+    target_enc_train_data: pd.DataFrame,
+    target_enc_selection_data: pd.DataFrame,
+    predictors: list,
+    target_column: str,
+    model_type: str = "classification",
+    preselect_auc_threshold: float = 0.053,
+    preselect_rmse_threshold: float = 5,
+    preselect_overtrain_threshold: float = 0.05
+) -> pd.DataFrame:
     """Perform a preselection of predictors based on an AUC (in case of
     classification) or a RMSE (in case of regression) threshold of a
     univariate model on a train and selection dataset and return a DataFrame
@@ -64,69 +65,81 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame,
     """
     result = []
 
+    # TODO: Change this to `if is_error_metric` or similar
     if model_type == "classification":
-        for predictor in predictors:
-
-            cleaned_predictor = utils.clean_predictor_name(predictor)
-
-            auc_train = roc_auc_score(
-                y_true=target_enc_train_data[target_column],
-                y_score=target_enc_train_data[predictor])
-
-            auc_selection = roc_auc_score(
-                y_true=target_enc_selection_data[target_column],
-                y_score=target_enc_selection_data[predictor])
-
-            result.append({"predictor": cleaned_predictor,
-                           "AUC train": auc_train,
-                           "AUC selection": auc_selection})
-
-        df_auc = pd.DataFrame(result)
-
-        # Filter based on min. AUC
-        auc_thresh = df_auc.loc[:, "AUC selection"] > preselect_auc_threshold
-
-        # Identify those variables for which the AUC difference between train
-        # and selection is within a user-defined ratio
-        auc_overtrain = ((df_auc["AUC train"] - df_auc["AUC selection"])
-                         < preselect_overtrain_threshold)
-
-        df_auc["preselection"] = auc_thresh & auc_overtrain
-
-        df_out = df_auc.sort_values(by="AUC selection", ascending=False).reset_index(drop=True)
-
-    elif model_type == "regression":
-        for predictor in predictors:
-            cleaned_predictor = utils.clean_predictor_name(predictor)
-
-            rmse_train = sqrt(mean_squared_error(
-                y_true=target_enc_train_data[target_column],
-                y_pred=target_enc_train_data[predictor]))
-
-            rmse_selection = sqrt(mean_squared_error(
-                y_true=target_enc_selection_data[target_column],
-                y_pred=target_enc_selection_data[predictor]))
-
-            result.append({"predictor": cleaned_predictor,
-                           "RMSE train": rmse_train,
-                           "RMSE selection": rmse_selection})
+        scoring_method = roc_auc_score
+        kwargs = {}
+        scoring_method_str = "AUC"
+    else:
+        scoring_method = mean_squared_error
+        kwargs = {"squared": False}
+        scoring_method_str = "RMSE"
+
+    for predictor in predictors:
+        cleaned_predictor = utils.clean_predictor_name(predictor)
+
+        score_train = scoring_method(
+            target_enc_train_data[target_column],
+            target_enc_train_data[predictor],
+            **kwargs
+        )
+
+        score_selection = scoring_method(
+            target_enc_selection_data[target_column],
+            target_enc_selection_data[predictor],
+            **kwargs
+        )
+
+        result.append(
+            {
+                "predictor": cleaned_predictor,
+                f"{scoring_method_str} train": score_train,
+                f"{scoring_method_str} selection": score_selection
+            }
+        )
+
+    df_score = pd.DataFrame(result)
+
+    # TODO: This should be `if error_metric` instead of classification vs regression
+    # This opens the door to customised scoring methods
+    if model_type == "classification":
+        df_out = filter_preselection_score_based(df_score, preselect_auc_threshold, preselect_overtrain_threshold, scoring_method_str)
+    else:
+        # What if they pass in something other than `regression`?
+        df_out = filter_preselection_error_based(df_score, preselect_rmse_threshold, preselect_overtrain_threshold, scoring_method_str)
+    return df_out
 
-        df_rmse = pd.DataFrame(result)
-        # Filter based on max. RMSE
-        rmse_thresh = df_rmse.loc[:, "RMSE selection"] < preselect_rmse_threshold
 
+def filter_preselection_error_based(df: pd.DataFrame, preselect_threshold: float, preselect_overtrain: float, scoring_method: str) -> pd.DataFrame:
+    """Filter the dataframe based on the given thresholds for error-based metrics."""
+    score_thresh = df.loc[:, f"{scoring_method} selection"] < preselect_threshold
 
-        # Identify those variables for which the RMSE difference between train
-        # and selection is within a user-defined ratio
-        rmse_overtrain = ((df_rmse["RMSE selection"] - df_rmse["RMSE train"])  # flip subtraction vs. AUC
-                          < preselect_overtrain_threshold)
+    # Identify those variables for which the error metric difference between train
+    # and selection is within a user-defined ratio
+    score_overtrain = (
+        (df[f"{scoring_method} selection"] - df[f"{scoring_method} train"])
+        < preselect_overtrain
+    )
+    df["preselection"] = score_thresh & score_overtrain
+    df_out = df.sort_values(by=f"{scoring_method} selection", ascending=True).reset_index(drop=True)
+    return df_out
 
-        df_rmse["preselection"] = rmse_thresh & rmse_overtrain
-        df_out = df_rmse.sort_values(by="RMSE selection", ascending=True).reset_index(drop=True)  # lower is better
 
+def filter_preselection_score_based(df: pd.DataFrame, preselect_threshold: float, preselect_overtrain: float, scoring_method: str) -> pd.DataFrame:
+    """Filter the dataframe based on the given thresholds for scoring-based metrics."""
+    score_thresh = df.loc[:, f"{scoring_method} selection"] > preselect_threshold
+    # Identify those variables for which the score difference between train
+    # and selection is within a user-defined ratio
+    score_overtrain = (
+        (df[f"{scoring_method} train"] - df[f"{scoring_method} selection"])
+        < preselect_overtrain
+    )
+    df["preselection"] = score_thresh & score_overtrain
+    df_out = df.sort_values(by=f"{scoring_method} selection", ascending=False).reset_index(drop=True)
     return df_out
 
+
 def get_preselected_predictors(df_metric: pd.DataFrame) -> list:
     """Wrapper function to extract a list of predictors from df_metric.
@@ -154,6 +167,7 @@ def get_preselected_predictors(df_metric: pd.DataFrame) -> list:
 
     return [col + "_enc" for col in predictor_list]
 
+
 def compute_correlations(target_enc_train_data: pd.DataFrame,
                          predictors: list) -> pd.DataFrame:
     """Given a DataFrame and a list of predictors, compute the correlations
diff --git a/tests/model_building/test_univariate_selection.py b/tests/model_building/test_univariate_selection.py
index c69a4de..f119ec5 100644
--- a/tests/model_building/test_univariate_selection.py
+++ b/tests/model_building/test_univariate_selection.py
@@ -1,8 +1,11 @@
 import pandas as pd
+import pytest
 
 from cobra.model_building import univariate_selection
 
 
+
+@pytest.fixture
 def mock_data():
     return pd.DataFrame({"var1_enc": [0.42] * 10,
                          "var2_enc": [0.94] * 10,
@@ -10,9 +13,8 @@
 
 class TestUnivariateSelection:
 
-    def test_preselection_classification(self):
-
-        X = mock_data()
+    def test_preselection_classification(self, mock_data: pd.DataFrame):
+        X = mock_data
         y = pd.DataFrame([1] * 5 + [0] * 5, columns=["target"])
 
         basetable = pd.concat([y, X], axis=1)
@@ -29,14 +31,11 @@ def test_preselection_classification(self):
         assert all(c in df_auc.columns
                    for c in ["AUC train", "AUC selection"])
 
-        preselected_predictors = (univariate_selection
-                                  .get_preselected_predictors(df_auc))
-
+        preselected_predictors = univariate_selection.get_preselected_predictors(df_auc)
         assert preselected_predictors == ["var1_enc", "var2_enc", "var3_enc"]
 
-    def test_preselection_regression(self):
-
-        X = mock_data()
+    def test_preselection_regression(self, mock_data: pd.DataFrame):
+        X = mock_data
         y = pd.DataFrame([6.0, 9.0, 4.2, 5.5, 0.7, 1.9, 8.7, 8.0, 2.0, 7.2], columns=["target"])
 
         basetable = pd.concat([y, X], axis=1)
@@ -53,7 +52,87 @@ def test_preselection_regression(self):
         assert all(c in df_rmse.columns
                    for c in ["RMSE train", "RMSE selection"])
 
-        preselected_predictors = (univariate_selection
-                                  .get_preselected_predictors(df_rmse))
-
+        preselected_predictors = univariate_selection.get_preselected_predictors(df_rmse)
         assert preselected_predictors == ["var2_enc", "var3_enc"]
+
+    def test_filter_preselection_error_based(self):
+        """Test filtering preselection data for an error-based metric."""
+        test_input = pd.DataFrame(
+            [
+                [0.1, 0.1],
+                [0.2, 0.2],
+                [0.3, 0.6],
+                [0.4, 0.4],
+                [0.5, 0.5],
+                [0.6, 0.6],
+                [0.7, 0.7],
+                [0.8, 0.8],
+                [0.9, 0.9],
+                [1.0, 1.0],
+            ],
+            columns=["RMSE train", "RMSE selection"]
+        )
+        result = univariate_selection.filter_preselection_error_based(
+            test_input,
+            preselect_threshold=0.65,
+            preselect_overtrain=0.2,
+            scoring_method="RMSE"
+        )
+
+        target = pd.DataFrame(
+            [
+                [0.1, 0.1, True],
+                [0.2, 0.2, True],
+                [0.4, 0.4, True],
+                [0.5, 0.5, True],
+                [0.3, 0.6, False],
+                [0.6, 0.6, True],
+                [0.7, 0.7, False],
+                [0.8, 0.8, False],
+                [0.9, 0.9, False],
+                [1.0, 1.0, False],
+            ],
+            columns=["RMSE train", "RMSE selection", "preselection"]
+        )
+        assert target.equals(result)
+
+    def test_filter_preselection_score_based(self):
+        """Test filtering preselection data for a score-based metric."""
+        test_input = pd.DataFrame(
+            [
+                [0.1, 0.1],
+                [0.2, 0.2],
+                [0.3, 0.6],
+                [0.4, 0.4],
+                [0.5, 0.5],
+                [0.6, 0.6],
+                [0.7, 0.7],
+                [0.8, 0.8],
+                [0.9, 0.9],
+                [1.0, 0.7],
+            ],
+            columns=["AUC train", "AUC selection"]
+        )
+        result = univariate_selection.filter_preselection_score_based(
+            test_input,
+            preselect_threshold=0.65,
+            preselect_overtrain=0.2,
+            scoring_method="AUC"
+        )
+
+        target = pd.DataFrame(
+            [
+                [0.9, 0.9, True],
+                [0.8, 0.8, True],
+                [0.7, 0.7, True],
+                [1.0, 0.7, False],
+                [0.3, 0.6, False],
+                [0.6, 0.6, False],
+                [0.5, 0.5, False],
+                [0.4, 0.4, False],
+                [0.2, 0.2, False],
+                [0.1, 0.1, False],
+            ],
+            columns=["AUC train", "AUC selection", "preselection"]
+        )
+        assert target.equals(result)
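
Reviewer note (appended after the patch, not part of the diff): the sketch below is a minimal way to exercise the refactored preselection flow on this branch. The module path, function names, and keyword arguments come from the diff above; the toy basetable, its column values, and the reuse of one frame as both train and selection set are illustrative assumptions only.

# Usage sketch for the refactored preselection flow (illustrative data only).
import pandas as pd

from cobra.model_building import univariate_selection

# Toy basetable: a binary target plus two target-encoded predictors with the
# "_enc" suffix used in the tests above. Values are made up for this sketch.
basetable = pd.DataFrame({
    "target":   [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
    "var1_enc": [0.8, 0.2, 0.7, 0.3, 0.9, 0.1, 0.8, 0.2, 0.7, 0.3],
    "var2_enc": [0.6, 0.4, 0.5, 0.5, 0.7, 0.3, 0.6, 0.4, 0.5, 0.5],
})

# Reusing the same frame as train and selection set keeps the sketch short;
# in practice these are separate splits.
df_auc = univariate_selection.compute_univariate_preselection(
    target_enc_train_data=basetable,
    target_enc_selection_data=basetable,
    predictors=["var1_enc", "var2_enc"],
    target_column="target",
    model_type="classification",      # "regression" takes the RMSE/error-based path
    preselect_auc_threshold=0.053,
    preselect_overtrain_threshold=0.05,
)

# df_auc carries "AUC train", "AUC selection" and the boolean "preselection"
# column, sorted by "AUC selection" (descending for the score-based metric).
print(df_auc)
print(univariate_selection.get_preselected_predictors(df_auc))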