142 changes: 78 additions & 64 deletions cobra/model_building/univariate_selection.py
@@ -1,19 +1,20 @@

import pandas as pd
from sklearn.metrics import roc_auc_score, mean_squared_error
from numpy import sqrt

import cobra.utils as utils

def compute_univariate_preselection(target_enc_train_data: pd.DataFrame,
target_enc_selection_data: pd.DataFrame,
predictors: list,
target_column: str,
model_type: str = "classification",
preselect_auc_threshold: float = 0.053,
preselect_rmse_threshold: float = 5,
preselect_overtrain_threshold: float = 0.05
) -> pd.DataFrame:

def compute_univariate_preselection(
target_enc_train_data: pd.DataFrame,
target_enc_selection_data: pd.DataFrame,
predictors: list,
target_column: str,
model_type: str = "classification",
preselect_auc_threshold: float = 0.053,
preselect_rmse_threshold: float = 5,
preselect_overtrain_threshold: float = 0.05
) -> pd.DataFrame:
"""Perform a preselection of predictors based on an AUC (in case of
classification) or a RMSE (in case of regression) threshold of
a univariate model on a train and selection dataset and return a DataFrame
@@ -64,69 +65,81 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame,
"""
result = []

# TODO: Change this to `if is_error_metric` or similar
if model_type == "classification":
for predictor in predictors:

cleaned_predictor = utils.clean_predictor_name(predictor)

auc_train = roc_auc_score(
y_true=target_enc_train_data[target_column],
y_score=target_enc_train_data[predictor])

auc_selection = roc_auc_score(
y_true=target_enc_selection_data[target_column],
y_score=target_enc_selection_data[predictor])

result.append({"predictor": cleaned_predictor,
"AUC train": auc_train,
"AUC selection": auc_selection})

df_auc = pd.DataFrame(result)

# Filter based on min. AUC
auc_thresh = df_auc.loc[:, "AUC selection"] > preselect_auc_threshold

# Identify those variables for which the AUC difference between train
# and selection is within a user-defined ratio
auc_overtrain = ((df_auc["AUC train"] - df_auc["AUC selection"])
< preselect_overtrain_threshold)

df_auc["preselection"] = auc_thresh & auc_overtrain

df_out = df_auc.sort_values(by="AUC selection", ascending=False).reset_index(drop=True)

elif model_type == "regression":
for predictor in predictors:
cleaned_predictor = utils.clean_predictor_name(predictor)

rmse_train = sqrt(mean_squared_error(
y_true=target_enc_train_data[target_column],
y_pred=target_enc_train_data[predictor]))

rmse_selection = sqrt(mean_squared_error(
y_true=target_enc_selection_data[target_column],
y_pred=target_enc_selection_data[predictor]))

result.append({"predictor": cleaned_predictor,
"RMSE train": rmse_train,
"RMSE selection": rmse_selection})
scoring_method = roc_auc_score
kwargs = {}
scoring_method_str = "AUC"
else:
scoring_method = mean_squared_error
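        # squared=False makes mean_squared_error return the RMSE rather than the MSE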
kwargs = {"squared": False}
scoring_method_str = "RMSE"

for predictor in predictors:
cleaned_predictor = utils.clean_predictor_name(predictor)

score_train = scoring_method(
target_enc_train_data[target_column],
target_enc_train_data[predictor],
**kwargs
)

score_selection = scoring_method(
target_enc_selection_data[target_column],
target_enc_selection_data[predictor],
**kwargs
)

result.append(
{
"predictor": cleaned_predictor,
f"{scoring_method_str} train": score_train,
f"{scoring_method_str} selection": score_selection
}
)

df_score = pd.DataFrame(result)

    # TODO: This should be `if error_metric` instead of classification vs. regression.
    # This opens the door to customised scoring methods.
    if model_type == "classification":
        df_out = filter_preselection_score_based(
            df_score, preselect_auc_threshold, preselect_overtrain_threshold, scoring_method_str)
    else:
        # What if the caller passes something other than `regression`?
        df_out = filter_preselection_error_based(
            df_score, preselect_rmse_threshold, preselect_overtrain_threshold, scoring_method_str)
return df_out

df_rmse = pd.DataFrame(result)

# Filter based on max. RMSE
rmse_thresh = df_rmse.loc[:, "RMSE selection"] < preselect_rmse_threshold
def filter_preselection_error_based(
    df: pd.DataFrame,
    preselect_threshold: float,
    preselect_overtrain: float,
    scoring_method: str
) -> pd.DataFrame:
    """Filter the DataFrame based on the given thresholds for error-based metrics."""
score_thresh = df.loc[:, f"{scoring_method} selection"] < preselect_threshold

# Identify those variables for which the RMSE difference between train
# and selection is within a user-defined ratio
rmse_overtrain = ((df_rmse["RMSE selection"] - df_rmse["RMSE train"]) # flip subtraction vs. AUC
< preselect_overtrain_threshold)
    # Identify those variables for which the error-metric difference between
    # train and selection is within a user-defined threshold
score_overtrain = (
(df[f"{scoring_method} selection"] - df[f"{scoring_method} train"])
< preselect_overtrain
)
df["preselection"] = score_thresh & score_overtrain
df_out = df.sort_values(by=f"{scoring_method} selection", ascending=True).reset_index(drop=True)
return df_out

df_rmse["preselection"] = rmse_thresh & rmse_overtrain

df_out = df_rmse.sort_values(by="RMSE selection", ascending=True).reset_index(drop=True) # lower is better
def filter_preselection_score_based(
    df: pd.DataFrame,
    preselect_threshold: float,
    preselect_overtrain: float,
    scoring_method: str
) -> pd.DataFrame:
    """Filter the DataFrame based on the given thresholds for score-based metrics."""
score_thresh = df.loc[:, f"{scoring_method} selection"] > preselect_threshold

    # Identify those variables for which the score difference between
    # train and selection is within a user-defined threshold
score_overtrain = (
(df[f"{scoring_method} train"] - df[f"{scoring_method} selection"])
< preselect_overtrain
)
df["preselection"] = score_thresh & score_overtrain
df_out = df.sort_values(by=f"{scoring_method} selection", ascending=False).reset_index(drop=True)
return df_out
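
The TODO above suggests dispatching on metric direction rather than on model type. As a minimal sketch of that idea, the two helpers could sit behind a single wrapper; the `is_error_metric` flag and this wrapper are hypothetical and not part of this PR:

def filter_preselection(df: pd.DataFrame,
                        preselect_threshold: float,
                        preselect_overtrain: float,
                        scoring_method: str,
                        is_error_metric: bool) -> pd.DataFrame:
    """Dispatch to the error- or score-based filter, depending on whether
    lower (error) or higher (score) metric values are better."""
    if is_error_metric:
        return filter_preselection_error_based(
            df, preselect_threshold, preselect_overtrain, scoring_method)
    return filter_preselection_score_based(
        df, preselect_threshold, preselect_overtrain, scoring_method)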


def get_preselected_predictors(df_metric: pd.DataFrame) -> list:
"""Wrapper function to extract a list of predictors from df_metric.

@@ -154,6 +167,7 @@ def get_preselected_predictors(df_metric: pd.DataFrame) -> list:

return [col + "_enc" for col in predictor_list]


def compute_correlations(target_enc_train_data: pd.DataFrame,
predictors: list) -> pd.DataFrame:
"""Given a DataFrame and a list of predictors, compute the correlations
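For context, here is a minimal usage sketch of the refactored preselection flow above; the column names and data are hypothetical, not taken from this PR:

import pandas as pd
from cobra.model_building import univariate_selection

# Hypothetical target-encoded input; a real pipeline produces the *_enc columns upstream.
train = pd.DataFrame({"target": [1, 0, 1, 0],
                      "age_enc": [0.8, 0.3, 0.7, 0.4]})
selection = pd.DataFrame({"target": [1, 0, 1, 0],
                          "age_enc": [0.75, 0.35, 0.65, 0.45]})

df_auc = univariate_selection.compute_univariate_preselection(
    target_enc_train_data=train,
    target_enc_selection_data=selection,
    predictors=["age_enc"],
    target_column="target",
    model_type="classification")

# Keeps only predictors that passed both thresholds, e.g. ["age_enc"].
preselected = univariate_selection.get_preselected_predictors(df_auc)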
103 changes: 91 additions & 12 deletions tests/model_building/test_univariate_selection.py
@@ -1,18 +1,20 @@

import pandas as pd
import pytest

from cobra.model_building import univariate_selection


@pytest.fixture
def mock_data():
return pd.DataFrame({"var1_enc": [0.42] * 10,
"var2_enc": [0.94] * 10,
"var3_enc": [0.87] * 10})

class TestUnivariateSelection:

def test_preselection_classification(self):

X = mock_data()
def test_preselection_classification(self, mock_data: pd.DataFrame):
X = mock_data
y = pd.DataFrame([1] * 5 + [0] * 5, columns=["target"])

basetable = pd.concat([y, X], axis=1)
@@ -29,14 +31,11 @@ def test_preselection_classification(self):

assert all(c in df_auc.columns for c in ["AUC train", "AUC selection"])

preselected_predictors = (univariate_selection
.get_preselected_predictors(df_auc))

preselected_predictors = univariate_selection.get_preselected_predictors(df_auc)
assert preselected_predictors == ["var1_enc", "var2_enc", "var3_enc"]

def test_preselection_regression(self):

X = mock_data()
def test_preselection_regression(self, mock_data: pd.DataFrame):
X = mock_data
y = pd.DataFrame([6.0, 9.0, 4.2, 5.5, 0.7, 1.9, 8.7, 8.0, 2.0, 7.2], columns=["target"])

basetable = pd.concat([y, X], axis=1)
@@ -53,7 +52,87 @@ def test_preselection_regression(self):

assert all(c in df_rmse.columns for c in ["RMSE train", "RMSE selection"])

preselected_predictors = (univariate_selection
.get_preselected_predictors(df_rmse))

preselected_predictors = univariate_selection.get_preselected_predictors(df_rmse)
assert preselected_predictors == ["var2_enc", "var3_enc"]

def test_filter_preselection_error_based(self):
"""Test filtering preselection data for an error-based metric."""
test_input = pd.DataFrame(
[
[0.1, 0.1],
[0.2, 0.2],
[0.3, 0.6],
[0.4, 0.4],
[0.5, 0.5],
[0.6, 0.6],
[0.7, 0.7],
[0.8, 0.8],
[0.9, 0.9],
[1.0, 1.0],
],
columns=["RMSE train", "RMSE selection"]
)
result = univariate_selection.filter_preselection_error_based(
test_input,
preselect_threshold=0.65,
preselect_overtrain=0.2,
scoring_method="RMSE"
)

target = pd.DataFrame(
[
[0.1, 0.1, True],
[0.2, 0.2, True],
[0.4, 0.4, True],
[0.5, 0.5, True],
[0.3, 0.6, False],
[0.6, 0.6, True],
[0.7, 0.7, False],
[0.8, 0.8, False],
[0.9, 0.9, False],
[1.0, 1.0, False],
],
columns=["RMSE train", "RMSE selection", "preselection"]
)
assert target.equals(result)

def test_filter_preselection_score_based(self):
"""Test filtering preselection data for a score-based metric."""
test_input = pd.DataFrame(
[
[0.1, 0.1],
[0.2, 0.2],
[0.3, 0.6],
[0.4, 0.4],
[0.5, 0.5],
[0.6, 0.6],
[0.7, 0.7],
[0.8, 0.8],
[0.9, 0.9],
[1.0, 0.7],
],
columns=["AUC train", "AUC selection"]
)
result = univariate_selection.filter_preselection_score_based(
test_input,
preselect_threshold=0.65,
preselect_overtrain=0.2,
scoring_method="AUC"
)

target = pd.DataFrame(
[
[0.9, 0.9, True],
[0.8, 0.8, True],
[0.7, 0.7, True],
[1.0, 0.7, False],
[0.3, 0.6, False],
[0.6, 0.6, False],
[0.5, 0.5, False],
[0.4, 0.4, False],
[0.2, 0.2, False],
[0.1, 0.1, False],
],
columns=["AUC train", "AUC selection", "preselection"]
)
assert target.equals(result)
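
The new helper tests can be run in isolation with, for example:

pytest tests/model_building/test_univariate_selection.py -k "filter_preselection"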