142 changes: 78 additions & 64 deletions cobra/model_building/univariate_selection.py
@@ -1,19 +1,20 @@

import pandas as pd
from sklearn.metrics import roc_auc_score, mean_squared_error
from numpy import sqrt

import cobra.utils as utils

def compute_univariate_preselection(target_enc_train_data: pd.DataFrame,
target_enc_selection_data: pd.DataFrame,
predictors: list,
target_column: str,
model_type: str = "classification",
preselect_auc_threshold: float = 0.053,
preselect_rmse_threshold: float = 5,
preselect_overtrain_threshold: float = 0.05
) -> pd.DataFrame:

def compute_univariate_preselection(
target_enc_train_data: pd.DataFrame,
target_enc_selection_data: pd.DataFrame,
predictors: list,
target_column: str,
model_type: str = "classification",
preselect_auc_threshold: float = 0.053,
preselect_rmse_threshold: float = 5,
preselect_overtrain_threshold: float = 0.05
) -> pd.DataFrame:
"""Perform a preselection of predictors based on an AUC (in case of
classification) or a RMSE (in case of regression) threshold of
a univariate model on a train and selection dataset and return a DataFrame
@@ -64,69 +65,81 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame,
"""
result = []

# TODO: Change this to `if is_error_metric` or similar
if model_type == "classification":
for predictor in predictors:

cleaned_predictor = utils.clean_predictor_name(predictor)

auc_train = roc_auc_score(
y_true=target_enc_train_data[target_column],
y_score=target_enc_train_data[predictor])

auc_selection = roc_auc_score(
y_true=target_enc_selection_data[target_column],
y_score=target_enc_selection_data[predictor])

result.append({"predictor": cleaned_predictor,
"AUC train": auc_train,
"AUC selection": auc_selection})

df_auc = pd.DataFrame(result)

# Filter based on min. AUC
auc_thresh = df_auc.loc[:, "AUC selection"] > preselect_auc_threshold

# Identify those variables for which the AUC difference between train
# and selection is within a user-defined ratio
auc_overtrain = ((df_auc["AUC train"] - df_auc["AUC selection"])
< preselect_overtrain_threshold)

df_auc["preselection"] = auc_thresh & auc_overtrain

df_out = df_auc.sort_values(by="AUC selection", ascending=False).reset_index(drop=True)

elif model_type == "regression":
for predictor in predictors:
cleaned_predictor = utils.clean_predictor_name(predictor)

rmse_train = sqrt(mean_squared_error(
y_true=target_enc_train_data[target_column],
y_pred=target_enc_train_data[predictor]))

rmse_selection = sqrt(mean_squared_error(
y_true=target_enc_selection_data[target_column],
y_pred=target_enc_selection_data[predictor]))

result.append({"predictor": cleaned_predictor,
"RMSE train": rmse_train,
"RMSE selection": rmse_selection})
scoring_method = roc_auc_score
kwargs = {}
scoring_method_str = "AUC"
else:
scoring_method = mean_squared_error
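        # squared=False makes mean_squared_error return the RMSE rather than the MSE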
kwargs = {"squared": False}
scoring_method_str = "RMSE"

for predictor in predictors:
cleaned_predictor = utils.clean_predictor_name(predictor)

score_train = scoring_method(
target_enc_train_data[target_column],
target_enc_train_data[predictor],
**kwargs
)

score_selection = scoring_method(
target_enc_selection_data[target_column],
target_enc_selection_data[predictor],
**kwargs
)

result.append(
{
"predictor": cleaned_predictor,
f"{scoring_method_str} train": score_train,
f"{scoring_method_str} selection": score_selection
}
)

df_score = pd.DataFrame(result)

    # TODO: This should be `if error_metric` instead of classification vs. regression.
    # This opens the door to customised scoring methods.
    if model_type == "classification":
        df_out = filter_preselection_score_based(
            df_score, preselect_auc_threshold, preselect_overtrain_threshold, scoring_method_str)
    else:
        # What if the caller passes something other than `regression`?
        df_out = filter_preselection_error_based(
            df_score, preselect_rmse_threshold, preselect_overtrain_threshold, scoring_method_str)
return df_out

df_rmse = pd.DataFrame(result)

# Filter based on max. RMSE
rmse_thresh = df_rmse.loc[:, "RMSE selection"] < preselect_rmse_threshold
def filter_preselection_error_based(
    df: pd.DataFrame,
    preselect_threshold: float,
    preselect_overtrain: float,
    scoring_method: str
) -> pd.DataFrame:
    """Filter the DataFrame based on the given thresholds for error-based metrics."""
score_thresh = df.loc[:, f"{scoring_method} selection"] < preselect_threshold

# Identify those variables for which the RMSE difference between train
# and selection is within a user-defined ratio
rmse_overtrain = ((df_rmse["RMSE selection"] - df_rmse["RMSE train"]) # flip subtraction vs. AUC
< preselect_overtrain_threshold)
    # Identify those variables for which the error-metric difference between
    # train and selection is within a user-defined threshold
score_overtrain = (
(df[f"{scoring_method} selection"] - df[f"{scoring_method} train"])
< preselect_overtrain
)
df["preselection"] = score_thresh & score_overtrain
df_out = df.sort_values(by=f"{scoring_method} selection", ascending=True).reset_index(drop=True)
return df_out

df_rmse["preselection"] = rmse_thresh & rmse_overtrain

df_out = df_rmse.sort_values(by="RMSE selection", ascending=True).reset_index(drop=True) # lower is better
def filter_preselection_score_based(
    df: pd.DataFrame,
    preselect_threshold: float,
    preselect_overtrain: float,
    scoring_method: str
) -> pd.DataFrame:
    """Filter the DataFrame based on the given thresholds for score-based metrics."""
score_thresh = df.loc[:, f"{scoring_method} selection"] > preselect_threshold

    # Identify those variables for which the score difference between
    # train and selection is within a user-defined threshold
score_overtrain = (
(df[f"{scoring_method} train"] - df[f"{scoring_method} selection"])
< preselect_overtrain
)
df["preselection"] = score_thresh & score_overtrain
df_out = df.sort_values(by=f"{scoring_method} selection", ascending=False).reset_index(drop=True)
return df_out
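
The TODO above suggests dispatching on metric direction rather than on model type. As a minimal sketch of that idea, the two helpers could sit behind a single wrapper; the `is_error_metric` flag and this wrapper are hypothetical and not part of this PR:

def filter_preselection(df: pd.DataFrame,
                        preselect_threshold: float,
                        preselect_overtrain: float,
                        scoring_method: str,
                        is_error_metric: bool) -> pd.DataFrame:
    """Dispatch to the error- or score-based filter, depending on whether
    lower (error) or higher (score) metric values are better."""
    if is_error_metric:
        return filter_preselection_error_based(
            df, preselect_threshold, preselect_overtrain, scoring_method)
    return filter_preselection_score_based(
        df, preselect_threshold, preselect_overtrain, scoring_method)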


def get_preselected_predictors(df_metric: pd.DataFrame) -> list:
"""Wrapper function to extract a list of predictors from df_metric.

@@ -154,6 +167,7 @@ def get_preselected_predictors(df_metric: pd.DataFrame) -> list:

return [col + "_enc" for col in predictor_list]


def compute_correlations(target_enc_train_data: pd.DataFrame,
predictors: list) -> pd.DataFrame:
"""Given a DataFrame and a list of predictors, compute the correlations
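For context, here is a minimal usage sketch of the refactored preselection flow above; the column names and data are hypothetical, not taken from this PR:

import pandas as pd
from cobra.model_building import univariate_selection

# Hypothetical target-encoded input; a real pipeline produces the *_enc columns upstream.
train = pd.DataFrame({"target": [1, 0, 1, 0],
                      "age_enc": [0.8, 0.3, 0.7, 0.4]})
selection = pd.DataFrame({"target": [1, 0, 1, 0],
                          "age_enc": [0.75, 0.35, 0.65, 0.45]})

df_auc = univariate_selection.compute_univariate_preselection(
    target_enc_train_data=train,
    target_enc_selection_data=selection,
    predictors=["age_enc"],
    target_column="target",
    model_type="classification")

# Keeps only predictors that passed both thresholds, e.g. ["age_enc"].
preselected = univariate_selection.get_preselected_predictors(df_auc)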
103 changes: 91 additions & 12 deletions tests/model_building/test_univariate_selection.py
@@ -1,18 +1,20 @@

import pandas as pd
import pytest

from cobra.model_building import univariate_selection


@pytest.fixture
def mock_data():
return pd.DataFrame({"var1_enc": [0.42] * 10,
"var2_enc": [0.94] * 10,
"var3_enc": [0.87] * 10})

class TestUnivariateSelection:

def test_preselection_classification(self):

X = mock_data()
def test_preselection_classification(self, mock_data: pd.DataFrame):
X = mock_data
y = pd.DataFrame([1] * 5 + [0] * 5, columns=["target"])

basetable = pd.concat([y, X], axis=1)
@@ -29,14 +31,11 @@ def test_preselection_classification(self):

assert all(c in df_auc.columns for c in ["AUC train", "AUC selection"])

preselected_predictors = (univariate_selection
.get_preselected_predictors(df_auc))

preselected_predictors = univariate_selection.get_preselected_predictors(df_auc)
assert preselected_predictors == ["var1_enc", "var2_enc", "var3_enc"]

def test_preselection_regression(self):

X = mock_data()
def test_preselection_regression(self, mock_data: pd.DataFrame):
X = mock_data
y = pd.DataFrame([6.0, 9.0, 4.2, 5.5, 0.7, 1.9, 8.7, 8.0, 2.0, 7.2], columns=["target"])

basetable = pd.concat([y, X], axis=1)
@@ -53,7 +52,87 @@ def test_preselection_regression(self):

assert all(c in df_rmse.columns for c in ["RMSE train", "RMSE selection"])

preselected_predictors = (univariate_selection
.get_preselected_predictors(df_rmse))

preselected_predictors = univariate_selection.get_preselected_predictors(df_rmse)
assert preselected_predictors == ["var2_enc", "var3_enc"]

def test_filter_preselection_error_based(self):
"""Test filtering preselection data for an error-based metric."""
test_input = pd.DataFrame(
[
[0.1, 0.1],
[0.2, 0.2],
[0.3, 0.6],
[0.4, 0.4],
[0.5, 0.5],
[0.6, 0.6],
[0.7, 0.7],
[0.8, 0.8],
[0.9, 0.9],
[1.0, 1.0],
],
columns=["RMSE train", "RMSE selection"]
)
result = univariate_selection.filter_preselection_error_based(
test_input,
preselect_threshold=0.65,
preselect_overtrain=0.2,
scoring_method="RMSE"
)

target = pd.DataFrame(
[
[0.1, 0.1, True],
[0.2, 0.2, True],
[0.4, 0.4, True],
[0.5, 0.5, True],
[0.3, 0.6, False],
[0.6, 0.6, True],
[0.7, 0.7, False],
[0.8, 0.8, False],
[0.9, 0.9, False],
[1.0, 1.0, False],
],
columns=["RMSE train", "RMSE selection", "preselection"]
)
assert target.equals(result)

def test_filter_preselection_score_based(self):
"""Test filtering preselection data for a score-based metric."""
test_input = pd.DataFrame(
[
[0.1, 0.1],
[0.2, 0.2],
[0.3, 0.6],
[0.4, 0.4],
[0.5, 0.5],
[0.6, 0.6],
[0.7, 0.7],
[0.8, 0.8],
[0.9, 0.9],
[1.0, 0.7],
],
columns=["AUC train", "AUC selection"]
)
result = univariate_selection.filter_preselection_score_based(
test_input,
preselect_threshold=0.65,
preselect_overtrain=0.2,
scoring_method="AUC"
)

target = pd.DataFrame(
[
[0.9, 0.9, True],
[0.8, 0.8, True],
[0.7, 0.7, True],
[1.0, 0.7, False],
[0.3, 0.6, False],
[0.6, 0.6, False],
[0.5, 0.5, False],
[0.4, 0.4, False],
[0.2, 0.2, False],
[0.1, 0.1, False],
],
columns=["AUC train", "AUC selection", "preselection"]
)
assert target.equals(result)
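
The new helper tests can be run in isolation with, for example:

pytest tests/model_building/test_univariate_selection.py -k "filter_preselection"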