diff --git a/causal_testing/estimation/abstract_regression_estimator.py b/causal_testing/estimation/abstract_regression_estimator.py
index e0bdb8b9..3001fdef 100644
--- a/causal_testing/estimation/abstract_regression_estimator.py
+++ b/causal_testing/estimation/abstract_regression_estimator.py
@@ -5,7 +5,7 @@
 from typing import Any
 
 import pandas as pd
-from patsy import dmatrix  # pylint: disable = no-name-in-module
+from patsy import dmatrix, dmatrices  # pylint: disable = no-name-in-module
 from statsmodels.regression.linear_model import RegressionResultsWrapper
 
 from causal_testing.estimation.abstract_estimator import Estimator
@@ -56,6 +56,22 @@ def __init__(
             [base_test_case.treatment_variable.name] + sorted(list(adjustment_set)) + sorted(list(effect_modifiers))
         )
         self.formula = f"{base_test_case.outcome_variable.name} ~ {'+'.join(terms)}"
+        self.setup_covariates()
+
+    def setup_covariates(self):
+        """
+        Parse the formula and set up the covariates from the design matrix so we can use them in the statsmodels array
+        API. This allows us to parse the formula only once, rather than using the formula API, which re-parses it every
+        time the regression model is fit; this can happen many times when using causal test adequacy.
+        """
+        if self.df is not None:
+            _, covariate_data = dmatrices(self.formula, self.df, return_type="dataframe")
+            self.covariates = covariate_data.columns
+            self.df = pd.concat(
+                [self.df, covariate_data[[col for col in covariate_data.columns if col not in self.df]]], axis=1
+            )
+        else:
+            self.covariates = None
 
     @property
     @abstractmethod
@@ -85,7 +101,16 @@ def fit_model(self, data=None) -> RegressionResultsWrapper:
         """
         if data is None:
             data = self.df
-        model = self.regressor(formula=self.formula, data=data).fit(disp=0)
+        if self.covariates is None:
+            _, covariate_data = dmatrices(self.formula, data, return_type="dataframe")
+            covariates = covariate_data.columns
+            data = pd.concat(
+                [data, covariate_data[[col for col in covariate_data.columns if col not in data]]], axis=1
+            ).dropna()
+            model = self.regressor(data[self.base_test_case.outcome_variable.name], data[covariates]).fit(disp=0)
+        else:
+            data = data.dropna(subset=self.covariates)
+            model = self.regressor(data[self.base_test_case.outcome_variable.name], data[self.covariates]).fit(disp=0)
         return model
 
     def _predict(self, data=None, adjustment_config: dict = None) -> pd.DataFrame:
@@ -115,6 +140,4 @@ def _predict(self, data=None, adjustment_config: dict = None) -> pd.DataFrame:
             if str(x.dtypes[col]) == "object":
                 x = pd.get_dummies(x, columns=[col], drop_first=True)
 
-        # This has to be here in case the treatment variable is in an I(...) block in the self.formula
-        x[self.base_test_case.treatment_variable.name] = [self.treatment_value, self.control_value]
         return model.get_prediction(x).summary_frame()
diff --git a/causal_testing/estimation/cubic_spline_estimator.py b/causal_testing/estimation/cubic_spline_estimator.py
index 1b7d8607..fa4cedbe 100644
--- a/causal_testing/estimation/cubic_spline_estimator.py
+++ b/causal_testing/estimation/cubic_spline_estimator.py
@@ -3,6 +3,8 @@
 
 import logging
 from typing import Any
 
+import statsmodels.formula.api as smf
+from statsmodels.regression.linear_model import RegressionResultsWrapper
 import pandas as pd
 
@@ -19,6 +21,8 @@ class CubicSplineRegressionEstimator(LinearRegressionEstimator):
     combination of parameters and basis functions of the variables.
     """
 
+    regressor = smf.ols
+
     def __init__(
         # pylint: disable=too-many-arguments
         self,
@@ -48,6 +52,16 @@ def __init__(
         )
         self.formula = f"{base_test_case.outcome_variable.name} ~ cr({'+'.join(terms)}, df={basis})"
 
+    def fit_model(self, data=None) -> RegressionResultsWrapper:
+        """Run linear regression of the treatment and adjustment set against the outcome and return the model.
+
+        :return: The model after fitting to data.
+        """
+        if data is None:
+            data = self.df
+        model = self.regressor(formula=self.formula, data=data).fit(disp=0)
+        return model
+
     def estimate_ate_calculated(self, adjustment_config: dict = None) -> EffectEstimate:
         """Estimate the ate effect of the treatment on the outcome. That is, the change in outcome caused
         by changing the treatment variable from the control value to the treatment value. Here, we actually
@@ -62,7 +76,7 @@ def estimate_ate_calculated(self, adjustment_config: dict = None) -> EffectEstim
         """
         model = self.fit_model()
 
-        x = {"Intercept": 1, self.base_test_case.treatment_variable.name: self.treatment_value}
+        x = pd.DataFrame({"Intercept": [1], self.base_test_case.treatment_variable.name: [self.treatment_value]})
         if adjustment_config is not None:
             for k, v in adjustment_config.items():
                 x[k] = v
diff --git a/causal_testing/estimation/linear_regression_estimator.py b/causal_testing/estimation/linear_regression_estimator.py
index d59f04c3..6d564d7c 100644
--- a/causal_testing/estimation/linear_regression_estimator.py
+++ b/causal_testing/estimation/linear_regression_estimator.py
@@ -4,7 +4,7 @@
 from typing import Any
 
 import pandas as pd
-import statsmodels.formula.api as smf
+import statsmodels.api as sm
 from patsy import ModelDesc, dmatrix  # pylint: disable = no-name-in-module
 
 from causal_testing.estimation.abstract_regression_estimator import RegressionEstimator
@@ -21,7 +21,7 @@ class LinearRegressionEstimator(RegressionEstimator):
     combination of parameters and functions of the variables (note these functions need not be linear).
     """
 
-    regressor = smf.ols
+    regressor = sm.OLS
 
     def __init__(
         # pylint: disable=too-many-arguments
@@ -92,6 +92,7 @@ def gp_formula(
         formula = gp.run_gp(ngen=ngen, pop_size=pop_size, num_offspring=num_offspring, seeds=seeds)
         formula = gp.simplify(formula)
         self.formula = f"{self.base_test_case.outcome_variable.name} ~ I({formula}) - 1"
+        self.setup_covariates()
 
     def estimate_coefficient(self) -> EffectEstimate:
         """Estimate the unit average treatment effect of the treatment on the outcome. That is, the change in outcome
diff --git a/causal_testing/estimation/logistic_regression_estimator.py b/causal_testing/estimation/logistic_regression_estimator.py
index 58bedab6..d6bfc5db 100644
--- a/causal_testing/estimation/logistic_regression_estimator.py
+++ b/causal_testing/estimation/logistic_regression_estimator.py
@@ -4,7 +4,7 @@
 
 import numpy as np
 import pandas as pd
-import statsmodels.formula.api as smf
+import statsmodels.api as sm
 
 from causal_testing.estimation.abstract_regression_estimator import RegressionEstimator
 from causal_testing.estimation.effect_estimate import EffectEstimate
@@ -18,7 +18,7 @@ class LogisticRegressionEstimator(RegressionEstimator):
     for estimating categorical outcomes.
     """
 
-    regressor = smf.logit
+    regressor = sm.Logit
 
     def add_modelling_assumptions(self):
         """
diff --git a/causal_testing/main.py b/causal_testing/main.py
index 1c27114d..19a7f3d0 100644
--- a/causal_testing/main.py
+++ b/causal_testing/main.py
@@ -442,7 +442,6 @@ def save_results(self, results: List[CausalTestResult], output_path: str = None)
 
         result_index = 0
         for test_config in test_configs["tests"]:
-
             # Create a base output first of common entries
             base_output = {
                 "name": test_config["name"],
diff --git a/docs/source/tutorials/vaccinating_elderly/causal_test_results.json b/docs/source/tutorials/vaccinating_elderly/causal_test_results.json
index 88fafed0..969dcf17 100644
--- a/docs/source/tutorials/vaccinating_elderly/causal_test_results.json
+++ b/docs/source/tutorials/vaccinating_elderly/causal_test_results.json
@@ -7,8 +7,8 @@
         "expected_effect": {
             "cum_vaccinations": "NoEffect"
         },
-        "formula": "cum_vaccinations ~ max_doses",
         "alpha": 0.05,
+        "formula": "cum_vaccinations ~ max_doses",
         "skip": false,
         "passed": false,
         "result": {
@@ -35,8 +35,8 @@
         "expected_effect": {
             "cum_vaccinated": "NoEffect"
         },
-        "formula": "cum_vaccinated ~ max_doses",
         "alpha": 0.05,
+        "formula": "cum_vaccinated ~ max_doses",
         "skip": false,
         "passed": false,
         "result": {
@@ -63,8 +63,8 @@
         "expected_effect": {
             "cum_infections": "NoEffect"
         },
-        "formula": "cum_infections ~ max_doses",
         "alpha": 0.05,
+        "formula": "cum_infections ~ max_doses",
         "skip": false,
         "passed": false,
         "result": {
@@ -91,8 +91,8 @@
         "expected_effect": {
             "cum_vaccinations": "SomeEffect"
         },
-        "formula": "cum_vaccinations ~ vaccine",
         "alpha": 0.05,
+        "formula": "cum_vaccinations ~ vaccine",
         "skip": false,
         "passed": true,
         "result": {
@@ -119,8 +119,8 @@
         "expected_effect": {
             "cum_vaccinated": "SomeEffect"
         },
-        "formula": "cum_vaccinated ~ vaccine",
         "alpha": 0.05,
+        "formula": "cum_vaccinated ~ vaccine",
         "skip": false,
         "passed": true,
         "result": {
@@ -147,8 +147,8 @@
         "expected_effect": {
             "cum_infections": "SomeEffect"
         },
-        "formula": "cum_infections ~ vaccine",
         "alpha": 0.05,
+        "formula": "cum_infections ~ vaccine",
         "skip": false,
         "passed": true,
         "result": {
@@ -175,8 +175,8 @@
         "expected_effect": {
             "cum_vaccinated": "NoEffect"
         },
-        "formula": "cum_vaccinated ~ cum_vaccinations + vaccine",
         "alpha": 0.05,
+        "formula": "cum_vaccinated ~ cum_vaccinations + vaccine",
         "skip": false,
         "passed": false,
         "result": {
@@ -205,8 +205,8 @@
         "expected_effect": {
             "cum_infections": "NoEffect"
        },
-        "formula": "cum_infections ~ cum_vaccinations + vaccine",
         "alpha": 0.05,
+        "formula": "cum_infections ~ cum_vaccinations + vaccine",
         "skip": false,
         "passed": true,
         "result": {
@@ -235,8 +235,8 @@
         "expected_effect": {
             "cum_infections": "NoEffect"
         },
-        "formula": "cum_infections ~ cum_vaccinated + vaccine",
         "alpha": 0.05,
+        "formula": "cum_infections ~ cum_vaccinated + vaccine",
         "skip": false,
         "passed": true,
         "result": {
@@ -247,13 +247,13 @@
             ],
             "effect_measure": "coefficient",
             "effect_estimate": {
-                "cum_vaccinated": 0.0017107387725956436
+                "cum_vaccinated": 0.0017107387725947554
             },
             "ci_low": {
-                "cum_vaccinated": -0.03931269141189769
+                "cum_vaccinated": -0.03931269141189859
             },
             "ci_high": {
-                "cum_vaccinated": 0.04273416895708898
+                "cum_vaccinated": 0.0427341689570881
             }
         }
     }
diff --git a/pyproject.toml b/pyproject.toml
index 1bd2ef9a..d7da1465 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,12 +15,14 @@ requires-python = ">=3.10,<3.14"
 license = { text = "MIT" }
 keywords = ["causal inference","verification"]
 dependencies = [
-"lifelines~=0.30.0",
+"lifelines<=0.28.0; python_version <= '3.10'",
+"lifelines; python_version > '3.10'",
 "networkx>=3.4,<3.5",
 "numpy>=1.26.0,<=2.2.0",
 "pandas>=2.1,<3",
 "scikit_learn~=1.4",
-"scipy>=1.12.0,<=1.16.2",
+"scipy>=1.12.0,<1.14; python_version <= '3.10'",
+"scipy>=1.12.0,<=1.16.2; python_version > '3.10'",
 "statsmodels~=0.14",
 "tabulate~=0.9",
 "pydot~=2.0",
diff --git a/tests/estimation_tests/test_linear_regression_estimator.py b/tests/estimation_tests/test_linear_regression_estimator.py
index a5a1c9f0..b1457785 100644
--- a/tests/estimation_tests/test_linear_regression_estimator.py
+++ b/tests/estimation_tests/test_linear_regression_estimator.py
@@ -68,15 +68,15 @@ def setUpClass(cls) -> None:
     def test_query(self):
         df = self.nhefs_df
         linear_regression_estimator = LinearRegressionEstimator(
-            self.base_test_case, None, None, set(), df, query="sex==1"
+            self.program_15_base_test_case, None, None, set(), df, query="sex==1"
         )
         self.assertTrue(linear_regression_estimator.df.sex.all())
 
     def test_linear_regression_categorical_ate(self):
         df = self.scarf_df.copy()
         base_test_case = BaseTestCase(Input("color", float), Output("completed", float))
-        logistic_regression_estimator = LinearRegressionEstimator(base_test_case, None, None, set(), df)
-        effect_estimate = logistic_regression_estimator.estimate_coefficient()
+        linear_regression_estimator = LinearRegressionEstimator(base_test_case, None, None, set(), df)
+        effect_estimate = linear_regression_estimator.estimate_coefficient()
         self.assertTrue(
             all([ci_low < 0 < ci_high for ci_low, ci_high in zip(effect_estimate.ci_low, effect_estimate.ci_high)])
         )
diff --git a/tests/estimation_tests/test_logistic_regression_estimator.py b/tests/estimation_tests/test_logistic_regression_estimator.py
index d7acc935..15a0ed4b 100644
--- a/tests/estimation_tests/test_logistic_regression_estimator.py
+++ b/tests/estimation_tests/test_logistic_regression_estimator.py
@@ -21,3 +21,12 @@ def test_odds_ratio(self):
         )
         effect_estimate = logistic_regression_estimator.estimate_unit_odds_ratio()
         self.assertEqual(round(effect_estimate.value.iloc[0], 4), 0.8948)
+
+    def test_odds_ratio_data(self):
+        df = self.scarf_df.copy()
+        logistic_regression_estimator = LogisticRegressionEstimator(
+            BaseTestCase(Input("length_in", float), Output("completed", bool)), 65, 55, set()
+        )
+        logistic_regression_estimator.df = df
+        effect_estimate = logistic_regression_estimator.estimate_unit_odds_ratio()
+        self.assertEqual(round(effect_estimate.value.iloc[0], 4), 0.8948)
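
Reviewer note: the snippet below is not part of the patch. It is a minimal, self-contained sketch (with invented column names and toy data) of the parse-once pattern this diff moves to: patsy's dmatrices() builds the design matrix a single time, and repeated fits then go through the statsmodels array API (sm.OLS, or sm.Logit analogously) instead of re-parsing the formula via statsmodels.formula.api on every call, which matters when causal test adequacy refits the same model many times.

import numpy as np
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices  # pylint: disable = no-name-in-module

# Toy data; the column names are illustrative only, not from the framework.
rng = np.random.default_rng(0)
df = pd.DataFrame({"treatment": rng.normal(size=200), "adjustment": rng.normal(size=200)})
df["outcome"] = 2 * df["treatment"] + df["adjustment"] + rng.normal(size=200)

# Parse the formula once with patsy and keep the resulting design matrix.
outcome_data, covariate_data = dmatrices("outcome ~ treatment + adjustment", df, return_type="dataframe")

# Repeated fits reuse the parsed design matrix through the array API,
# skipping the formula-parsing cost that smf.ols would pay on every fit.
for _ in range(3):
    model = sm.OLS(outcome_data, covariate_data).fit()

print(model.params["treatment"])  # close to 2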