31 changes: 27 additions & 4 deletions causal_testing/estimation/abstract_regression_estimator.py
@@ -5,7 +5,7 @@
from typing import Any

import pandas as pd
from patsy import dmatrix # pylint: disable = no-name-in-module
from patsy import dmatrix, dmatrices # pylint: disable = no-name-in-module
from statsmodels.regression.linear_model import RegressionResultsWrapper

from causal_testing.estimation.abstract_estimator import Estimator
@@ -56,6 +56,22 @@ def __init__(
[base_test_case.treatment_variable.name] + sorted(list(adjustment_set)) + sorted(list(effect_modifiers))
)
self.formula = f"{base_test_case.outcome_variable.name} ~ {'+'.join(terms)}"
self.setup_covariates()

def setup_covariates(self):
"""
Parse the formula and set up the covariates from the design matrix so they can be used with the statsmodels array
API. This means the formula is parsed only once, rather than on every fit as with the formula API, which can
happen many times when computing causal test adequacy.
"""
if self.df is not None:
_, covariate_data = dmatrices(self.formula, self.df, return_type="dataframe")
self.covariates = covariate_data.columns
self.df = pd.concat(
[self.df, covariate_data[[col for col in covariate_data.columns if col not in self.df]]], axis=1
)
else:
self.covariates = None

@property
@abstractmethod
@@ -85,7 +101,16 @@ def fit_model(self, data=None) -> RegressionResultsWrapper:
"""
if data is None:
data = self.df
model = self.regressor(formula=self.formula, data=data).fit(disp=0)
if self.covariates is None:
_, covariate_data = dmatrices(self.formula, data, return_type="dataframe")
covariates = covariate_data.columns
data = pd.concat(
[data, covariate_data[[col for col in covariate_data.columns if col not in data]]], axis=1
).dropna()
model = self.regressor(data[self.base_test_case.outcome_variable.name], data[covariates]).fit(disp=0)
else:
data = data.dropna(subset=self.covariates)
model = self.regressor(data[self.base_test_case.outcome_variable.name], data[self.covariates]).fit(disp=0)
return model

def _predict(self, data=None, adjustment_config: dict = None) -> pd.DataFrame:
@@ -115,6 +140,4 @@ def _predict(self, data=None, adjustment_config: dict = None) -> pd.DataFrame:
if str(x.dtypes[col]) == "object":
x = pd.get_dummies(x, columns=[col], drop_first=True)

# This has to be here in case the treatment variable is in an I(...) block in the self.formula
x[self.base_test_case.treatment_variable.name] = [self.treatment_value, self.control_value]
return model.get_prediction(x).summary_frame()
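The pattern adopted above can be sketched in isolation: patsy's dmatrices parses the formula once into a named design matrix, which the statsmodels array API (sm.OLS here) can then reuse on every refit. A minimal, self-contained sketch on toy data (the column names are illustrative, not from the repository):

import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices

# Toy data standing in for self.df
df = pd.DataFrame({"y": [1.0, 2.0, 2.9, 4.2], "x": [0.0, 1.0, 2.0, 3.0], "z": [1.0, 0.0, 1.0, 0.0]})

# Parse the formula once; both return values are DataFrames with named columns.
outcome, design = dmatrices("y ~ x + z", df, return_type="dataframe")

# Array API: the pre-built design matrix is reused directly, with no repeated formula parsing.
model = sm.OLS(outcome, design).fit()
print(model.params)  # indexed by "Intercept", "x", "z"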
16 changes: 15 additions & 1 deletion causal_testing/estimation/cubic_spline_estimator.py
@@ -3,6 +3,8 @@

import logging
from typing import Any
import statsmodels.formula.api as smf
from statsmodels.regression.linear_model import RegressionResultsWrapper

import pandas as pd

@@ -19,6 +21,8 @@ class CubicSplineRegressionEstimator(LinearRegressionEstimator):
combination of parameters and basis functions of the variables.
"""

regressor = smf.ols

def __init__(
# pylint: disable=too-many-arguments
self,
@@ -48,6 +52,16 @@ def __init__(
)
self.formula = f"{base_test_case.outcome_variable.name} ~ cr({'+'.join(terms)}, df={basis})"

def fit_model(self, data=None) -> RegressionResultsWrapper:
"""Run linear regression of the treatment and adjustment set against the outcome and return the model.

:return: The model after fitting to data.
"""
if data is None:
data = self.df
model = self.regressor(formula=self.formula, data=data).fit(disp=0)
return model

def estimate_ate_calculated(self, adjustment_config: dict = None) -> EffectEstimate:
"""Estimate the ate effect of the treatment on the outcome. That is, the change in outcome caused
by changing the treatment variable from the control value to the treatment value. Here, we actually
@@ -62,7 +76,7 @@ def estimate_ate_calculated(self, adjustment_config: dict = None) -> EffectEstim
"""
model = self.fit_model()

x = {"Intercept": 1, self.base_test_case.treatment_variable.name: self.treatment_value}
x = pd.DataFrame({"Intercept": [1], self.base_test_case.treatment_variable.name: [self.treatment_value]})
if adjustment_config is not None:
for k, v in adjustment_config.items():
x[k] = v
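The spline estimator keeps the formula API (regressor = smf.ols) and overrides fit_model, presumably because the cr() basis must be rebuilt by patsy from the formula whenever the model is fitted. An illustrative sketch of such a formula on toy data (not from the repository):

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
toy = pd.DataFrame({"x": np.linspace(0.0, 10.0, 50)})
toy["y"] = np.sin(toy["x"]) + rng.normal(scale=0.1, size=len(toy))

# cr(x, df=4) asks patsy for a natural cubic spline basis of x with 4 degrees of freedom;
# the basis columns are generated from the formula string each time the model is fitted.
spline_model = smf.ols("y ~ cr(x, df=4)", toy).fit()
print(spline_model.params)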
5 changes: 3 additions & 2 deletions causal_testing/estimation/linear_regression_estimator.py
@@ -4,7 +4,7 @@
from typing import Any

import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
from patsy import ModelDesc, dmatrix # pylint: disable = no-name-in-module

from causal_testing.estimation.abstract_regression_estimator import RegressionEstimator
Expand All @@ -21,7 +21,7 @@ class LinearRegressionEstimator(RegressionEstimator):
combination of parameters and functions of the variables (note these functions need not be linear).
"""

regressor = smf.ols
regressor = sm.OLS

def __init__(
# pylint: disable=too-many-arguments
@@ -92,6 +92,7 @@ def gp_formula(
formula = gp.run_gp(ngen=ngen, pop_size=pop_size, num_offspring=num_offspring, seeds=seeds)
formula = gp.simplify(formula)
self.formula = f"{self.base_test_case.outcome_variable.name} ~ I({formula}) - 1"
self.setup_covariates()

def estimate_coefficient(self) -> EffectEstimate:
"""Estimate the unit average treatment effect of the treatment on the outcome. That is, the change in outcome
4 changes: 2 additions & 2 deletions causal_testing/estimation/logistic_regression_estimator.py
@@ -4,7 +4,7 @@

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm

from causal_testing.estimation.abstract_regression_estimator import RegressionEstimator
from causal_testing.estimation.effect_estimate import EffectEstimate
@@ -18,7 +18,7 @@ class LogisticRegressionEstimator(RegressionEstimator):
for estimating categorical outcomes.
"""

regressor = smf.logit
regressor = sm.Logit

def add_modelling_assumptions(self):
"""
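With the array API, sm.Logit mirrors the OLS sketch above: fit(disp=0) silences the optimiser's convergence output, and exponentiating a coefficient gives the unit odds ratio checked in the tests below. A toy sketch with illustrative data and column names:

import numpy as np
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices

toy = pd.DataFrame(
    {"completed": [1, 0, 1, 1, 0, 1, 0, 0], "length_in": [55.0, 72.0, 55.0, 60.0, 80.0, 58.0, 75.0, 57.0]}
)
outcome, design = dmatrices("completed ~ length_in", toy, return_type="dataframe")

# disp=0 suppresses the iteration output printed by the logit optimiser.
logit_model = sm.Logit(outcome, design).fit(disp=0)
print(np.exp(logit_model.params["length_in"]))  # unit odds ratio for length_in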
1 change: 0 additions & 1 deletion causal_testing/main.py
@@ -442,7 +442,6 @@ def save_results(self, results: List[CausalTestResult], output_path: str = None)
result_index = 0

for test_config in test_configs["tests"]:

# Create a base output first of common entries
base_output = {
"name": test_config["name"],
24 changes: 12 additions & 12 deletions docs/source/tutorials/vaccinating_elderly/causal_test_results.json
@@ -7,8 +7,8 @@
"expected_effect": {
"cum_vaccinations": "NoEffect"
},
"formula": "cum_vaccinations ~ max_doses",
"alpha": 0.05,
"formula": "cum_vaccinations ~ max_doses",
"skip": false,
"passed": false,
"result": {
@@ -35,8 +35,8 @@
"expected_effect": {
"cum_vaccinated": "NoEffect"
},
"formula": "cum_vaccinated ~ max_doses",
"alpha": 0.05,
"formula": "cum_vaccinated ~ max_doses",
"skip": false,
"passed": false,
"result": {
@@ -63,8 +63,8 @@
"expected_effect": {
"cum_infections": "NoEffect"
},
"formula": "cum_infections ~ max_doses",
"alpha": 0.05,
"formula": "cum_infections ~ max_doses",
"skip": false,
"passed": false,
"result": {
@@ -91,8 +91,8 @@
"expected_effect": {
"cum_vaccinations": "SomeEffect"
},
"formula": "cum_vaccinations ~ vaccine",
"alpha": 0.05,
"formula": "cum_vaccinations ~ vaccine",
"skip": false,
"passed": true,
"result": {
@@ -119,8 +119,8 @@
"expected_effect": {
"cum_vaccinated": "SomeEffect"
},
"formula": "cum_vaccinated ~ vaccine",
"alpha": 0.05,
"formula": "cum_vaccinated ~ vaccine",
"skip": false,
"passed": true,
"result": {
@@ -147,8 +147,8 @@
"expected_effect": {
"cum_infections": "SomeEffect"
},
"formula": "cum_infections ~ vaccine",
"alpha": 0.05,
"formula": "cum_infections ~ vaccine",
"skip": false,
"passed": true,
"result": {
@@ -175,8 +175,8 @@
"expected_effect": {
"cum_vaccinated": "NoEffect"
},
"formula": "cum_vaccinated ~ cum_vaccinations + vaccine",
"alpha": 0.05,
"formula": "cum_vaccinated ~ cum_vaccinations + vaccine",
"skip": false,
"passed": false,
"result": {
@@ -205,8 +205,8 @@
"expected_effect": {
"cum_infections": "NoEffect"
},
"formula": "cum_infections ~ cum_vaccinations + vaccine",
"alpha": 0.05,
"formula": "cum_infections ~ cum_vaccinations + vaccine",
"skip": false,
"passed": true,
"result": {
@@ -235,8 +235,8 @@
"expected_effect": {
"cum_infections": "NoEffect"
},
"formula": "cum_infections ~ cum_vaccinated + vaccine",
"alpha": 0.05,
"formula": "cum_infections ~ cum_vaccinated + vaccine",
"skip": false,
"passed": true,
"result": {
@@ -247,13 +247,13 @@
],
"effect_measure": "coefficient",
"effect_estimate": {
"cum_vaccinated": 0.0017107387725956436
"cum_vaccinated": 0.0017107387725947554
},
"ci_low": {
"cum_vaccinated": -0.03931269141189769
"cum_vaccinated": -0.03931269141189859
},
"ci_high": {
"cum_vaccinated": 0.04273416895708898
"cum_vaccinated": 0.0427341689570881
}
}
}
6 changes: 4 additions & 2 deletions pyproject.toml
@@ -15,12 +15,14 @@ requires-python = ">=3.10,<3.14"
license = { text = "MIT" }
keywords = ["causal inference","verification"]
dependencies = [
"lifelines~=0.30.0",
"lifelines<=0.28.0; python_version <= '3.10'",
"lifelines; python_version > '3.10'",
"networkx>=3.4,<3.5",
"numpy>=1.26.0,<=2.2.0",
"pandas>=2.1,<3",
"scikit_learn~=1.4",
"scipy>=1.12.0,<=1.16.2",
"scipy>=1.12.0,<1.14; python_version <= '3.10'",
"scipy>=1.12.0,<=1.16.2; python_version > '3.10'",
"statsmodels~=0.14",
"tabulate~=0.9",
"pydot~=2.0",
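The new environment markers select the lifelines and scipy pins by interpreter version. As an illustrative check (not part of the PR), the packaging library shows how such markers resolve:

from packaging.markers import Marker

# On 3.10 the capped pins apply (lifelines<=0.28.0, scipy<1.14); on newer interpreters the looser pins do.
print(Marker("python_version <= '3.10'").evaluate({"python_version": "3.10"}))  # True
print(Marker("python_version > '3.10'").evaluate({"python_version": "3.12"}))   # True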
6 changes: 3 additions & 3 deletions tests/estimation_tests/test_linear_regression_estimator.py
@@ -68,15 +68,15 @@ def setUpClass(cls) -> None:
def test_query(self):
df = self.nhefs_df
linear_regression_estimator = LinearRegressionEstimator(
self.base_test_case, None, None, set(), df, query="sex==1"
self.program_15_base_test_case, None, None, set(), df, query="sex==1"
)
self.assertTrue(linear_regression_estimator.df.sex.all())

def test_linear_regression_categorical_ate(self):
df = self.scarf_df.copy()
base_test_case = BaseTestCase(Input("color", float), Output("completed", float))
logistic_regression_estimator = LinearRegressionEstimator(base_test_case, None, None, set(), df)
effect_estimate = logistic_regression_estimator.estimate_coefficient()
linear_regression_estimator = LinearRegressionEstimator(base_test_case, None, None, set(), df)
effect_estimate = linear_regression_estimator.estimate_coefficient()
self.assertTrue(
all([ci_low < 0 < ci_high for ci_low, ci_high in zip(effect_estimate.ci_low, effect_estimate.ci_high)])
)
9 changes: 9 additions & 0 deletions tests/estimation_tests/test_logistic_regression_estimator.py
@@ -21,3 +21,12 @@ def test_odds_ratio(self):
)
effect_estimate = logistic_regression_estimator.estimate_unit_odds_ratio()
self.assertEqual(round(effect_estimate.value.iloc[0], 4), 0.8948)

def test_odds_ratio_data(self):
df = self.scarf_df.copy()
logistic_regression_estimator = LogisticRegressionEstimator(
BaseTestCase(Input("length_in", float), Output("completed", bool)), 65, 55, set()
)
logistic_regression_estimator.df = df
effect_estimate = logistic_regression_estimator.estimate_unit_odds_ratio()
self.assertEqual(round(effect_estimate.value.iloc[0], 4), 0.8948)