31 changes: 27 additions & 4 deletions causal_testing/estimation/abstract_regression_estimator.py
@@ -5,7 +5,7 @@
from typing import Any

import pandas as pd
from patsy import dmatrix # pylint: disable = no-name-in-module
from patsy import dmatrix, dmatrices # pylint: disable = no-name-in-module
from statsmodels.regression.linear_model import RegressionResultsWrapper

from causal_testing.estimation.abstract_estimator import Estimator
@@ -56,6 +56,22 @@ def __init__(
[base_test_case.treatment_variable.name] + sorted(list(adjustment_set)) + sorted(list(effect_modifiers))
)
self.formula = f"{base_test_case.outcome_variable.name} ~ {'+'.join(terms)}"
self.setup_covariates()

def setup_covariates(self):
"""
Parse the formula and set up the covariates from the design matrix so they can be used with the statsmodels array
API. This means the formula is parsed only once, rather than on every fit as with the formula API, which can
happen many times when computing causal test adequacy.
"""
if self.df is not None:
_, covariate_data = dmatrices(self.formula, self.df, return_type="dataframe")
self.covariates = covariate_data.columns
self.df = pd.concat(
[self.df, covariate_data[[col for col in covariate_data.columns if col not in self.df]]], axis=1
)
else:
self.covariates = None

@property
@abstractmethod
@@ -85,7 +101,16 @@ def fit_model(self, data=None) -> RegressionResultsWrapper:
"""
if data is None:
data = self.df
model = self.regressor(formula=self.formula, data=data).fit(disp=0)
if self.covariates is None:
_, covariate_data = dmatrices(self.formula, data, return_type="dataframe")
covariates = covariate_data.columns
data = pd.concat(
[data, covariate_data[[col for col in covariate_data.columns if col not in data]]], axis=1
).dropna()
model = self.regressor(data[self.base_test_case.outcome_variable.name], data[covariates]).fit(disp=0)
else:
data = data.dropna(subset=self.covariates)
model = self.regressor(data[self.base_test_case.outcome_variable.name], data[self.covariates]).fit(disp=0)
return model

def _predict(self, data=None, adjustment_config: dict = None) -> pd.DataFrame:
@@ -115,6 +140,4 @@ def _predict(self, data=None, adjustment_config: dict = None) -> pd.DataFrame:
if str(x.dtypes[col]) == "object":
x = pd.get_dummies(x, columns=[col], drop_first=True)

# This has to be here in case the treatment variable is in an I(...) block in the self.formula
x[self.base_test_case.treatment_variable.name] = [self.treatment_value, self.control_value]
return model.get_prediction(x).summary_frame()
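The pattern adopted above can be sketched in isolation: patsy's dmatrices parses the formula once into a named design matrix, which the statsmodels array API (sm.OLS here) can then reuse on every refit. A minimal, self-contained sketch on toy data (the column names are illustrative, not from the repository):

import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices

# Toy data standing in for self.df
df = pd.DataFrame({"y": [1.0, 2.0, 2.9, 4.2], "x": [0.0, 1.0, 2.0, 3.0], "z": [1.0, 0.0, 1.0, 0.0]})

# Parse the formula once; both return values are DataFrames with named columns.
outcome, design = dmatrices("y ~ x + z", df, return_type="dataframe")

# Array API: the pre-built design matrix is reused directly, with no repeated formula parsing.
model = sm.OLS(outcome, design).fit()
print(model.params)  # indexed by "Intercept", "x", "z"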
16 changes: 15 additions & 1 deletion causal_testing/estimation/cubic_spline_estimator.py
@@ -3,6 +3,8 @@

import logging
from typing import Any
import statsmodels.formula.api as smf
from statsmodels.regression.linear_model import RegressionResultsWrapper

import pandas as pd

@@ -19,6 +21,8 @@ class CubicSplineRegressionEstimator(LinearRegressionEstimator):
combination of parameters and basis functions of the variables.
"""

regressor = smf.ols

def __init__(
# pylint: disable=too-many-arguments
self,
@@ -48,6 +52,16 @@ def __init__(
)
self.formula = f"{base_test_case.outcome_variable.name} ~ cr({'+'.join(terms)}, df={basis})"

def fit_model(self, data=None) -> RegressionResultsWrapper:
"""Run linear regression of the treatment and adjustment set against the outcome and return the model.

:return: The model after fitting to data.
"""
if data is None:
data = self.df
model = self.regressor(formula=self.formula, data=data).fit(disp=0)
return model

def estimate_ate_calculated(self, adjustment_config: dict = None) -> EffectEstimate:
"""Estimate the ate effect of the treatment on the outcome. That is, the change in outcome caused
by changing the treatment variable from the control value to the treatment value. Here, we actually
@@ -62,7 +76,7 @@ def estimate_ate_calculated(self, adjustment_config: dict = None) -> EffectEstim
"""
model = self.fit_model()

x = {"Intercept": 1, self.base_test_case.treatment_variable.name: self.treatment_value}
x = pd.DataFrame({"Intercept": [1], self.base_test_case.treatment_variable.name: [self.treatment_value]})
if adjustment_config is not None:
for k, v in adjustment_config.items():
x[k] = v
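The spline estimator keeps the formula API (regressor = smf.ols) and overrides fit_model, presumably because the cr() basis must be rebuilt by patsy from the formula whenever the model is fitted. An illustrative sketch of such a formula on toy data (not from the repository):

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
toy = pd.DataFrame({"x": np.linspace(0.0, 10.0, 50)})
toy["y"] = np.sin(toy["x"]) + rng.normal(scale=0.1, size=len(toy))

# cr(x, df=4) asks patsy for a natural cubic spline basis of x with 4 degrees of freedom;
# the basis columns are generated from the formula string each time the model is fitted.
spline_model = smf.ols("y ~ cr(x, df=4)", toy).fit()
print(spline_model.params)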
5 changes: 3 additions & 2 deletions causal_testing/estimation/linear_regression_estimator.py
@@ -4,7 +4,7 @@
from typing import Any

import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
from patsy import ModelDesc, dmatrix # pylint: disable = no-name-in-module

from causal_testing.estimation.abstract_regression_estimator import RegressionEstimator
Expand All @@ -21,7 +21,7 @@ class LinearRegressionEstimator(RegressionEstimator):
combination of parameters and functions of the variables (note these functions need not be linear).
"""

regressor = smf.ols
regressor = sm.OLS

def __init__(
# pylint: disable=too-many-arguments
@@ -92,6 +92,7 @@ def gp_formula(
formula = gp.run_gp(ngen=ngen, pop_size=pop_size, num_offspring=num_offspring, seeds=seeds)
formula = gp.simplify(formula)
self.formula = f"{self.base_test_case.outcome_variable.name} ~ I({formula}) - 1"
self.setup_covariates()

def estimate_coefficient(self) -> EffectEstimate:
"""Estimate the unit average treatment effect of the treatment on the outcome. That is, the change in outcome
4 changes: 2 additions & 2 deletions causal_testing/estimation/logistic_regression_estimator.py
@@ -4,7 +4,7 @@

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm

from causal_testing.estimation.abstract_regression_estimator import RegressionEstimator
from causal_testing.estimation.effect_estimate import EffectEstimate
@@ -18,7 +18,7 @@ class LogisticRegressionEstimator(RegressionEstimator):
for estimating categorical outcomes.
"""

regressor = smf.logit
regressor = sm.Logit

def add_modelling_assumptions(self):
"""
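With the array API, sm.Logit mirrors the OLS sketch above: fit(disp=0) silences the optimiser's convergence output, and exponentiating a coefficient gives the unit odds ratio checked in the tests below. A toy sketch with illustrative data and column names:

import numpy as np
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices

toy = pd.DataFrame(
    {"completed": [1, 0, 1, 1, 0, 1, 0, 0], "length_in": [55.0, 72.0, 55.0, 60.0, 80.0, 58.0, 75.0, 57.0]}
)
outcome, design = dmatrices("completed ~ length_in", toy, return_type="dataframe")

# disp=0 suppresses the iteration output printed by the logit optimiser.
logit_model = sm.Logit(outcome, design).fit(disp=0)
print(np.exp(logit_model.params["length_in"]))  # unit odds ratio for length_in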
1 change: 0 additions & 1 deletion causal_testing/main.py
@@ -442,7 +442,6 @@ def save_results(self, results: List[CausalTestResult], output_path: str = None)
result_index = 0

for test_config in test_configs["tests"]:

# Create a base output first of common entries
base_output = {
"name": test_config["name"],
24 changes: 12 additions & 12 deletions docs/source/tutorials/vaccinating_elderly/causal_test_results.json
@@ -7,8 +7,8 @@
"expected_effect": {
"cum_vaccinations": "NoEffect"
},
"formula": "cum_vaccinations ~ max_doses",
"alpha": 0.05,
"formula": "cum_vaccinations ~ max_doses",
"skip": false,
"passed": false,
"result": {
@@ -35,8 +35,8 @@
"expected_effect": {
"cum_vaccinated": "NoEffect"
},
"formula": "cum_vaccinated ~ max_doses",
"alpha": 0.05,
"formula": "cum_vaccinated ~ max_doses",
"skip": false,
"passed": false,
"result": {
@@ -63,8 +63,8 @@
"expected_effect": {
"cum_infections": "NoEffect"
},
"formula": "cum_infections ~ max_doses",
"alpha": 0.05,
"formula": "cum_infections ~ max_doses",
"skip": false,
"passed": false,
"result": {
@@ -91,8 +91,8 @@
"expected_effect": {
"cum_vaccinations": "SomeEffect"
},
"formula": "cum_vaccinations ~ vaccine",
"alpha": 0.05,
"formula": "cum_vaccinations ~ vaccine",
"skip": false,
"passed": true,
"result": {
@@ -119,8 +119,8 @@
"expected_effect": {
"cum_vaccinated": "SomeEffect"
},
"formula": "cum_vaccinated ~ vaccine",
"alpha": 0.05,
"formula": "cum_vaccinated ~ vaccine",
"skip": false,
"passed": true,
"result": {
@@ -147,8 +147,8 @@
"expected_effect": {
"cum_infections": "SomeEffect"
},
"formula": "cum_infections ~ vaccine",
"alpha": 0.05,
"formula": "cum_infections ~ vaccine",
"skip": false,
"passed": true,
"result": {
@@ -175,8 +175,8 @@
"expected_effect": {
"cum_vaccinated": "NoEffect"
},
"formula": "cum_vaccinated ~ cum_vaccinations + vaccine",
"alpha": 0.05,
"formula": "cum_vaccinated ~ cum_vaccinations + vaccine",
"skip": false,
"passed": false,
"result": {
@@ -205,8 +205,8 @@
"expected_effect": {
"cum_infections": "NoEffect"
},
"formula": "cum_infections ~ cum_vaccinations + vaccine",
"alpha": 0.05,
"formula": "cum_infections ~ cum_vaccinations + vaccine",
"skip": false,
"passed": true,
"result": {
@@ -235,8 +235,8 @@
"expected_effect": {
"cum_infections": "NoEffect"
},
"formula": "cum_infections ~ cum_vaccinated + vaccine",
"alpha": 0.05,
"formula": "cum_infections ~ cum_vaccinated + vaccine",
"skip": false,
"passed": true,
"result": {
@@ -247,13 +247,13 @@
],
"effect_measure": "coefficient",
"effect_estimate": {
"cum_vaccinated": 0.0017107387725956436
"cum_vaccinated": 0.0017107387725947554
},
"ci_low": {
"cum_vaccinated": -0.03931269141189769
"cum_vaccinated": -0.03931269141189859
},
"ci_high": {
"cum_vaccinated": 0.04273416895708898
"cum_vaccinated": 0.0427341689570881
}
}
}
6 changes: 4 additions & 2 deletions pyproject.toml
@@ -15,12 +15,14 @@ requires-python = ">=3.10,<3.14"
license = { text = "MIT" }
keywords = ["causal inference","verification"]
dependencies = [
"lifelines~=0.30.0",
"lifelines<=0.28.0; python_version <= '3.10'",
"lifelines; python_version > '3.10'",
"networkx>=3.4,<3.5",
"numpy>=1.26.0,<=2.2.0",
"pandas>=2.1,<3",
"scikit_learn~=1.4",
"scipy>=1.12.0,<=1.16.2",
"scipy>=1.12.0,<1.14; python_version <= '3.10'",
"scipy>=1.12.0,<=1.16.2; python_version > '3.10'",
"statsmodels~=0.14",
"tabulate~=0.9",
"pydot~=2.0",
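The new environment markers select the lifelines and scipy pins by interpreter version. As an illustrative check (not part of the PR), the packaging library shows how such markers resolve:

from packaging.markers import Marker

# On 3.10 the capped pins apply (lifelines<=0.28.0, scipy<1.14); on newer interpreters the looser pins do.
print(Marker("python_version <= '3.10'").evaluate({"python_version": "3.10"}))  # True
print(Marker("python_version > '3.10'").evaluate({"python_version": "3.12"}))   # True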
6 changes: 3 additions & 3 deletions tests/estimation_tests/test_linear_regression_estimator.py
@@ -68,15 +68,15 @@ def setUpClass(cls) -> None:
def test_query(self):
df = self.nhefs_df
linear_regression_estimator = LinearRegressionEstimator(
self.base_test_case, None, None, set(), df, query="sex==1"
self.program_15_base_test_case, None, None, set(), df, query="sex==1"
)
self.assertTrue(linear_regression_estimator.df.sex.all())

def test_linear_regression_categorical_ate(self):
df = self.scarf_df.copy()
base_test_case = BaseTestCase(Input("color", float), Output("completed", float))
logistic_regression_estimator = LinearRegressionEstimator(base_test_case, None, None, set(), df)
effect_estimate = logistic_regression_estimator.estimate_coefficient()
linear_regression_estimator = LinearRegressionEstimator(base_test_case, None, None, set(), df)
effect_estimate = linear_regression_estimator.estimate_coefficient()
self.assertTrue(
all([ci_low < 0 < ci_high for ci_low, ci_high in zip(effect_estimate.ci_low, effect_estimate.ci_high)])
)
9 changes: 9 additions & 0 deletions tests/estimation_tests/test_logistic_regression_estimator.py
@@ -21,3 +21,12 @@ def test_odds_ratio(self):
)
effect_estimate = logistic_regression_estimator.estimate_unit_odds_ratio()
self.assertEqual(round(effect_estimate.value.iloc[0], 4), 0.8948)

def test_odds_ratio_data(self):
df = self.scarf_df.copy()
logistic_regression_estimator = LogisticRegressionEstimator(
BaseTestCase(Input("length_in", float), Output("completed", bool)), 65, 55, set()
)
logistic_regression_estimator.df = df
effect_estimate = logistic_regression_estimator.estimate_unit_odds_ratio()
self.assertEqual(round(effect_estimate.value.iloc[0], 4), 0.8948)