4 changes: 4 additions & 0 deletions changelog_entry.yaml
@@ -0,0 +1,4 @@
- bump: minor
changes:
added:
- L0 regularization logic.
16 changes: 10 additions & 6 deletions docs/calibration.ipynb
@@ -9,21 +9,25 @@
"\n",
"The `Calibration` class provides a way to adjust weights of observations in a dataset to match specified target values. This is commonly used in survey research and policy modeling for rebalancing datasets to better represent desired population characteristics. \n",
"\n",
"The calibration process uses an optimization algorithm to find weights that minimize the distance from the original weights while achieving the target constraints.\n",
"The calibration process uses an optimization algorithm to find weights that minimize the loss between targets and totals of aggregating the targeted variables across all data records.\n",
"\n",
"## Basic usage\n",
"\n",
"### Parameters\n",
"\n",
"`__init__(data, weights, targets)`\n",
"\n",
"- `data` (pd.DataFrame): The dataset to be calibrated. This should contain all the variables you want to use for calibration.\n",
"- `weights` (np.ndarray): Initial weights for each observation in the dataset. Typically starts as an array of ones for equal weighting.\n",
"- `targets` (np.ndarray): Target values that the calibration process should achieve. These correspond to the desired weighted sums.\n",
"- `estimate_matrix` (pd.DataFrame): matrix representing the contribution of each record to a given variable total.\n",
"- `estimate_function` (Callable): function that produces the estimate values for each targeted variable based on the weights and the contribution of each record to said targeted variable. The standard way of doing it if not provided is `estimate_matrix @ weights`.\n",
"\n",
"Calibration can be easily done by initializing the `Calibration` class, passing in the parameters above. Then `calibrate()` method performs the actual calibration using the reweight function. This method:\n",
"Calibration can be easily done by initializing the `Calibration` class, and passing in the parameters above. Then the `calibrate()` method performs the actual calibration using the reweight function. This method:\n",
"- Adjusts the weights to better match the target values\n",
"- Updates both `self.weights` and `self.data` with the calibrated results\n",
"- Updates `self.weights` with the calibrated results \n",
"- Produces a calibration log with performance metrics\n",
"\n",
"This module also supports regularization in case a sparse matrix that optimizes to reduce the data size simultaneously to calibration is desired. To use this functionality pass `regularize=True` to the `calibrate()` call. The method will update `self.sparse_weights` with the sparse calibrated results, which can then be used to drop records with a weight close to 0.\n",
"\n",
"## Example\n",
"\n",
@@ -2332,7 +2336,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "policyengine",
"display_name": "pe",
"language": "python",
"name": "python3"
},
@@ -2346,7 +2350,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
"version": "3.11.11"
}
},
"nbformat": 4,
37 changes: 18 additions & 19 deletions pyproject.toml
@@ -10,40 +10,39 @@ authors = [
requires-python = ">=3.11"
dependencies = [
"torch>=2.7.0",
"numpy>=1.26.0,<2.0.0",
"pandas>=2.2.0,<3.0.0",
"plotly>=5.24.0,<6.0.0",
"tqdm>=4.65.0,<5.0.0",
"numpy",
"pandas",
"tqdm",
]

[project.optional-dependencies]
dev = [
"pytest>=8.0.0,<9.0.0",
"pytest-cov>=6.0.0,<7.0.0",
"flake8>=6.0.0,<7.0.0",
"black>=23.0.0",
"isort>=5.9.0,<6.0.0",
"mypy>=1.0.0,<2.0.0",
"build>=1.0.0,<2.0.0",
"pytest",
"pytest-cov",
"flake8>=6.0.0",
"black",
"isort",
"mypy",
"build",
"linecheck",
"yaml-changelog>=0.1.7",
]

docs = [
"sphinx>=5.0.0,<6.0.0",
"docutils>=0.17.0,<0.18.0",
"jupyter-book>=0.16.0",
"sphinx>=5.0.0",
"docutils>=0.17.0",
"jupyter-book>=0.15.0",
"sphinx-book-theme>=1.0.0",
"sphinx-copybutton>=0.5.0",
"sphinx-design>=0.3.0",
"ipywidgets>=7.8.0,<8.0.0",
"plotly>=5.24.0,<6.0.0",
"ipywidgets>=7.8.0",
"plotly",
"sphinx-argparse>=0.5.0",
"sphinx-math-dollar>=1.2.1",
"myst-parser==0.18.1",
"myst-nb==0.17.2",
"myst-parser>=0.18.1",
"myst-nb>=0.17.2",
"pyyaml",
"furo==2022.12.7",
"furo>=2022.12.7",
"h5py>=3.1.0,<4.0.0",
]

27 changes: 22 additions & 5 deletions src/microcalibrate/calibration.py
@@ -20,11 +20,15 @@ def __init__(
epochs: Optional[int] = 32,
noise_level: Optional[float] = 10.0,
learning_rate: Optional[float] = 1e-3,
dropout_rate: Optional[float] = 0.1,
dropout_rate: Optional[float] = 0, # default to no dropout for now
normalization_factor: Optional[torch.Tensor] = None,
excluded_targets: Optional[List[str]] = None,
csv_path: Optional[str] = None,
device: str = None,
device: str = "cpu", # fix to cpu for now to avoid user device-specific issues
l0_lambda: float = 5e-6, # best between 1e-6 and 1e-5
init_mean: float = 0.999, # initial proportion of non-zero weights, set near 1
temperature: float = 0.5, # usual values .5 to 3
regularize_with_l0: Optional[bool] = False,
):
"""Initialize the Calibration class.

@@ -42,6 +46,10 @@ def __init__(
excluded_targets (Optional[List]): Optional List of targets to exclude from calibration. Defaults to None.
csv_path (str): Optional path to save performance logs as CSV. Defaults to None.
device (str): Optional device to run the calibration on. Defaults to "cpu" for now; if explicitly set to None, CUDA is used if available, otherwise MPS, otherwise CPU.
l0_lambda (float): Regularization parameter for L0 regularization. Defaults to 5e-6.
init_mean (float): Initial mean for L0 regularization, representing the initial proportion of non-zero weights. Defaults to 0.999.
temperature (float): Temperature parameter for L0 regularization, controlling the sparsity of the model. Defaults to 0.5.
regularize_with_l0 (Optional[bool]): Whether to apply L0 regularization. Defaults to False.
"""
if device is not None:
self.device = torch.device(device)
@@ -51,6 +59,7 @@
if torch.cuda.is_available()
else "mps" if torch.mps.is_available() else "cpu"
)

self.original_estimate_matrix = estimate_matrix
self.original_targets = targets
self.original_target_names = target_names
@@ -64,6 +73,11 @@
self.normalization_factor = normalization_factor
self.csv_path = csv_path
self.performance_df = None
self.sparse_weights = None
self.l0_lambda = l0_lambda
self.init_mean = init_mean
self.temperature = temperature
self.regularize_with_l0 = regularize_with_l0

self.estimate_matrix = None
self.targets = None
@@ -120,7 +134,7 @@ def calibrate(self) -> None:

from .reweight import reweight

new_weights, self.performance_df = reweight(
new_weights, sparse_weights, self.performance_df = reweight(
original_weights=self.weights,
estimate_function=self.estimate_function,
targets_array=self.targets,
@@ -134,9 +148,14 @@
excluded_target_data=self.excluded_target_data,
csv_path=self.csv_path,
device=self.device,
l0_lambda=self.l0_lambda,
init_mean=self.init_mean,
temperature=self.temperature,
regularize_with_l0=self.regularize_with_l0,
)

self.weights = new_weights
self.sparse_weights = sparse_weights

return self.performance_df

@@ -330,13 +349,11 @@ def _assess_targets(
if estimate_matrix is not None:
# Check if estimate_matrix is a tensor or DataFrame
if hasattr(estimate_matrix, "iloc"):
# It's a DataFrame
contributing_mask = estimate_matrix.iloc[:, i] != 0
contribution_ratio = (
contributing_mask.sum() / estimate_matrix.shape[0]
)
else:
# It's a tensor
contributing_mask = estimate_matrix[:, i] != 0
contribution_ratio = (
contributing_mask.sum().item()
116 changes: 110 additions & 6 deletions src/microcalibrate/reweight.py
@@ -1,14 +1,15 @@
import logging
import os
from pathlib import Path
from typing import Callable, List, Optional
from typing import Callable, List, Optional, Union

import numpy as np
import pandas as pd
import torch
from torch import Tensor
from tqdm import tqdm

from .utils.l0 import HardConcrete
from .utils.log_performance import log_performance_over_epochs
from .utils.metrics import loss, pct_close

@@ -20,6 +21,10 @@ def reweight(
estimate_function: Callable[[Tensor], Tensor],
targets_array: np.ndarray,
target_names: np.ndarray,
l0_lambda: float,
init_mean: float,
temperature: float,
regularize_with_l0: bool,
dropout_rate: Optional[float] = 0.05,
epochs: Optional[int] = 2_000,
noise_level: Optional[float] = 10.0,
@@ -29,14 +34,18 @@
excluded_target_data: Optional[dict] = None,
csv_path: Optional[str] = None,
device: Optional[str] = None,
) -> tuple[np.ndarray, np.ndarray]:
) -> tuple[np.ndarray, Union[np.ndarray, None], pd.DataFrame]:
"""Reweight the original weights based on the loss matrix and targets.

Args:
original_weights (np.ndarray): Original weights to be reweighted.
estimate_function (Callable[[Tensor], Tensor]): Function to estimate targets from weights.
targets_array (np.ndarray): Array of target values.
target_names (np.ndarray): Names of the targets.
l0_lambda (float): Regularization parameter for L0 regularization.
init_mean (float): Initial mean for L0 regularization, representing the initial proportion of non-zero weights.
temperature (float): Temperature parameter for L0 regularization, controlling the sparsity of the model.
regularize_with_l0 (bool): Whether to apply L0 regularization.
dropout_rate (float): Optional probability of dropping weights during training.
epochs (int): Optional number of epochs for training.
noise_level (float): Optional level of noise to add to the original weights.
@@ -110,7 +119,7 @@ def dropout_weights(weights: torch.Tensor, p: float) -> torch.Tensor:
estimates_over_epochs = []
pct_close_over_epochs = []
max_epochs = epochs - 1 if epochs > 0 else 0
epochs = []
epochs_list = []

for i in iterator:
optimizer.zero_grad()
Expand All @@ -130,7 +139,7 @@ def dropout_weights(weights: torch.Tensor, p: float) -> torch.Tensor:
)

if i % tracking_n == 0:
epochs.append(i)
epochs_list.append(i)
loss_over_epochs.append(l.item())
pct_close_over_epochs.append(close)
estimates_over_epochs.append(estimate.detach().cpu().numpy())
@@ -150,7 +159,7 @@ def dropout_weights(weights: torch.Tensor, p: float) -> torch.Tensor:
optimizer.step()

tracker_dict = {
"epochs": epochs,
"epochs": epochs_list,
"loss": loss_over_epochs,
"estimates": estimates_over_epochs,
}
Expand All @@ -169,11 +178,106 @@ def dropout_weights(weights: torch.Tensor, p: float) -> torch.Tensor:
csv_path.parent.mkdir(parents=True, exist_ok=True)
performance_df.to_csv(csv_path, index=True)

logger.info(f"Reweighting completed. Final sample size: {len(weights)}")
logger.info(
f"Dense reweighting completed. Final sample size: {len(weights)}"
)

final_weights = torch.exp(weights_).detach().cpu().numpy()

if regularize_with_l0:
logger.info("Applying L0 regularization to the weights.")

# Sparse, regularized weights depending on temperature, init_mean, l0_lambda -----
weights = torch.tensor(
np.log(original_weights),
requires_grad=True,
dtype=torch.float32,
device=device,
)
gates = HardConcrete(
len(original_weights), init_mean=init_mean, temperature=temperature
).to(device)
# NOTE: Results are pretty sensitive to the learning rate:
# the optimizer breaks down somewhere near .005 and does better above .1
optimizer = torch.optim.Adam(
[weights] + list(gates.parameters()), lr=0.2
)
start_loss = None

loss_over_epochs_sparse = []
estimates_over_epochs_sparse = []
pct_close_over_epochs_sparse = []
epochs_sparse = []

iterator = tqdm(
range(epochs * 2), desc="Sparse reweighting progress", unit="epoch"
) # lower learning rate, harder optimization

for i in iterator:
optimizer.zero_grad()
weights_ = dropout_weights(weights, dropout_rate)
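# Mask the weights with stochastic hard-concrete gates; the expected L0 penalty added below pushes gates toward exact zeros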
masked = torch.exp(weights_) * gates()
estimate = estimate_function(masked)
l_main = loss(estimate, targets, normalization_factor)
l = l_main + l0_lambda * gates.get_penalty()
close = pct_close(estimate, targets)
if i % (tracking_n // 2) == 0:
epochs_sparse.append(i)
loss_over_epochs_sparse.append(l.item())
pct_close_over_epochs_sparse.append(close)
estimates_over_epochs_sparse.append(
estimate.detach().cpu().numpy()
)

logger.info(
f"Within 10% from targets in sparse calibration: {close:.2%} \n"
)

if len(loss_over_epochs_sparse) > 1:
loss_change = loss_over_epochs_sparse[-2] - l.item()
logger.info(
f"Epoch {i:4d}: Loss = {l.item():.6f}, "
f"Change = {loss_change:.6f} "
f"({'improving' if loss_change > 0 else 'worsening'})"
)
if start_loss is None:
start_loss = l.item()
loss_rel_change = (l.item() - start_loss) / start_loss
l.backward()
iterator.set_postfix(
{"loss": l.item(), "loss_rel_change": loss_rel_change}
)
optimizer.step()

gates.eval()
final_weights_sparse = (
(torch.exp(weights) * gates()).detach().cpu().numpy()
)

tracker_dict_sparse = {
"epochs": epochs_sparse,
"loss": loss_over_epochs_sparse,
"estimates": estimates_over_epochs_sparse,
}

sparse_performance_df = log_performance_over_epochs(
tracker_dict_sparse,
targets,
target_names,
excluded_targets,
excluded_target_data,
)

if csv_path:
# Create directory if it doesn't exist
csv_path = Path(str(csv_path).replace(".csv", "_sparse.csv"))
csv_path.parent.mkdir(parents=True, exist_ok=True)
sparse_performance_df.to_csv(csv_path, index=True)
else:
final_weights_sparse = None

return (
final_weights,
final_weights_sparse,
performance_df,
)
1 change: 1 addition & 0 deletions src/microcalibrate/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .l0 import HardConcrete, evaluate_sparse_weights
from .log_performance import log_performance_over_epochs
from .metrics import loss, pct_close