4 changes: 4 additions & 0 deletions changelog_entry.yaml
@@ -0,0 +1,4 @@
- bump: minor
changes:
added:
- L0 regularization logic.
16 changes: 10 additions & 6 deletions docs/calibration.ipynb
@@ -9,21 +9,25 @@
"\n",
"The `Calibration` class provides a way to adjust weights of observations in a dataset to match specified target values. This is commonly used in survey research and policy modeling for rebalancing datasets to better represent desired population characteristics. \n",
"\n",
"The calibration process uses an optimization algorithm to find weights that minimize the distance from the original weights while achieving the target constraints.\n",
"The calibration process uses an optimization algorithm to find weights that minimize the loss between targets and totals of aggregating the targeted variables across all data records.\n",
"\n",
"## Basic usage\n",
"\n",
"### Parameters\n",
"\n",
"`__init__(data, weights, targets)`\n",
"\n",
"- `data` (pd.DataFrame): The dataset to be calibrated. This should contain all the variables you want to use for calibration.\n",
"- `weights` (np.ndarray): Initial weights for each observation in the dataset. Typically starts as an array of ones for equal weighting.\n",
"- `targets` (np.ndarray): Target values that the calibration process should achieve. These correspond to the desired weighted sums.\n",
"- `estimate_matrix` (pd.DataFrame): matrix representing the contribution of each record to a given variable total.\n",
"- `estimate_function` (Callable): function that produces the estimate values for each targeted variable based on the weights and the contribution of each record to said targeted variable. The standard way of doing it if not provided is `estimate_matrix @ weights`.\n",
"\n",
"Calibration can be easily done by initializing the `Calibration` class, passing in the parameters above. Then `calibrate()` method performs the actual calibration using the reweight function. This method:\n",
"Calibration can be easily done by initializing the `Calibration` class, and passing in the parameters above. Then the `calibrate()` method performs the actual calibration using the reweight function. This method:\n",
"- Adjusts the weights to better match the target values\n",
"- Updates both `self.weights` and `self.data` with the calibrated results\n",
"- Updates `self.weights` with the calibrated results \n",
"- Produces a calibration log with performance metrics\n",
"\n",
"This module also supports regularization in case a sparse matrix that optimizes to reduce the data size simultaneously to calibration is desired. To use this functionality pass `regularize=True` to the `calibrate()` call. The method will update `self.sparse_weights` with the sparse calibrated results, which can then be used to drop records with a weight close to 0.\n",
"\n",
"## Example\n",
"\n",
@@ -2332,7 +2336,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "policyengine",
"display_name": "pe",
"language": "python",
"name": "python3"
},
@@ -2346,7 +2350,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
"version": "3.11.11"
}
},
"nbformat": 4,
37 changes: 18 additions & 19 deletions pyproject.toml
@@ -10,40 +10,39 @@ authors = [
requires-python = ">=3.11"
dependencies = [
"torch>=2.7.0",
"numpy>=1.26.0,<2.0.0",
"pandas>=2.2.0,<3.0.0",
"plotly>=5.24.0,<6.0.0",
"tqdm>=4.65.0,<5.0.0",
"numpy",
"pandas",
"tqdm",
]

[project.optional-dependencies]
dev = [
"pytest>=8.0.0,<9.0.0",
"pytest-cov>=6.0.0,<7.0.0",
"flake8>=6.0.0,<7.0.0",
"black>=23.0.0",
"isort>=5.9.0,<6.0.0",
"mypy>=1.0.0,<2.0.0",
"build>=1.0.0,<2.0.0",
"pytest",
"pytest-cov",
"flake8>=6.0.0",
"black",
"isort",
"mypy",
"build",
"linecheck",
"yaml-changelog>=0.1.7",
]

docs = [
"sphinx>=5.0.0,<6.0.0",
"docutils>=0.17.0,<0.18.0",
"jupyter-book>=0.16.0",
"sphinx>=5.0.0",
"docutils>=0.17.0",
"jupyter-book>=0.15.0",
"sphinx-book-theme>=1.0.0",
"sphinx-copybutton>=0.5.0",
"sphinx-design>=0.3.0",
"ipywidgets>=7.8.0,<8.0.0",
"plotly>=5.24.0,<6.0.0",
"ipywidgets>=7.8.0",
"plotly",
"sphinx-argparse>=0.5.0",
"sphinx-math-dollar>=1.2.1",
"myst-parser==0.18.1",
"myst-nb==0.17.2",
"myst-parser>=0.18.1",
"myst-nb>=0.17.2",
"pyyaml",
"furo==2022.12.7",
"furo>=2022.12.7",
"h5py>=3.1.0,<4.0.0",
]

27 changes: 22 additions & 5 deletions src/microcalibrate/calibration.py
@@ -20,11 +20,15 @@ def __init__(
epochs: Optional[int] = 32,
noise_level: Optional[float] = 10.0,
learning_rate: Optional[float] = 1e-3,
dropout_rate: Optional[float] = 0.1,
dropout_rate: Optional[float] = 0, # default to no dropout for now
normalization_factor: Optional[torch.Tensor] = None,
excluded_targets: Optional[List[str]] = None,
csv_path: Optional[str] = None,
device: str = None,
device: str = "cpu", # fix to cpu for now to avoid user device-specific issues
l0_lambda: float = 5e-6, # best between 1e-6 and 1e-5
init_mean: float = 0.999, # initial proportion of non-zero weights, set near 1
temperature: float = 0.5, # usual values .5 to 3
regularize_with_l0: Optional[bool] = False,
):
"""Initialize the Calibration class.

@@ -42,6 +46,10 @@ def __init__(
excluded_targets (Optional[List]): Optional List of targets to exclude from calibration. Defaults to None.
csv_path (str): Optional path to save performance logs as CSV. Defaults to None.
device (str): Optional device to run the calibration on. Defaults to "cpu" for now; if explicitly set to None, CUDA is used if available, otherwise MPS, otherwise CPU.
l0_lambda (float): Regularization parameter for L0 regularization. Defaults to 5e-6.
init_mean (float): Initial mean for L0 regularization, representing the initial proportion of non-zero weights. Defaults to 0.999.
temperature (float): Temperature parameter for L0 regularization, controlling the sparsity of the model. Defaults to 0.5.
regularize_with_l0 (Optional[bool]): Whether to apply L0 regularization. Defaults to False.
"""
if device is not None:
self.device = torch.device(device)
@@ -51,6 +59,7 @@
if torch.cuda.is_available()
else "mps" if torch.mps.is_available() else "cpu"
)

self.original_estimate_matrix = estimate_matrix
self.original_targets = targets
self.original_target_names = target_names
@@ -64,6 +73,11 @@
self.normalization_factor = normalization_factor
self.csv_path = csv_path
self.performance_df = None
self.sparse_weights = None
self.l0_lambda = l0_lambda
self.init_mean = init_mean
self.temperature = temperature
self.regularize_with_l0 = regularize_with_l0

self.estimate_matrix = None
self.targets = None
@@ -120,7 +134,7 @@ def calibrate(self) -> None:

from .reweight import reweight

new_weights, self.performance_df = reweight(
new_weights, sparse_weights, self.performance_df = reweight(
original_weights=self.weights,
estimate_function=self.estimate_function,
targets_array=self.targets,
@@ -134,9 +148,14 @@
excluded_target_data=self.excluded_target_data,
csv_path=self.csv_path,
device=self.device,
l0_lambda=self.l0_lambda,
init_mean=self.init_mean,
temperature=self.temperature,
regularize_with_l0=self.regularize_with_l0,
)

self.weights = new_weights
self.sparse_weights = sparse_weights

return self.performance_df

@@ -330,13 +349,11 @@ def _assess_targets(
if estimate_matrix is not None:
# Check if estimate_matrix is a tensor or DataFrame
if hasattr(estimate_matrix, "iloc"):
# It's a DataFrame
contributing_mask = estimate_matrix.iloc[:, i] != 0
contribution_ratio = (
contributing_mask.sum() / estimate_matrix.shape[0]
)
else:
# It's a tensor
contributing_mask = estimate_matrix[:, i] != 0
contribution_ratio = (
contributing_mask.sum().item()
116 changes: 110 additions & 6 deletions src/microcalibrate/reweight.py
@@ -1,14 +1,15 @@
import logging
import os
from pathlib import Path
from typing import Callable, List, Optional
from typing import Callable, List, Optional, Union

import numpy as np
import pandas as pd
import torch
from torch import Tensor
from tqdm import tqdm

from .utils.l0 import HardConcrete
from .utils.log_performance import log_performance_over_epochs
from .utils.metrics import loss, pct_close

@@ -20,6 +21,10 @@ def reweight(
estimate_function: Callable[[Tensor], Tensor],
targets_array: np.ndarray,
target_names: np.ndarray,
l0_lambda: float,
init_mean: float,
temperature: float,
regularize_with_l0: bool,
dropout_rate: Optional[float] = 0.05,
epochs: Optional[int] = 2_000,
noise_level: Optional[float] = 10.0,
@@ -29,14 +34,18 @@
excluded_target_data: Optional[dict] = None,
csv_path: Optional[str] = None,
device: Optional[str] = None,
) -> tuple[np.ndarray, np.ndarray]:
) -> tuple[np.ndarray, Union[np.ndarray, None], pd.DataFrame]:
"""Reweight the original weights based on the loss matrix and targets.

Args:
original_weights (np.ndarray): Original weights to be reweighted.
estimate_function (Callable[[Tensor], Tensor]): Function to estimate targets from weights.
targets_array (np.ndarray): Array of target values.
target_names (np.ndarray): Names of the targets.
l0_lambda (float): Regularization parameter for L0 regularization.
init_mean (float): Initial mean for L0 regularization, representing the initial proportion of non-zero weights.
temperature (float): Temperature parameter for L0 regularization, controlling the sparsity of the model.
regularize_with_l0 (bool): Whether to apply L0 regularization.
dropout_rate (float): Optional probability of dropping weights during training.
epochs (int): Optional number of epochs for training.
noise_level (float): Optional level of noise to add to the original weights.
@@ -110,7 +119,7 @@ def dropout_weights(weights: torch.Tensor, p: float) -> torch.Tensor:
estimates_over_epochs = []
pct_close_over_epochs = []
max_epochs = epochs - 1 if epochs > 0 else 0
epochs = []
epochs_list = []

for i in iterator:
optimizer.zero_grad()
Expand All @@ -130,7 +139,7 @@ def dropout_weights(weights: torch.Tensor, p: float) -> torch.Tensor:
)

if i % tracking_n == 0:
epochs.append(i)
epochs_list.append(i)
loss_over_epochs.append(l.item())
pct_close_over_epochs.append(close)
estimates_over_epochs.append(estimate.detach().cpu().numpy())
@@ -150,7 +159,7 @@ def dropout_weights(weights: torch.Tensor, p: float) -> torch.Tensor:
optimizer.step()

tracker_dict = {
"epochs": epochs,
"epochs": epochs_list,
"loss": loss_over_epochs,
"estimates": estimates_over_epochs,
}
Expand All @@ -169,11 +178,106 @@ def dropout_weights(weights: torch.Tensor, p: float) -> torch.Tensor:
csv_path.parent.mkdir(parents=True, exist_ok=True)
performance_df.to_csv(csv_path, index=True)

logger.info(f"Reweighting completed. Final sample size: {len(weights)}")
logger.info(
f"Dense reweighting completed. Final sample size: {len(weights)}"
)

final_weights = torch.exp(weights_).detach().cpu().numpy()

if regularize_with_l0:
logger.info("Applying L0 regularization to the weights.")

# Sparse, regularized weights depending on temperature, init_mean, l0_lambda -----
weights = torch.tensor(
np.log(original_weights),
requires_grad=True,
dtype=torch.float32,
device=device,
)
gates = HardConcrete(
len(original_weights), init_mean=init_mean, temperature=temperature
).to(device)
# NOTE: Results are pretty sensitive to the learning rate:
# the optimizer breaks down somewhere near .005 and does better above .1
optimizer = torch.optim.Adam(
[weights] + list(gates.parameters()), lr=0.2
)
start_loss = None

loss_over_epochs_sparse = []
estimates_over_epochs_sparse = []
pct_close_over_epochs_sparse = []
epochs_sparse = []

iterator = tqdm(
range(epochs * 2), desc="Sparse reweighting progress", unit="epoch"
) # lower learning rate, harder optimization

for i in iterator:
optimizer.zero_grad()
weights_ = dropout_weights(weights, dropout_rate)
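# Mask the weights with stochastic hard-concrete gates; the expected L0 penalty added below pushes gates toward exact zeros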
masked = torch.exp(weights_) * gates()
estimate = estimate_function(masked)
l_main = loss(estimate, targets, normalization_factor)
l = l_main + l0_lambda * gates.get_penalty()
close = pct_close(estimate, targets)
if i % (tracking_n // 2) == 0:
epochs_sparse.append(i)
loss_over_epochs_sparse.append(l.item())
pct_close_over_epochs_sparse.append(close)
estimates_over_epochs_sparse.append(
estimate.detach().cpu().numpy()
)

logger.info(
f"Within 10% from targets in sparse calibration: {close:.2%} \n"
)

if len(loss_over_epochs_sparse) > 1:
loss_change = loss_over_epochs_sparse[-2] - l.item()
logger.info(
f"Epoch {i:4d}: Loss = {l.item():.6f}, "
f"Change = {loss_change:.6f} "
f"({'improving' if loss_change > 0 else 'worsening'})"
)
if start_loss is None:
start_loss = l.item()
loss_rel_change = (l.item() - start_loss) / start_loss
l.backward()
iterator.set_postfix(
{"loss": l.item(), "loss_rel_change": loss_rel_change}
)
optimizer.step()

gates.eval()
final_weights_sparse = (
(torch.exp(weights) * gates()).detach().cpu().numpy()
)

tracker_dict_sparse = {
"epochs": epochs_sparse,
"loss": loss_over_epochs_sparse,
"estimates": estimates_over_epochs_sparse,
}

sparse_performance_df = log_performance_over_epochs(
tracker_dict_sparse,
targets,
target_names,
excluded_targets,
excluded_target_data,
)

if csv_path:
# Create directory if it doesn't exist
csv_path = Path(str(csv_path).replace(".csv", "_sparse.csv"))
csv_path.parent.mkdir(parents=True, exist_ok=True)
sparse_performance_df.to_csv(csv_path, index=True)
else:
final_weights_sparse = None

return (
final_weights,
final_weights_sparse,
performance_df,
)
1 change: 1 addition & 0 deletions src/microcalibrate/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .l0 import HardConcrete, evaluate_sparse_weights
from .log_performance import log_performance_over_epochs
from .metrics import loss, pct_close