diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29..ff551ee 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - L0 regularization logic. diff --git a/docs/calibration.ipynb b/docs/calibration.ipynb index 3aefbb2..68c2a59 100644 --- a/docs/calibration.ipynb +++ b/docs/calibration.ipynb @@ -9,7 +9,7 @@ "\n", "The `Calibration` class provides a way to adjust weights of observations in a dataset to match specified target values. This is commonly used in survey research and policy modeling for rebalancing datasets to better represent desired population characteristics. \n", "\n", - "The calibration process uses an optimization algorithm to find weights that minimize the distance from the original weights while achieving the target constraints.\n", + "The calibration process uses an optimization algorithm to find weights that minimize the loss between targets and totals of aggregating the targeted variables across all data records.\n", "\n", "## Basic usage\n", "\n", @@ -17,13 +17,17 @@ "\n", "`__init__(data, weights, targets)`\n", "\n", - "- `data` (pd.DataFrame): The dataset to be calibrated. This should contain all the variables you want to use for calibration.\n", "- `weights` (np.ndarray): Initial weights for each observation in the dataset. Typically starts as an array of ones for equal weighting.\n", "- `targets` (np.ndarray): Target values that the calibration process should achieve. These correspond to the desired weighted sums.\n", + "- `estimate_matrix` (pd.DataFrame): matrix representing the contribution of each record to a given variable total.\n", + "- `estimate_function` (Callable): function that produces the estimate values for each targeted variable based on the weights and the contribution of each record to said targeted variable. 
The standard way of doing it if not provided is `estimate_matrix @ weights`.\n", "\n", - "Calibration can be easily done by initializing the `Calibration` class, passing in the parameters above. Then `calibrate()` method performs the actual calibration using the reweight function. This method:\n", + "Calibration can be easily done by initializing the `Calibration` class, and passing in the parameters above. Then the `calibrate()` method performs the actual calibration using the reweight function. This method:\n", "- Adjusts the weights to better match the target values\n", - "- Updates both `self.weights` and `self.data` with the calibrated results\n", + "- Updates `self.weights` with the calibrated results \n", + "- Produces a calibration log with performance metrics\n", + "\n", + "This module also supports regularization in case a sparse set of weights that reduces the data size simultaneously with calibration is desired. To use this functionality pass `regularize_with_l0=True` when initializing the `Calibration` class, then call `calibrate()`. 
The method will update `self.sparse_weights` with the sparse calibrated results, which can then be used to drop records with a weight close to 0.\n", "\n", "## Example\n", "\n", @@ -2332,7 +2336,7 @@ ], "metadata": { "kernelspec": { - "display_name": "policyengine", + "display_name": "pe", "language": "python", "name": "python3" }, @@ -2346,7 +2350,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.13" + "version": "3.11.11" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index ea6e46b..96dcd3a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,40 +10,39 @@ authors = [ requires-python = ">=3.11" dependencies = [ "torch>=2.7.0", - "numpy>=1.26.0,<2.0.0", - "pandas>=2.2.0,<3.0.0", - "plotly>=5.24.0,<6.0.0", - "tqdm>=4.65.0,<5.0.0", + "numpy", + "pandas", + "tqdm", ] [project.optional-dependencies] dev = [ - "pytest>=8.0.0,<9.0.0", - "pytest-cov>=6.0.0,<7.0.0", - "flake8>=6.0.0,<7.0.0", - "black>=23.0.0", - "isort>=5.9.0,<6.0.0", - "mypy>=1.0.0,<2.0.0", - "build>=1.0.0,<2.0.0", + "pytest", + "pytest-cov", + "flake8>=6.0.0", + "black", + "isort", + "mypy", + "build", "linecheck", "yaml-changelog>=0.1.7", ] docs = [ - "sphinx>=5.0.0,<6.0.0", - "docutils>=0.17.0,<0.18.0", - "jupyter-book>=0.16.0", + "sphinx>=5.0.0", + "docutils>=0.17.0", + "jupyter-book>=0.15.0", "sphinx-book-theme>=1.0.0", "sphinx-copybutton>=0.5.0", "sphinx-design>=0.3.0", - "ipywidgets>=7.8.0,<8.0.0", - "plotly>=5.24.0,<6.0.0", + "ipywidgets>=7.8.0", + "plotly", "sphinx-argparse>=0.5.0", "sphinx-math-dollar>=1.2.1", - "myst-parser==0.18.1", - "myst-nb==0.17.2", + "myst-parser>=0.18.1", + "myst-nb>=0.17.2", "pyyaml", - "furo==2022.12.7", + "furo>=2022.12.7", "h5py>=3.1.0,<4.0.0", ] diff --git a/src/microcalibrate/calibration.py b/src/microcalibrate/calibration.py index 4deb179..9a9f240 100644 --- a/src/microcalibrate/calibration.py +++ b/src/microcalibrate/calibration.py @@ -20,11 +20,15 @@ def __init__( epochs: Optional[int] = 
32, noise_level: Optional[float] = 10.0, learning_rate: Optional[float] = 1e-3, - dropout_rate: Optional[float] = 0.1, + dropout_rate: Optional[float] = 0, # default to no dropout for now normalization_factor: Optional[torch.Tensor] = None, excluded_targets: Optional[List[str]] = None, csv_path: Optional[str] = None, - device: str = None, + device: str = "cpu", # fix to cpu for now to avoid user device-specific issues + l0_lambda: float = 5e-6, # best between 1e-6 and 1e-5 + init_mean: float = 0.999, # initial proportion with non-zero weights, set near 1 + temperature: float = 0.5, # usual values .5 to 3 + regularize_with_l0: Optional[bool] = False, ): """Initialize the Calibration class. @@ -42,6 +46,10 @@ def __init__( excluded_targets (Optional[List]): Optional List of targets to exclude from calibration. Defaults to None. csv_path (str): Optional path to save performance logs as CSV. Defaults to None. device (str): Optional device to run the calibration on. Defaults to None, which will use CUDA if available, otherwise MPS, otherwise CPU. + l0_lambda (float): Regularization parameter for L0 regularization. Defaults to 5e-6. + init_mean (float): Initial mean for L0 regularization, representing the initial proportion of non-zero weights. Defaults to 0.999. + temperature (float): Temperature parameter for L0 regularization, controlling the sparsity of the model. Defaults to 0.5. + regularize_with_l0 (Optional[bool]): Whether to apply L0 regularization. Defaults to False. 
""" if device is not None: self.device = torch.device(device) @@ -51,6 +59,7 @@ def __init__( if torch.cuda.is_available() else "mps" if torch.mps.is_available() else "cpu" ) + self.original_estimate_matrix = estimate_matrix self.original_targets = targets self.original_target_names = target_names @@ -64,6 +73,11 @@ def __init__( self.normalization_factor = normalization_factor self.csv_path = csv_path self.performance_df = None + self.sparse_weights = None + self.l0_lambda = l0_lambda + self.init_mean = init_mean + self.temperature = temperature + self.regularize_with_l0 = regularize_with_l0 self.estimate_matrix = None self.targets = None @@ -120,7 +134,7 @@ def calibrate(self) -> None: from .reweight import reweight - new_weights, self.performance_df = reweight( + new_weights, sparse_weights, self.performance_df = reweight( original_weights=self.weights, estimate_function=self.estimate_function, targets_array=self.targets, @@ -134,9 +148,14 @@ def calibrate(self) -> None: excluded_target_data=self.excluded_target_data, csv_path=self.csv_path, device=self.device, + l0_lambda=self.l0_lambda, + init_mean=self.init_mean, + temperature=self.temperature, + regularize_with_l0=self.regularize_with_l0, ) self.weights = new_weights + self.sparse_weights = sparse_weights return self.performance_df @@ -330,13 +349,11 @@ def _assess_targets( if estimate_matrix is not None: # Check if estimate_matrix is a tensor or DataFrame if hasattr(estimate_matrix, "iloc"): - # It's a DataFrame contributing_mask = estimate_matrix.iloc[:, i] != 0 contribution_ratio = ( contributing_mask.sum() / estimate_matrix.shape[0] ) else: - # It's a tensor contributing_mask = estimate_matrix[:, i] != 0 contribution_ratio = ( contributing_mask.sum().item() diff --git a/src/microcalibrate/reweight.py b/src/microcalibrate/reweight.py index 1bc40f1..738795a 100644 --- a/src/microcalibrate/reweight.py +++ b/src/microcalibrate/reweight.py @@ -1,7 +1,7 @@ import logging import os from pathlib import Path 
-from typing import Callable, List, Optional +from typing import Callable, List, Optional, Union import numpy as np import pandas as pd @@ -9,6 +9,7 @@ from torch import Tensor from tqdm import tqdm +from .utils.l0 import HardConcrete from .utils.log_performance import log_performance_over_epochs from .utils.metrics import loss, pct_close @@ -20,6 +21,10 @@ def reweight( estimate_function: Callable[[Tensor], Tensor], targets_array: np.ndarray, target_names: np.ndarray, + l0_lambda: float, + init_mean: float, + temperature: float, + regularize_with_l0: bool, dropout_rate: Optional[float] = 0.05, epochs: Optional[int] = 2_000, noise_level: Optional[float] = 10.0, @@ -29,7 +34,7 @@ def reweight( excluded_target_data: Optional[dict] = None, csv_path: Optional[str] = None, device: Optional[str] = None, -) -> tuple[np.ndarray, np.ndarray]: +) -> tuple[np.ndarray, Union[np.ndarray, None], pd.DataFrame]: """Reweight the original weights based on the loss matrix and targets. Args: @@ -37,6 +42,10 @@ def reweight( estimate_function (Callable[[Tensor], Tensor]): Function to estimate targets from weights. targets_array (np.ndarray): Array of target values. target_names (np.ndarray): Names of the targets. + l0_lambda (float): Regularization parameter for L0 regularization. + init_mean (float): Initial mean for L0 regularization, representing the initial proportion of non-zero weights. + temperature (float): Temperature parameter for L0 regularization, controlling the sparsity of the model. + regularize_with_l0 (bool): Whether to apply L0 regularization. dropout_rate (float): Optional probability of dropping weights during training. epochs (int): Optional number of epochs for training. noise_level (float): Optional level of noise to add to the original weights. 
@@ -110,7 +119,7 @@ def dropout_weights(weights: torch.Tensor, p: float) -> torch.Tensor: estimates_over_epochs = [] pct_close_over_epochs = [] max_epochs = epochs - 1 if epochs > 0 else 0 - epochs = [] + epochs_list = [] for i in iterator: optimizer.zero_grad() @@ -130,7 +139,7 @@ def dropout_weights(weights: torch.Tensor, p: float) -> torch.Tensor: ) if i % tracking_n == 0: - epochs.append(i) + epochs_list.append(i) loss_over_epochs.append(l.item()) pct_close_over_epochs.append(close) estimates_over_epochs.append(estimate.detach().cpu().numpy()) @@ -150,7 +159,7 @@ def dropout_weights(weights: torch.Tensor, p: float) -> torch.Tensor: optimizer.step() tracker_dict = { - "epochs": epochs, + "epochs": epochs_list, "loss": loss_over_epochs, "estimates": estimates_over_epochs, } @@ -169,11 +178,106 @@ def dropout_weights(weights: torch.Tensor, p: float) -> torch.Tensor: csv_path.parent.mkdir(parents=True, exist_ok=True) performance_df.to_csv(csv_path, index=True) - logger.info(f"Reweighting completed. Final sample size: {len(weights)}") + logger.info( + f"Dense reweighting completed. 
Final sample size: {len(weights)}" + ) final_weights = torch.exp(weights_).detach().cpu().numpy() + if regularize_with_l0: + logger.info("Applying L0 regularization to the weights.") + + # Sparse, regularized weights depending on temperature, init_mean, l0_lambda ----- + weights = torch.tensor( + np.log(original_weights), + requires_grad=True, + dtype=torch.float32, + device=device, + ) + gates = HardConcrete( + len(original_weights), init_mean=init_mean, temperature=temperature + ).to(device) + # NOTE: Results are pretty sensitive to learning rates + # optimizer breaks down somewhere near .005, does better at above .1 + optimizer = torch.optim.Adam( + [weights] + list(gates.parameters()), lr=0.2 + ) + start_loss = None + + loss_over_epochs_sparse = [] + estimates_over_epochs_sparse = [] + pct_close_over_epochs_sparse = [] + epochs_sparse = [] + + iterator = tqdm( + range(epochs * 2), desc="Sparse reweighting progress", unit="epoch" + ) # lower learning rate, harder optimization + + for i in iterator: + optimizer.zero_grad() + weights_ = dropout_weights(weights, dropout_rate) + masked = torch.exp(weights_) * gates() + estimate = estimate_function(masked) + l_main = loss(estimate, targets, normalization_factor) + l = l_main + l0_lambda * gates.get_penalty() + close = pct_close(estimate, targets) + if i % max(tracking_n // 2, 1) == 0: # parenthesized: (i % tracking_n) / 2 == 0 was equivalent to i % tracking_n == 0 + epochs_sparse.append(i) + loss_over_epochs_sparse.append(l.item()) + pct_close_over_epochs_sparse.append(close) + estimates_over_epochs_sparse.append( + estimate.detach().cpu().numpy() + ) + + logger.info( + f"Within 10% from targets in sparse calibration: {close:.2%} \n" + ) + + if len(loss_over_epochs_sparse) > 1: + loss_change = loss_over_epochs_sparse[-2] - l.item() + logger.info( + f"Epoch {i:4d}: Loss = {l.item():.6f}, " + f"Change = {loss_change:.6f} " + f"({'improving' if loss_change > 0 else 'worsening'})" + ) + if start_loss is None: + start_loss = l.item() + loss_rel_change = (l.item() - start_loss) / start_loss + l.backward() + 
iterator.set_postfix( + {"loss": l.item(), "loss_rel_change": loss_rel_change} + ) + optimizer.step() + + gates.eval() + final_weights_sparse = ( + (torch.exp(weights) * gates()).detach().cpu().numpy() + ) + + tracker_dict_sparse = { + "epochs": epochs_sparse, + "loss": loss_over_epochs_sparse, + "estimates": estimates_over_epochs_sparse, + } + + sparse_performance_df = log_performance_over_epochs( + tracker_dict_sparse, + targets, + target_names, + excluded_targets, + excluded_target_data, + ) + + if csv_path: + # Create directory if it doesn't exist + csv_path = Path(str(csv_path).replace(".csv", "_sparse.csv")) + csv_path.parent.mkdir(parents=True, exist_ok=True) + sparse_performance_df.to_csv(csv_path, index=True) + else: + final_weights_sparse = None + return ( final_weights, + final_weights_sparse, performance_df, ) diff --git a/src/microcalibrate/utils/__init__.py b/src/microcalibrate/utils/__init__.py index 0125483..f8877b8 100644 --- a/src/microcalibrate/utils/__init__.py +++ b/src/microcalibrate/utils/__init__.py @@ -1,2 +1,3 @@ +from .l0 import HardConcrete, evaluate_sparse_weights from .log_performance import log_performance_over_epochs from .metrics import loss, pct_close diff --git a/src/microcalibrate/utils/l0.py b/src/microcalibrate/utils/l0.py new file mode 100644 index 0000000..a01242f --- /dev/null +++ b/src/microcalibrate/utils/l0.py @@ -0,0 +1,147 @@ +import logging +import math +from typing import Optional, Union + +import numpy as np +import torch +import torch.nn as nn + + +class HardConcrete(nn.Module): + """HardConcrete distribution for L0 regularization.""" + + def __init__( + self, + input_dim, + output_dim=None, + temperature=0.5, + stretch=0.1, + init_mean=0.5, + ): + super().__init__() + if output_dim is None: + self.gate_size = (input_dim,) + else: + self.gate_size = (input_dim, output_dim) + self.qz_logits = nn.Parameter(torch.zeros(self.gate_size)) + self.temperature = temperature + self.stretch = stretch + self.gamma = -0.1 + 
self.zeta = 1.1 + self.init_mean = init_mean + self.reset_parameters() + + def reset_parameters(self) -> None: + if self.init_mean is not None: + init_val = math.log(self.init_mean / (1 - self.init_mean)) + self.qz_logits.data.fill_(init_val) + + def forward( + self, input_shape: Optional[torch.Size] = None + ) -> torch.Tensor: + if self.training: + gates = self._sample_gates() + else: + gates = self._deterministic_gates() + if input_shape is not None and len(input_shape) > len(gates.shape): + gates = gates.unsqueeze(-1).unsqueeze(-1) + return gates + + def _sample_gates(self) -> torch.Tensor: + u = torch.zeros_like(self.qz_logits).uniform_(1e-8, 1.0 - 1e-8) + s = torch.log(u) - torch.log(1 - u) + self.qz_logits + s = torch.sigmoid(s / self.temperature) + s = s * (self.zeta - self.gamma) + self.gamma + gates = torch.clamp(s, 0, 1) + return gates + + def _deterministic_gates(self) -> torch.Tensor: + probs = torch.sigmoid(self.qz_logits) + gates = probs * (self.zeta - self.gamma) + self.gamma + return torch.clamp(gates, 0, 1) + + def get_penalty(self) -> torch.Tensor: + logits_shifted = self.qz_logits - self.temperature * math.log( + -self.gamma / self.zeta + ) + prob_active = torch.sigmoid(logits_shifted) + return prob_active.sum() + + def get_active_prob(self) -> torch.Tensor: + logits_shifted = self.qz_logits - self.temperature * math.log( + -self.gamma / self.zeta + ) + return torch.sigmoid(logits_shifted) + + +def evaluate_sparse_weights( + optimised_weights: Union[torch.Tensor, np.ndarray], + estimate_matrix: Union[torch.Tensor, np.ndarray], + targets_array: Union[torch.Tensor, np.ndarray], + label: Optional[str] = "L0 Sparse Weights", +) -> float: + """ + Evaluate the performance of sparse weights against targets. + + Args: + optimised_weights (torch.Tensor or np.ndarray): The optimised weights. + estimate_matrix (torch.Tensor or pd.DataFrame): The estimate matrix. + targets_array (torch.Tensor or np.ndarray): The target values. 
+ label (str): A label for logging purposes. + + Returns: + float: The percentage of estimates within 10% of the targets. + """ + # Convert all inputs to NumPy arrays right at the start + optimised_weights_np = ( + optimised_weights.numpy() + if hasattr(optimised_weights, "numpy") + else np.asarray(optimised_weights) + ) + estimate_matrix_np = ( + estimate_matrix.numpy() + if hasattr(estimate_matrix, "numpy") + else np.asarray(estimate_matrix) + ) + targets_array_np = ( + targets_array.numpy() + if hasattr(targets_array, "numpy") + else np.asarray(targets_array) + ) + + logging.info(f"\n\n---{label}: reweighting quick diagnostics----\n") + logging.info( + f"{np.sum(optimised_weights_np == 0)} are zero, " + f"{np.sum(optimised_weights_np != 0)} weights are nonzero" + ) + + # All subsequent calculations use the guaranteed NumPy versions + estimate = optimised_weights_np @ estimate_matrix_np + + rel_error = ( + ((estimate - targets_array_np) + 1) / (targets_array_np + 1) + ) ** 2 + within_10_percent_mask = np.abs(estimate - targets_array_np) <= ( + 0.10 * np.abs(targets_array_np) + ) + percent_within_10 = np.mean(within_10_percent_mask) * 100 + logging.info( + f"rel_error: min: {np.min(rel_error):.2f}\n" + f"max: {np.max(rel_error):.2f}\n" + f"mean: {np.mean(rel_error):.2f}\n" + f"median: {np.median(rel_error):.2f}\n" + f"Within 10% of target: {percent_within_10:.2f}%" + ) + logging.info("Relative error over 100% for:") + for i in np.where(rel_error > 1)[0]: + # Keep this check, as Tensors won't have a .columns attribute + if hasattr(estimate_matrix, "columns"): + logging.info(f"target_name: {estimate_matrix.columns[i]}") + else: + logging.info(f"target_index: {i}") + + logging.info(f"target_value: {targets_array_np[i]}") + logging.info(f"estimate_value: {estimate[i]}") + logging.info(f"has rel_error: {rel_error[i]:.2f}\n") + logging.info("---End of reweighting quick diagnostics------") + return percent_within_10 diff --git a/tests/test_regularization.py 
b/tests/test_regularization.py new file mode 100644 index 0000000..34bf707 --- /dev/null +++ b/tests/test_regularization.py @@ -0,0 +1,108 @@ +""" +Test the calibration process with L0 regularization. +""" + +from microcalibrate.calibration import Calibration +from microcalibrate.utils.l0 import evaluate_sparse_weights +import logging +import numpy as np +import pandas as pd + + +def test_calibration_with_l0_regularization() -> None: + # Create a sample dataset for testing + random_generator = np.random.default_rng(0) + data = pd.DataFrame( + { + "age": np.append(random_generator.integers(18, 70, size=500), 71), + "income": random_generator.normal(40000, 10000, size=501), + } + ) + + weights = np.ones(len(data)) + + # Calculate target values: + targets_matrix = pd.DataFrame( + { + "income_aged_20_30": ( + (data["age"] >= 20) & (data["age"] < 30) + ).astype(float) + * data["income"], + "income_aged_30_40": ( + (data["age"] >= 30) & (data["age"] < 40) + ).astype(float) + * data["income"], + "income_aged_40_50": ( + (data["age"] >= 40) & (data["age"] < 50) + ).astype(float) + * data["income"], + "income_aged_50_60": ( + (data["age"] >= 50) & (data["age"] < 60) + ).astype(float) + * data["income"], + "income_aged_60_70": ( + (data["age"] >= 60) & (data["age"] <= 70) + ).astype(float) + * data["income"], + } + ) + targets = np.array( + [ + (targets_matrix["income_aged_20_30"] * weights).sum() * 1.2, + (targets_matrix["income_aged_30_40"] * weights).sum() * 1.3, + (targets_matrix["income_aged_40_50"] * weights).sum() * 0.9, + (targets_matrix["income_aged_50_60"] * weights).sum() * 1.5, + (targets_matrix["income_aged_60_70"] * weights).sum() * 1.2, + ] + ) + + calibrator = Calibration( + estimate_matrix=targets_matrix, + weights=weights, + targets=targets, + noise_level=0.05, + epochs=128, + learning_rate=0.01, + dropout_rate=0, + regularize_with_l0=True, + csv_path="tests/calibration_log.csv", + ) + + performance_df = calibrator.calibrate() + weights = calibrator.weights 
+ sparse_weights = calibrator.sparse_weights + + percentage_within_10 = evaluate_sparse_weights( + optimised_weights=sparse_weights, + estimate_matrix=targets_matrix, + targets_array=targets, + ) + + sparse_calibration_log = pd.read_csv( + str(calibrator.csv_path).replace(".csv", "_sparse.csv") + ) + + # Get the final epoch average relative absolute error from the dense calibration log + final_epoch = performance_df["epoch"].max() + final_epoch_data = performance_df[performance_df["epoch"] == final_epoch] + avg_error_dense_final_epoch = final_epoch_data["rel_abs_error"].mean() + + # Get final epoch data from sparse calibration log + sparse_final_epoch = sparse_calibration_log["epoch"].max() + sparse_final_epoch_data = sparse_calibration_log[ + sparse_calibration_log["epoch"] == sparse_final_epoch + ] + avg_error_sparse_final_epoch = sparse_final_epoch_data[ + "rel_abs_error" + ].mean() + + assert ( + avg_error_sparse_final_epoch < 0.05 + ), "Final average relative absolute error is more than 5%." + + percentage_below_threshold = ( + (sparse_weights < 0.5).sum() / len(sparse_weights) * 100 + ) + assert ( + percentage_below_threshold > 10 + ), f"Only {percentage_below_threshold:.1f}% of sparse weights are below 0.5 (expected > 10%)"