From 0ef90b57d2c2e7bfadfcd12b623c7456eb713ecd Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Mon, 2 Feb 2026 10:36:17 -0500 Subject: [PATCH 1/5] Fix stale calibration targets by deriving time_period from dataset - Remove hardcoded CBO_YEAR and TREASURY_YEAR constants - Add --dataset CLI argument to etl_national_targets.py - Derive time_period from sim.default_calculation_period - Default to HuggingFace production dataset The dataset itself is now the single source of truth for the calibration year, preventing future drift when updating to new base years. Closes #503 Co-Authored-By: Claude Opus 4.5 --- changelog_entry.yaml | 3 + .../db/etl_national_targets.py | 117 +++++++++++------- 2 files changed, 73 insertions(+), 47 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29bb..a2210db7e 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,3 @@ +- date: 2026-02-02 + type: fixed + description: Fix stale 2022-2023 calibration targets in policy_data.db by deriving time_period from the dataset instead of hardcoding year constants diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py index 7e02d6f09..fd97b83f4 100644 --- a/policyengine_us_data/db/etl_national_targets.py +++ b/policyengine_us_data/db/etl_national_targets.py @@ -1,3 +1,5 @@ +import argparse + from sqlmodel import Session, create_engine import pandas as pd @@ -12,11 +14,19 @@ get_or_create_source, ) +DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" + -def extract_national_targets(): +def extract_national_targets(dataset: str = DEFAULT_DATASET): """ Extract national calibration targets from various sources. + Parameters + ---------- + dataset : str + Path to the calibration dataset (local path or HuggingFace URL). + The time period is derived from the dataset's default_calculation_period. + Returns ------- dict @@ -26,18 +36,17 @@ def extract_national_targets(): - conditional_count_targets: Enrollment counts requiring constraints - cbo_targets: List of CBO projection targets - treasury_targets: List of Treasury/JCT targets + - time_period: The year derived from the dataset """ - - # Initialize PolicyEngine for parameter access from policyengine_us import Microsimulation - sim = Microsimulation( - dataset="hf://policyengine/policyengine-us-data/cps_2023.h5" - ) + print(f"Loading dataset: {dataset}") + sim = Microsimulation(dataset=dataset) + + time_period = int(sim.default_calculation_period) + print(f"Derived time_period from dataset: {time_period}") - # Direct sum targets - these are regular variables that can be summed - # Store with their actual source year (2024 for hardcoded values from loss.py) - HARDCODED_YEAR = 2024 + # Direct sum targets - use the time_period derived from the dataset # Separate tax-related targets that need filer constraint tax_filer_targets = [ @@ -46,35 +55,35 @@ def extract_national_targets(): "value": 21.247e9, "source": "Joint Committee on Taxation", "notes": "SALT deduction tax expenditure", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "medical_expense_deduction", "value": 11.4e9, "source": "Joint Committee on Taxation", "notes": "Medical expense deduction tax expenditure", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "charitable_deduction", "value": 65.301e9, "source": "Joint Committee on Taxation", "notes": "Charitable deduction tax expenditure", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "interest_deduction", "value": 24.8e9, "source": "Joint Committee on Taxation", "notes": "Mortgage interest deduction tax expenditure", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "qualified_business_income_deduction", "value": 63.1e9, "source": "Joint Committee on Taxation", "notes": "QBI deduction tax expenditure", - "year": HARDCODED_YEAR, + "year": time_period, }, ] @@ -84,112 +93,112 @@ def extract_national_targets(): "value": 13e9, "source": "Survey-reported (post-TCJA grandfathered)", "notes": "Alimony received - survey reported, not tax-filer restricted", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "alimony_expense", "value": 13e9, "source": "Survey-reported (post-TCJA grandfathered)", "notes": "Alimony paid - survey reported, not tax-filer restricted", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "medicaid", "value": 871.7e9, "source": "https://www.cms.gov/files/document/highlights.pdf", "notes": "CMS 2023 highlights document - total Medicaid spending", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "net_worth", "value": 160e12, "source": "Federal Reserve SCF", "notes": "Total household net worth", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "health_insurance_premiums_without_medicare_part_b", "value": 385e9, "source": "MEPS/NHEA", "notes": "Health insurance premiums excluding Medicare Part B", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "other_medical_expenses", "value": 278e9, "source": "MEPS/NHEA", "notes": "Out-of-pocket medical expenses", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "medicare_part_b_premiums", "value": 112e9, "source": "CMS Medicare data", "notes": "Medicare Part B premium payments", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "over_the_counter_health_expenses", "value": 72e9, "source": "Consumer Expenditure Survey", "notes": "OTC health products and supplies", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "child_support_expense", "value": 33e9, "source": "Census Bureau", "notes": "Child support payments", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "child_support_received", "value": 33e9, "source": "Census Bureau", "notes": "Child support received", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "spm_unit_capped_work_childcare_expenses", "value": 348e9, "source": "Census Bureau SPM", "notes": "Work and childcare expenses for SPM", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "spm_unit_capped_housing_subsidy", "value": 35e9, "source": "HUD/Census", "notes": "Housing subsidies", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "tanf", "value": 9e9, "source": "HHS/ACF", "notes": "TANF cash assistance", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "real_estate_taxes", "value": 500e9, "source": "Census Bureau", "notes": "Property taxes paid", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "rent", "value": 735e9, "source": "Census Bureau/BLS", "notes": "Rental payments", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "tip_income", "value": 53.2e9, "source": "IRS Form W-2 Box 7 statistics", "notes": "Social security tips uprated 40% to account for underreporting", - "year": HARDCODED_YEAR, + "year": time_period, }, # SSA benefit-type totals derived from trust fund data and # SSA fact sheet type shares @@ -198,28 +207,28 @@ def extract_national_targets(): "value": 1_060e9, "source": "https://www.ssa.gov/OACT/STATS/table4a3.html", "notes": "~73% of total OASDI ($1,452B CBO projection)", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "social_security_disability", "value": 148e9, "source": "https://www.ssa.gov/OACT/STATS/table4a3.html", "notes": "~10.2% of total OASDI (disabled workers)", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "social_security_survivors", "value": 160e9, "source": "https://www.ssa.gov/OACT/FACTS/", "notes": "~11.0% of total OASDI (widows, children of deceased)", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "social_security_dependents", "value": 84e9, "source": "https://www.ssa.gov/OACT/FACTS/", "notes": "~5.8% of total OASDI (spouses/children of retired+disabled)", - "year": HARDCODED_YEAR, + "year": time_period, }, # IRA contribution totals from IRS SOI accumulation tables { @@ -227,14 +236,14 @@ def extract_national_targets(): "value": 25e9, "source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements", "notes": "Tax year 2022 (~5M x $4,510 avg) uprated ~12% to 2024", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "roth_ira_contributions", "value": 39e9, "source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements", "notes": "Tax year 2022 (~10M x $3,482 avg) uprated ~12% to 2024", - "year": HARDCODED_YEAR, + "year": time_period, }, ] @@ -247,7 +256,7 @@ def extract_national_targets(): "person_count": 72_429_055, "source": "CMS/HHS administrative data", "notes": "Medicaid enrollment count", - "year": HARDCODED_YEAR, + "year": time_period, }, { "constraint_variable": "aca_ptc", @@ -255,7 +264,7 @@ def extract_national_targets(): "person_count": 19_743_689, "source": "CMS marketplace data", "notes": "ACA Premium Tax Credit recipients", - "year": HARDCODED_YEAR, + "year": time_period, }, ] @@ -302,8 +311,7 @@ def extract_national_targets(): conditional_count_targets.extend(ssn_none_targets_by_year) - # CBO projection targets - get for a specific year - CBO_YEAR = 2023 # Year the CBO projections are for + # CBO projection targets - use time_period derived from dataset cbo_vars = [ # Note: income_tax_positive matches CBO's receipts definition # where refundable credit payments in excess of liability are @@ -326,7 +334,7 @@ def extract_national_targets(): param_name = cbo_param_name_map.get(variable_name, variable_name) try: value = sim.tax_benefit_system.parameters( - CBO_YEAR + time_period ).calibration.gov.cbo._children[param_name] cbo_targets.append( { @@ -334,7 +342,7 @@ def extract_national_targets(): "value": float(value), "source": "CBO Budget Projections", "notes": f"CBO projection for {variable_name}", - "year": CBO_YEAR, + "year": time_period, } ) except (KeyError, AttributeError) as e: @@ -343,11 +351,10 @@ def extract_national_targets(): f"{variable_name} (param: {param_name}): {e}" ) - # Treasury/JCT targets (EITC) - get for a specific year - TREASURY_YEAR = 2023 + # Treasury/JCT targets (EITC) - use time_period derived from dataset try: eitc_value = sim.tax_benefit_system.parameters.calibration.gov.treasury.tax_expenditures.eitc( - TREASURY_YEAR + time_period ) treasury_targets = [ { @@ -355,7 +362,7 @@ def extract_national_targets(): "value": float(eitc_value), "source": "Treasury/JCT Tax Expenditures", "notes": "EITC tax expenditure", - "year": TREASURY_YEAR, + "year": time_period, } ] except (KeyError, AttributeError) as e: @@ -368,6 +375,7 @@ def extract_national_targets(): "conditional_count_targets": conditional_count_targets, "cbo_targets": cbo_targets, "treasury_targets": treasury_targets, + "time_period": time_period, } @@ -707,10 +715,25 @@ def load_national_targets( def main(): """Main ETL pipeline for national targets.""" + parser = argparse.ArgumentParser( + description="ETL for national calibration targets" + ) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The time_period for targets is derived from the dataset's " + "default_calculation_period. Default: %(default)s" + ), + ) + args = parser.parse_args() # Extract print("Extracting national targets...") - raw_targets = extract_national_targets() + raw_targets = extract_national_targets(dataset=args.dataset) + time_period = raw_targets["time_period"] + print(f"Using time_period={time_period} for CBO/Treasury targets") # Transform print("Transforming targets...") From 69406d699d74ec5d5d6f886b7691687de62e2dda Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Mon, 2 Feb 2026 13:04:03 -0500 Subject: [PATCH 2/5] Use income_tax_positive for CBO calibration in loss.py The CBO income_tax parameter represents positive-only receipts (refundable credit payments in excess of liability are classified as outlays, not negative receipts). Using income_tax_positive matches this definition. Co-Authored-By: Claude Opus 4.5 --- changelog_entry.yaml | 2 +- policyengine_us_data/utils/loss.py | 17 +++++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index a2210db7e..6ea6b894d 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,3 +1,3 @@ - date: 2026-02-02 type: fixed - description: Fix stale 2022-2023 calibration targets in policy_data.db by deriving time_period from the dataset instead of hardcoding year constants + description: Fix stale calibration targets by deriving time_period from dataset and using income_tax_positive for CBO calibration diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index f798c0dc6..e9916641a 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -220,26 +220,35 @@ def build_loss_matrix(dataset: type, time_period): targets_array.append(populations[year]) # CBO projections + # Note: income_tax_positive matches CBO's receipts definition where + # refundable credit payments in excess of liability are classified as + # outlays, not negative receipts. See: https://www.cbo.gov/publication/43767 - PROGRAMS = [ - "income_tax", + CBO_PROGRAMS = [ + "income_tax_positive", "snap", "social_security", "ssi", "unemployment_compensation", ] - for variable_name in PROGRAMS: + # Mapping from variable name to CBO parameter name (when different) + CBO_PARAM_NAME_MAP = { + "income_tax_positive": "income_tax", + } + + for variable_name in CBO_PROGRAMS: label = f"nation/cbo/{variable_name}" loss_matrix[label] = sim.calculate( variable_name, map_to="household" ).values if any(loss_matrix[label].isna()): raise ValueError(f"Missing values for {label}") + param_name = CBO_PARAM_NAME_MAP.get(variable_name, variable_name) targets_array.append( sim.tax_benefit_system.parameters( time_period - ).calibration.gov.cbo._children[variable_name] + ).calibration.gov.cbo._children[param_name] ) # 1. Medicaid Spending From b952548c888034d39b8abc0a864471ad961b58c2 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Mon, 2 Feb 2026 13:28:51 -0500 Subject: [PATCH 3/5] Add --dataset argument to all database ETL scripts All ETL scripts now derive their target year from the dataset's default_calculation_period instead of hardcoding years. This ensures all calibration targets stay synchronized when updating to a new base year annually. Updated scripts: - create_initial_strata.py - etl_age.py - etl_irs_soi.py (with configurable --lag for IRS data delay) - etl_medicaid.py - etl_snap.py - etl_state_income_tax.py Co-Authored-By: Claude Opus 4.5 --- changelog_entry.yaml | 2 +- .../db/create_initial_strata.py | 28 +++++++++++- policyengine_us_data/db/etl_age.py | 32 +++++++++++++- policyengine_us_data/db/etl_irs_soi.py | 43 +++++++++++++++++-- policyengine_us_data/db/etl_medicaid.py | 25 ++++++++++- policyengine_us_data/db/etl_snap.py | 25 ++++++++++- .../db/etl_state_income_tax.py | 25 ++++++++++- 7 files changed, 169 insertions(+), 11 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 6ea6b894d..4bbfcf6f0 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,3 +1,3 @@ - date: 2026-02-02 type: fixed - description: Fix stale calibration targets by deriving time_period from dataset and using income_tax_positive for CBO calibration + description: Fix stale calibration targets by deriving time_period from dataset across all ETL scripts and using income_tax_positive for CBO calibration diff --git a/policyengine_us_data/db/create_initial_strata.py b/policyengine_us_data/db/create_initial_strata.py index f3edb1b41..8dda76e29 100644 --- a/policyengine_us_data/db/create_initial_strata.py +++ b/policyengine_us_data/db/create_initial_strata.py @@ -1,3 +1,4 @@ +import argparse import logging from typing import Dict @@ -6,6 +7,8 @@ from sqlmodel import Session, create_engine from policyengine_us_data.storage import STORAGE_FOLDER + +DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -68,6 +71,28 @@ def fetch_congressional_districts(year): def main(): + parser = argparse.ArgumentParser( + description="Create initial geographic strata for calibration" + ) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The year for Census API calls is derived from the dataset's " + "default_calculation_period. Default: %(default)s" + ), + ) + args = parser.parse_args() + + # Derive year from dataset + from policyengine_us import Microsimulation + + print(f"Loading dataset: {args.dataset}") + sim = Microsimulation(dataset=args.dataset) + year = int(sim.default_calculation_period) + print(f"Derived year from dataset: {year}") + # State FIPS to name/abbreviation mapping STATE_NAMES = { 1: "Alabama (AL)", @@ -123,8 +148,7 @@ def main(): 56: "Wyoming (WY)", } - # Fetch congressional district data for year 2023 - year = 2023 + # Fetch congressional district data cd_df = fetch_congressional_districts(year) DATABASE_URL = ( diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py index 39ffedf22..13853ca44 100644 --- a/policyengine_us_data/db/etl_age.py +++ b/policyengine_us_data/db/etl_age.py @@ -1,9 +1,13 @@ +import argparse + import pandas as pd import numpy as np from sqlmodel import Session, create_engine, select from policyengine_us_data.storage import STORAGE_FOLDER +DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" + from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -279,10 +283,30 @@ def load_age_data(df_long, geo, year): session.commit() -if __name__ == "__main__": +def main(): + parser = argparse.ArgumentParser( + description="ETL for age calibration targets" + ) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The year for Census API calls is derived from the dataset's " + "default_calculation_period. Default: %(default)s" + ), + ) + args = parser.parse_args() + + # Derive year from dataset + from policyengine_us import Microsimulation + + print(f"Loading dataset: {args.dataset}") + sim = Microsimulation(dataset=args.dataset) + year = int(sim.default_calculation_period) + print(f"Derived year from dataset: {year}") # --- ETL: Extract, Transform, Load ---- - year = 2023 # ---- Extract ---------- docs = get_census_docs(year) @@ -301,3 +325,7 @@ def load_age_data(df_long, geo, year): load_age_data(long_national_df, "National", year) load_age_data(long_state_df, "State", year) load_age_data(long_district_df, "District", year) + + +if __name__ == "__main__": + main() diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index ed4da4e5c..873d7a072 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -1,3 +1,4 @@ +import argparse import logging from typing import Optional @@ -7,6 +8,11 @@ from sqlmodel import Session, create_engine, select from policyengine_us_data.storage import STORAGE_FOLDER + +DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" + +# IRS SOI data is typically available ~2 years after the tax year +IRS_SOI_LAG_YEARS = 2 from policyengine_us_data.utils.raw_cache import ( is_cached, cache_path, @@ -1207,9 +1213,40 @@ def load_soi_data(long_dfs, year): def main(): - # NOTE: predates the finalization of the 2020 Census redistricting - # and there is district mapping in the Transform step - year = 2022 + parser = argparse.ArgumentParser( + description="ETL for IRS SOI calibration targets" + ) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The year for IRS SOI data is derived from the dataset's " + "default_calculation_period minus IRS_SOI_LAG_YEARS. " + "Default: %(default)s" + ), + ) + parser.add_argument( + "--lag", + type=int, + default=IRS_SOI_LAG_YEARS, + help=( + "Years to subtract from dataset year for IRS SOI data " + "(default: %(default)s, since IRS data is ~2 years behind)" + ), + ) + args = parser.parse_args() + + # Derive year from dataset with lag applied + from policyengine_us import Microsimulation + + print(f"Loading dataset: {args.dataset}") + sim = Microsimulation(dataset=args.dataset) + dataset_year = int(sim.default_calculation_period) + year = dataset_year - args.lag + print( + f"Dataset year: {dataset_year}, IRS SOI year: {year} (lag={args.lag})" + ) # Extract ----------------------- raw_df = extract_soi_data() diff --git a/policyengine_us_data/db/etl_medicaid.py b/policyengine_us_data/db/etl_medicaid.py index ed1841447..435ccd42c 100644 --- a/policyengine_us_data/db/etl_medicaid.py +++ b/policyengine_us_data/db/etl_medicaid.py @@ -1,3 +1,4 @@ +import argparse import logging import requests @@ -7,6 +8,8 @@ from policyengine_us_data.storage import STORAGE_FOLDER +DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" + from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -325,7 +328,27 @@ def load_medicaid_data(long_state, long_cd, year): def main(): - year = 2024 + parser = argparse.ArgumentParser( + description="ETL for Medicaid calibration targets" + ) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The year for targets is derived from the dataset's " + "default_calculation_period. Default: %(default)s" + ), + ) + args = parser.parse_args() + + # Derive year from dataset + from policyengine_us import Microsimulation + + print(f"Loading dataset: {args.dataset}") + sim = Microsimulation(dataset=args.dataset) + year = int(sim.default_calculation_period) + print(f"Derived year from dataset: {year}") # Extract ------------------------------ state_admin_df = extract_administrative_medicaid_data(year) diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py index 48c1eb832..a8a80f5ce 100644 --- a/policyengine_us_data/db/etl_snap.py +++ b/policyengine_us_data/db/etl_snap.py @@ -1,3 +1,4 @@ +import argparse import logging import requests import zipfile @@ -10,6 +11,8 @@ from policyengine_us_data.storage import STORAGE_FOLDER +DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" + from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -363,7 +366,27 @@ def load_survey_snap_data(survey_df, year, snap_stratum_lookup): def main(): - year = 2023 + parser = argparse.ArgumentParser( + description="ETL for SNAP calibration targets" + ) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The year for targets is derived from the dataset's " + "default_calculation_period. Default: %(default)s" + ), + ) + args = parser.parse_args() + + # Derive year from dataset + from policyengine_us import Microsimulation + + print(f"Loading dataset: {args.dataset}") + sim = Microsimulation(dataset=args.dataset) + year = int(sim.default_calculation_period) + print(f"Derived year from dataset: {year}") # Extract --------- zip_file_admin = extract_administrative_snap_data() diff --git a/policyengine_us_data/db/etl_state_income_tax.py b/policyengine_us_data/db/etl_state_income_tax.py index df0f40a6c..9da8d8390 100644 --- a/policyengine_us_data/db/etl_state_income_tax.py +++ b/policyengine_us_data/db/etl_state_income_tax.py @@ -10,12 +10,15 @@ Stratum Group ID: 7 (State Income Tax) """ +import argparse import logging import pandas as pd import numpy as np from sqlmodel import Session, create_engine, select from policyengine_us_data.storage import STORAGE_FOLDER + +DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -342,12 +345,32 @@ def load_state_income_tax_data(df: pd.DataFrame, year: int) -> dict: def main(): """Run the full ETL pipeline for state income tax targets.""" + parser = argparse.ArgumentParser( + description="ETL for state income tax calibration targets" + ) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The year for targets is derived from the dataset's " + "default_calculation_period. Default: %(default)s" + ), + ) + args = parser.parse_args() + logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", ) - year = 2023 + # Derive year from dataset + from policyengine_us import Microsimulation + + logger.info(f"Loading dataset: {args.dataset}") + sim = Microsimulation(dataset=args.dataset) + year = int(sim.default_calculation_period) + logger.info(f"Derived year from dataset: {year}") logger.info(f"Extracting Census STC data for FY{year}...") raw_df = extract_state_income_tax_data(year) From bc139995717629b7eca5d63785f496a38b300b67 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Mon, 2 Feb 2026 14:54:40 -0500 Subject: [PATCH 4/5] Add 119th Congress district code support for 2024 ACS data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update parse_ucgid to recognize both 5001800US (118th) and 5001900US (119th Congress) - Expand Puerto Rico and territory filters to handle both Congress code formats - Update TERRITORY_UCGIDS and NON_VOTING_GEO_IDS with 119th Congress codes This ensures consistent redistricting alignment: 2024 ACS data uses 119th Congress codes natively, and IRS SOI data is converted via the 116th→119th mapping matrix. Co-Authored-By: Claude Haiku 4.5 --- changelog_entry.yaml | 2 +- policyengine_us_data/db/etl_age.py | 7 +++++-- policyengine_us_data/db/etl_snap.py | 3 ++- .../storage/calibration_targets/pull_soi_targets.py | 7 +++++++ policyengine_us_data/utils/census.py | 7 +++++++ policyengine_us_data/utils/db.py | 5 ++++- 6 files changed, 26 insertions(+), 5 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 4bbfcf6f0..1d930f19e 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,3 +1,3 @@ - date: 2026-02-02 type: fixed - description: Fix stale calibration targets by deriving time_period from dataset across all ETL scripts and using income_tax_positive for CBO calibration + description: Fix stale calibration targets by deriving time_period from dataset across all ETL scripts, using income_tax_positive for CBO calibration, and adding 119th Congress district code support for consistent redistricting alignment diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py index 13853ca44..2e213d92b 100644 --- a/policyengine_us_data/db/etl_age.py +++ b/policyengine_us_data/db/etl_age.py @@ -70,9 +70,12 @@ def transform_age_data(age_data, docs): df = df.rename({"GEO_ID": "ucgid_str"}, axis=1) df_data = df.rename(columns=rename_mapping)[["ucgid_str"] + list(AGE_COLS)] - # Filter out Puerto Rico's district and state records, if needed + # Filter out Puerto Rico's district and state records + # 5001800US7298 = 118th Congress, 5001900US7298 = 119th Congress df_geos = df_data[ - ~df_data["ucgid_str"].isin(["5001800US7298", "0400000US72"]) + ~df_data["ucgid_str"].isin( + ["5001800US7298", "5001900US7298", "0400000US72"] + ) ].copy() df = df_geos[["ucgid_str"] + AGE_COLS] diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py index a8a80f5ce..554f67ec1 100644 --- a/policyengine_us_data/db/etl_snap.py +++ b/policyengine_us_data/db/etl_snap.py @@ -152,9 +152,10 @@ def transform_survey_snap_data(raw_df): {"GEO_ID": "ucgid_str", "S2201_C03_001E": "snap_household_ct"}, axis=1 )[ ~df["GEO_ID"].isin( - [ # Puerto Rico's state and district + [ # Puerto Rico's state and district (118th and 119th Congress) "0400000US72", "5001800US7298", + "5001900US7298", ] ) ] diff --git a/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py b/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py index 0b1f3dcb6..c3f159191 100644 --- a/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py +++ b/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py @@ -41,11 +41,18 @@ NON_VOTING_STATES = {"US", "AS", "GU", "MP", "PR", "VI", "OA"} NON_VOTING_GEO_IDS = { "0400000US72", # Puerto Rico (state level) + # 118th Congress codes "5001800US7298", # Puerto Rico "5001800US6098", # American Samoa "5001800US6698", # Guam "5001800US6998", # Northern Mariana Islands "5001800US7898", # U.S. Virgin Islands + # 119th Congress codes + "5001900US7298", # Puerto Rico + "5001900US6098", # American Samoa + "5001900US6698", # Guam + "5001900US6998", # Northern Mariana Islands + "5001900US7898", # U.S. Virgin Islands } # after skipping the first 7 rows, the national SOI file has targets as row indices [COUNT_INDEX, AMOUNT_INDEX] diff --git a/policyengine_us_data/utils/census.py b/policyengine_us_data/utils/census.py index cb9d0b5d8..c61cc166d 100644 --- a/policyengine_us_data/utils/census.py +++ b/policyengine_us_data/utils/census.py @@ -123,11 +123,18 @@ TERRITORY_UCGIDS = { "0400000US72", # Puerto Rico (state level) + # 118th Congress codes "5001800US7298", # Puerto Rico "5001800US6098", # American Samoa "5001800US6698", # Guam "5001800US6998", # Northern Mariana Islands "5001800US7898", # U.S. Virgin Islands + # 119th Congress codes + "5001900US7298", # Puerto Rico + "5001900US6098", # American Samoa + "5001900US6698", # Guam + "5001900US6998", # Northern Mariana Islands + "5001900US7898", # U.S. Virgin Islands } diff --git a/policyengine_us_data/utils/db.py b/policyengine_us_data/utils/db.py index 6c7b1a4ed..4de79c44f 100644 --- a/policyengine_us_data/utils/db.py +++ b/policyengine_us_data/utils/db.py @@ -82,7 +82,10 @@ def parse_ucgid(ucgid_str: str) -> Dict: elif ucgid_str.startswith("0400000US"): state_fips = int(ucgid_str[9:]) return {"type": "state", "state_fips": state_fips} - elif ucgid_str.startswith("5001800US"): + elif ucgid_str.startswith("5001800US") or ucgid_str.startswith( + "5001900US" + ): + # 5001800US = 118th Congress, 5001900US = 119th Congress state_and_district = ucgid_str[9:] state_fips = int(state_and_district[:2]) district_number = int(state_and_district[2:]) From 634a75d047232961302d62466b79138dc7175511 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Tue, 3 Feb 2026 08:36:14 -0500 Subject: [PATCH 5/5] Use deterministic hash for medicaid_take_up_seed --- policyengine_us_data/datasets/cps/cps.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 84f01a8bb..9fdea7978 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -208,11 +208,20 @@ def add_takeup(self): data["takes_up_dc_ptc"] = ( generator.random(len(data["tax_unit_id"])) < dc_ptc_takeup_rate ) + # Deterministic seed for medicaid: hash person_id to [0, 1) range + # Uses Knuth multiplicative hash for good distribution + # This ensures same person_id always yields same seed, making + # enrollment determination reproducible across dataset rebuilds + HASH_MULTIPLIER = 2654435761 # Knuth's constant + HASH_MODULUS = 2**32 + data["medicaid_take_up_seed"] = ( + (data["person_id"].astype(np.uint64) * HASH_MULTIPLIER) % HASH_MODULUS + ) / HASH_MODULUS + + # SNAP and ACA seeds remain random for now (template above for conversion) generator = np.random.default_rng(seed=100) - data["snap_take_up_seed"] = generator.random(len(data["spm_unit_id"])) data["aca_take_up_seed"] = generator.random(len(data["tax_unit_id"])) - data["medicaid_take_up_seed"] = generator.random(len(data["person_id"])) self.save_dataset(data)