From 0ef90b57d2c2e7bfadfcd12b623c7456eb713ecd Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com" <baogorek@gmail.com>
Date: Mon, 2 Feb 2026 10:36:17 -0500
Subject: [PATCH 1/5] Fix stale calibration targets by deriving time_period
 from dataset

- Remove HARDCODED_YEAR, CBO_YEAR and TREASURY_YEAR constants
- Add --dataset CLI argument to etl_national_targets.py
- Derive time_period from sim.default_calculation_period
- Default to HuggingFace production dataset

The dataset itself is now the single source of truth for the
calibration year, preventing future drift when updating to new
base years.

Closes #503

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 changelog_entry.yaml                          |   3 +
 .../db/etl_national_targets.py                | 117 +++++++++++-------
 2 files changed, 73 insertions(+), 47 deletions(-)

diff --git a/changelog_entry.yaml b/changelog_entry.yaml
index e69de29bb..a2210db7e 100644
--- a/changelog_entry.yaml
+++ b/changelog_entry.yaml
@@ -0,0 +1,3 @@
+- date: 2026-02-02
+  type: fixed
+  description: Fix stale 2022-2023 calibration targets in policy_data.db by deriving time_period from the dataset instead of hardcoding year constants
diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py
index 7e02d6f09..fd97b83f4 100644
--- a/policyengine_us_data/db/etl_national_targets.py
+++ b/policyengine_us_data/db/etl_national_targets.py
@@ -1,3 +1,5 @@
+import argparse
+
 from sqlmodel import Session, create_engine
 import pandas as pd
 
@@ -12,11 +14,19 @@
     get_or_create_source,
 )
 
+DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5"
+
 
-def extract_national_targets():
+def extract_national_targets(dataset: str = DEFAULT_DATASET):
     """
     Extract national calibration targets from various sources.
 
+    Parameters
+    ----------
+    dataset : str
+        Path to the calibration dataset (local path or HuggingFace URL).
+        The time period is derived from the dataset's default_calculation_period.
+
     Returns
     -------
     dict
@@ -26,18 +36,17 @@ def extract_national_targets():
         - conditional_count_targets: Enrollment counts requiring constraints
         - cbo_targets: List of CBO projection targets
         - treasury_targets: List of Treasury/JCT targets
+        - time_period: The year derived from the dataset
     """
-
-    # Initialize PolicyEngine for parameter access
     from policyengine_us import Microsimulation
 
-    sim = Microsimulation(
-        dataset="hf://policyengine/policyengine-us-data/cps_2023.h5"
-    )
+    print(f"Loading dataset: {dataset}")
+    sim = Microsimulation(dataset=dataset)
+
+    time_period = int(sim.default_calculation_period)
+    print(f"Derived time_period from dataset: {time_period}")
 
-    # Direct sum targets - these are regular variables that can be summed
-    # Store with their actual source year (2024 for hardcoded values from loss.py)
-    HARDCODED_YEAR = 2024
+    # Direct sum targets - use the time_period derived from the dataset
 
     # Separate tax-related targets that need filer constraint
     tax_filer_targets = [
@@ -46,35 +55,35 @@ def extract_national_targets():
             "value": 21.247e9,
             "source": "Joint Committee on Taxation",
             "notes": "SALT deduction tax expenditure",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "medical_expense_deduction",
             "value": 11.4e9,
             "source": "Joint Committee on Taxation",
             "notes": "Medical expense deduction tax expenditure",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "charitable_deduction",
             "value": 65.301e9,
             "source": "Joint Committee on Taxation",
             "notes": "Charitable deduction tax expenditure",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "interest_deduction",
             "value": 24.8e9,
             "source": "Joint Committee on Taxation",
             "notes": "Mortgage interest deduction tax expenditure",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "qualified_business_income_deduction",
             "value": 63.1e9,
             "source": "Joint Committee on Taxation",
             "notes": "QBI deduction tax expenditure",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
     ]
 
@@ -84,112 +93,112 @@ def extract_national_targets():
             "value": 13e9,
             "source": "Survey-reported (post-TCJA grandfathered)",
             "notes": "Alimony received - survey reported, not tax-filer restricted",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "alimony_expense",
             "value": 13e9,
             "source": "Survey-reported (post-TCJA grandfathered)",
             "notes": "Alimony paid - survey reported, not tax-filer restricted",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "medicaid",
             "value": 871.7e9,
             "source": "https://www.cms.gov/files/document/highlights.pdf",
             "notes": "CMS 2023 highlights document - total Medicaid spending",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "net_worth",
             "value": 160e12,
             "source": "Federal Reserve SCF",
             "notes": "Total household net worth",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "health_insurance_premiums_without_medicare_part_b",
             "value": 385e9,
             "source": "MEPS/NHEA",
             "notes": "Health insurance premiums excluding Medicare Part B",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "other_medical_expenses",
             "value": 278e9,
             "source": "MEPS/NHEA",
             "notes": "Out-of-pocket medical expenses",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "medicare_part_b_premiums",
             "value": 112e9,
             "source": "CMS Medicare data",
             "notes": "Medicare Part B premium payments",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "over_the_counter_health_expenses",
             "value": 72e9,
             "source": "Consumer Expenditure Survey",
             "notes": "OTC health products and supplies",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "child_support_expense",
             "value": 33e9,
             "source": "Census Bureau",
             "notes": "Child support payments",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "child_support_received",
             "value": 33e9,
             "source": "Census Bureau",
             "notes": "Child support received",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "spm_unit_capped_work_childcare_expenses",
             "value": 348e9,
             "source": "Census Bureau SPM",
             "notes": "Work and childcare expenses for SPM",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "spm_unit_capped_housing_subsidy",
             "value": 35e9,
             "source": "HUD/Census",
             "notes": "Housing subsidies",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "tanf",
             "value": 9e9,
             "source": "HHS/ACF",
             "notes": "TANF cash assistance",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "real_estate_taxes",
             "value": 500e9,
             "source": "Census Bureau",
             "notes": "Property taxes paid",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "rent",
             "value": 735e9,
             "source": "Census Bureau/BLS",
             "notes": "Rental payments",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "tip_income",
             "value": 53.2e9,
             "source": "IRS Form W-2 Box 7 statistics",
             "notes": "Social security tips uprated 40% to account for underreporting",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         # SSA benefit-type totals derived from trust fund data and
         # SSA fact sheet type shares
@@ -198,28 +207,28 @@ def extract_national_targets():
             "value": 1_060e9,
             "source": "https://www.ssa.gov/OACT/STATS/table4a3.html",
             "notes": "~73% of total OASDI ($1,452B CBO projection)",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "social_security_disability",
             "value": 148e9,
             "source": "https://www.ssa.gov/OACT/STATS/table4a3.html",
             "notes": "~10.2% of total OASDI (disabled workers)",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "social_security_survivors",
             "value": 160e9,
             "source": "https://www.ssa.gov/OACT/FACTS/",
             "notes": "~11.0% of total OASDI (widows, children of deceased)",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "social_security_dependents",
             "value": 84e9,
             "source": "https://www.ssa.gov/OACT/FACTS/",
             "notes": "~5.8% of total OASDI (spouses/children of retired+disabled)",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         # IRA contribution totals from IRS SOI accumulation tables
         {
@@ -227,14 +236,14 @@ def extract_national_targets():
             "value": 25e9,
             "source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements",
             "notes": "Tax year 2022 (~5M x $4,510 avg) uprated ~12% to 2024",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "variable": "roth_ira_contributions",
             "value": 39e9,
             "source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements",
             "notes": "Tax year 2022 (~10M x $3,482 avg) uprated ~12% to 2024",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
     ]
 
@@ -247,7 +256,7 @@ def extract_national_targets():
             "person_count": 72_429_055,
             "source": "CMS/HHS administrative data",
             "notes": "Medicaid enrollment count",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
         {
             "constraint_variable": "aca_ptc",
@@ -255,7 +264,7 @@ def extract_national_targets():
             "person_count": 19_743_689,
             "source": "CMS marketplace data",
             "notes": "ACA Premium Tax Credit recipients",
-            "year": HARDCODED_YEAR,
+            "year": time_period,
         },
     ]
 
@@ -302,8 +311,7 @@ def extract_national_targets():
 
     conditional_count_targets.extend(ssn_none_targets_by_year)
 
-    # CBO projection targets - get for a specific year
-    CBO_YEAR = 2023  # Year the CBO projections are for
+    # CBO projection targets - use time_period derived from dataset
     cbo_vars = [
         # Note: income_tax_positive matches CBO's receipts definition
         # where refundable credit payments in excess of liability are
@@ -326,7 +334,7 @@ def extract_national_targets():
         param_name = cbo_param_name_map.get(variable_name, variable_name)
         try:
             value = sim.tax_benefit_system.parameters(
-                CBO_YEAR
+                time_period
             ).calibration.gov.cbo._children[param_name]
             cbo_targets.append(
                 {
@@ -334,7 +342,7 @@ def extract_national_targets():
                     "value": float(value),
                     "source": "CBO Budget Projections",
                     "notes": f"CBO projection for {variable_name}",
-                    "year": CBO_YEAR,
+                    "year": time_period,
                 }
             )
         except (KeyError, AttributeError) as e:
@@ -343,11 +351,10 @@ def extract_national_targets():
                 f"{variable_name} (param: {param_name}): {e}"
             )
 
-    # Treasury/JCT targets (EITC) - get for a specific year
-    TREASURY_YEAR = 2023
+    # Treasury/JCT targets (EITC) - use time_period derived from dataset
     try:
         eitc_value = sim.tax_benefit_system.parameters.calibration.gov.treasury.tax_expenditures.eitc(
-            TREASURY_YEAR
+            time_period
         )
         treasury_targets = [
             {
@@ -355,7 +362,7 @@ def extract_national_targets():
                 "value": float(eitc_value),
                 "source": "Treasury/JCT Tax Expenditures",
                 "notes": "EITC tax expenditure",
-                "year": TREASURY_YEAR,
+                "year": time_period,
             }
         ]
     except (KeyError, AttributeError) as e:
@@ -368,6 +375,7 @@ def extract_national_targets():
         "conditional_count_targets": conditional_count_targets,
         "cbo_targets": cbo_targets,
         "treasury_targets": treasury_targets,
+        "time_period": time_period,
     }
 
 
@@ -707,10 +715,25 @@ def load_national_targets(
 
 def main():
     """Main ETL pipeline for national targets."""
+    parser = argparse.ArgumentParser(
+        description="ETL for national calibration targets"
+    )
+    parser.add_argument(
+        "--dataset",
+        default=DEFAULT_DATASET,
+        help=(
+            "Source dataset (local path or HuggingFace URL). "
+            "The time_period for targets is derived from the dataset's "
+            "default_calculation_period. Default: %(default)s"
+        ),
+    )
+    args = parser.parse_args()
 
     # Extract
     print("Extracting national targets...")
-    raw_targets = extract_national_targets()
+    raw_targets = extract_national_targets(dataset=args.dataset)
+    time_period = raw_targets["time_period"]
+    print(f"Using time_period={time_period} for CBO/Treasury targets")
 
     # Transform
     print("Transforming targets...")

From 69406d699d74ec5d5d6f886b7691687de62e2dda Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com" <baogorek@gmail.com>
Date: Mon, 2 Feb 2026 13:04:03 -0500
Subject: [PATCH 2/5] Use income_tax_positive for CBO calibration in loss.py

The CBO income_tax parameter represents positive-only receipts (refundable
credit payments in excess of liability are classified as outlays, not
negative receipts). Using income_tax_positive matches this definition.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 changelog_entry.yaml               |  2 +-
 policyengine_us_data/utils/loss.py | 17 +++++++++++++----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/changelog_entry.yaml b/changelog_entry.yaml
index a2210db7e..6ea6b894d 100644
--- a/changelog_entry.yaml
+++ b/changelog_entry.yaml
@@ -1,3 +1,3 @@
 - date: 2026-02-02
   type: fixed
-  description: Fix stale 2022-2023 calibration targets in policy_data.db by deriving time_period from the dataset instead of hardcoding year constants
+  description: Fix stale calibration targets by deriving time_period from dataset and using income_tax_positive for CBO calibration
diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py
index f798c0dc6..e9916641a 100644
--- a/policyengine_us_data/utils/loss.py
+++ b/policyengine_us_data/utils/loss.py
@@ -220,26 +220,35 @@ def build_loss_matrix(dataset: type, time_period):
         targets_array.append(populations[year])
 
     # CBO projections
+    # Note: income_tax_positive matches CBO's receipts definition where
+    # refundable credit payments in excess of liability are classified as
+    # outlays, not negative receipts. See: https://www.cbo.gov/publication/43767
 
-    PROGRAMS = [
-        "income_tax",
+    CBO_PROGRAMS = [
+        "income_tax_positive",
         "snap",
         "social_security",
         "ssi",
         "unemployment_compensation",
     ]
 
-    for variable_name in PROGRAMS:
+    # Mapping from variable name to CBO parameter name (when different)
+    CBO_PARAM_NAME_MAP = {
+        "income_tax_positive": "income_tax",
+    }
+
+    for variable_name in CBO_PROGRAMS:
         label = f"nation/cbo/{variable_name}"
         loss_matrix[label] = sim.calculate(
             variable_name, map_to="household"
         ).values
         if any(loss_matrix[label].isna()):
             raise ValueError(f"Missing values for {label}")
+        param_name = CBO_PARAM_NAME_MAP.get(variable_name, variable_name)
         targets_array.append(
             sim.tax_benefit_system.parameters(
                 time_period
-            ).calibration.gov.cbo._children[variable_name]
+            ).calibration.gov.cbo._children[param_name]
         )
 
     # 1. Medicaid Spending

From b952548c888034d39b8abc0a864471ad961b58c2 Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com" <baogorek@gmail.com>
Date: Mon, 2 Feb 2026 13:28:51 -0500
Subject: [PATCH 3/5] Add --dataset argument to all database ETL scripts

All ETL scripts now derive their target year from the dataset's
default_calculation_period instead of hardcoding years. This ensures
all calibration targets stay synchronized when updating to a new
base year annually.

Updated scripts:
- create_initial_strata.py
- etl_age.py
- etl_irs_soi.py (with configurable --lag for IRS data delay)
- etl_medicaid.py
- etl_snap.py
- etl_state_income_tax.py

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 changelog_entry.yaml                          |  2 +-
 .../db/create_initial_strata.py               | 28 +++++++++++-
 policyengine_us_data/db/etl_age.py            | 32 +++++++++++++-
 policyengine_us_data/db/etl_irs_soi.py        | 43 +++++++++++++++++--
 policyengine_us_data/db/etl_medicaid.py       | 25 ++++++++++-
 policyengine_us_data/db/etl_snap.py           | 25 ++++++++++-
 .../db/etl_state_income_tax.py                | 25 ++++++++++-
 7 files changed, 169 insertions(+), 11 deletions(-)

diff --git a/changelog_entry.yaml b/changelog_entry.yaml
index 6ea6b894d..4bbfcf6f0 100644
--- a/changelog_entry.yaml
+++ b/changelog_entry.yaml
@@ -1,3 +1,3 @@
 - date: 2026-02-02
   type: fixed
-  description: Fix stale calibration targets by deriving time_period from dataset and using income_tax_positive for CBO calibration
+  description: Fix stale calibration targets by deriving time_period from dataset across all ETL scripts and using income_tax_positive for CBO calibration
diff --git a/policyengine_us_data/db/create_initial_strata.py b/policyengine_us_data/db/create_initial_strata.py
index f3edb1b41..8dda76e29 100644
--- a/policyengine_us_data/db/create_initial_strata.py
+++ b/policyengine_us_data/db/create_initial_strata.py
@@ -1,3 +1,4 @@
+import argparse
 import logging
 from typing import Dict
 
@@ -6,6 +7,8 @@
 from sqlmodel import Session, create_engine
 
 from policyengine_us_data.storage import STORAGE_FOLDER
+
+DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5"
 from policyengine_us_data.db.create_database_tables import (
     Stratum,
     StratumConstraint,
@@ -68,6 +71,28 @@ def fetch_congressional_districts(year):
 
 
 def main():
+    parser = argparse.ArgumentParser(
+        description="Create initial geographic strata for calibration"
+    )
+    parser.add_argument(
+        "--dataset",
+        default=DEFAULT_DATASET,
+        help=(
+            "Source dataset (local path or HuggingFace URL). "
+            "The year for Census API calls is derived from the dataset's "
+            "default_calculation_period. Default: %(default)s"
+        ),
+    )
+    args = parser.parse_args()
+
+    # Derive year from dataset
+    from policyengine_us import Microsimulation
+
+    print(f"Loading dataset: {args.dataset}")
+    sim = Microsimulation(dataset=args.dataset)
+    year = int(sim.default_calculation_period)
+    print(f"Derived year from dataset: {year}")
+
     # State FIPS to name/abbreviation mapping
     STATE_NAMES = {
         1: "Alabama (AL)",
@@ -123,8 +148,7 @@ def main():
         56: "Wyoming (WY)",
     }
 
-    # Fetch congressional district data for year 2023
-    year = 2023
+    # Fetch congressional district data
     cd_df = fetch_congressional_districts(year)
 
     DATABASE_URL = (
diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py
index 39ffedf22..13853ca44 100644
--- a/policyengine_us_data/db/etl_age.py
+++ b/policyengine_us_data/db/etl_age.py
@@ -1,9 +1,13 @@
+import argparse
+
 import pandas as pd
 import numpy as np
 from sqlmodel import Session, create_engine, select
 
 from policyengine_us_data.storage import STORAGE_FOLDER
 
+DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5"
+
 from policyengine_us_data.db.create_database_tables import (
     Stratum,
     StratumConstraint,
@@ -279,10 +283,30 @@ def load_age_data(df_long, geo, year):
         session.commit()
 
 
-if __name__ == "__main__":
+def main():
+    parser = argparse.ArgumentParser(
+        description="ETL for age calibration targets"
+    )
+    parser.add_argument(
+        "--dataset",
+        default=DEFAULT_DATASET,
+        help=(
+            "Source dataset (local path or HuggingFace URL). "
+            "The year for Census API calls is derived from the dataset's "
+            "default_calculation_period. Default: %(default)s"
+        ),
+    )
+    args = parser.parse_args()
+
+    # Derive year from dataset
+    from policyengine_us import Microsimulation
+
+    print(f"Loading dataset: {args.dataset}")
+    sim = Microsimulation(dataset=args.dataset)
+    year = int(sim.default_calculation_period)
+    print(f"Derived year from dataset: {year}")
 
     # --- ETL: Extract, Transform, Load ----
-    year = 2023
 
     # ---- Extract ----------
     docs = get_census_docs(year)
@@ -301,3 +325,7 @@ def load_age_data(df_long, geo, year):
     load_age_data(long_national_df, "National", year)
     load_age_data(long_state_df, "State", year)
     load_age_data(long_district_df, "District", year)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py
index ed4da4e5c..873d7a072 100644
--- a/policyengine_us_data/db/etl_irs_soi.py
+++ b/policyengine_us_data/db/etl_irs_soi.py
@@ -1,3 +1,4 @@
+import argparse
 import logging
 from typing import Optional
 
@@ -7,6 +8,11 @@
 from sqlmodel import Session, create_engine, select
 
 from policyengine_us_data.storage import STORAGE_FOLDER
+
+DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5"
+
+# IRS SOI data is typically available ~2 years after the tax year
+IRS_SOI_LAG_YEARS = 2
 from policyengine_us_data.utils.raw_cache import (
     is_cached,
     cache_path,
@@ -1207,9 +1213,40 @@ def load_soi_data(long_dfs, year):
 
 
 def main():
-    # NOTE: predates the finalization of the 2020 Census redistricting
-    # and there is district mapping in the Transform step
-    year = 2022
+    parser = argparse.ArgumentParser(
+        description="ETL for IRS SOI calibration targets"
+    )
+    parser.add_argument(
+        "--dataset",
+        default=DEFAULT_DATASET,
+        help=(
+            "Source dataset (local path or HuggingFace URL). "
+            "The year for IRS SOI data is derived from the dataset's "
+            "default_calculation_period minus IRS_SOI_LAG_YEARS. "
+            "Default: %(default)s"
+        ),
+    )
+    parser.add_argument(
+        "--lag",
+        type=int,
+        default=IRS_SOI_LAG_YEARS,
+        help=(
+            "Years to subtract from dataset year for IRS SOI data "
+            "(default: %(default)s, since IRS data is ~2 years behind)"
+        ),
+    )
+    args = parser.parse_args()
+
+    # Derive year from dataset with lag applied
+    from policyengine_us import Microsimulation
+
+    print(f"Loading dataset: {args.dataset}")
+    sim = Microsimulation(dataset=args.dataset)
+    dataset_year = int(sim.default_calculation_period)
+    year = dataset_year - args.lag
+    print(
+        f"Dataset year: {dataset_year}, IRS SOI year: {year} (lag={args.lag})"
+    )
 
     # Extract -----------------------
     raw_df = extract_soi_data()
diff --git a/policyengine_us_data/db/etl_medicaid.py b/policyengine_us_data/db/etl_medicaid.py
index ed1841447..435ccd42c 100644
--- a/policyengine_us_data/db/etl_medicaid.py
+++ b/policyengine_us_data/db/etl_medicaid.py
@@ -1,3 +1,4 @@
+import argparse
 import logging
 
 import requests
@@ -7,6 +8,8 @@
 
 from policyengine_us_data.storage import STORAGE_FOLDER
 
+DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5"
+
 from policyengine_us_data.db.create_database_tables import (
     Stratum,
     StratumConstraint,
@@ -325,7 +328,27 @@ def load_medicaid_data(long_state, long_cd, year):
 
 
 def main():
-    year = 2024
+    parser = argparse.ArgumentParser(
+        description="ETL for Medicaid calibration targets"
+    )
+    parser.add_argument(
+        "--dataset",
+        default=DEFAULT_DATASET,
+        help=(
+            "Source dataset (local path or HuggingFace URL). "
+            "The year for targets is derived from the dataset's "
+            "default_calculation_period. Default: %(default)s"
+        ),
+    )
+    args = parser.parse_args()
+
+    # Derive year from dataset
+    from policyengine_us import Microsimulation
+
+    print(f"Loading dataset: {args.dataset}")
+    sim = Microsimulation(dataset=args.dataset)
+    year = int(sim.default_calculation_period)
+    print(f"Derived year from dataset: {year}")
 
     # Extract ------------------------------
     state_admin_df = extract_administrative_medicaid_data(year)
diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py
index 48c1eb832..a8a80f5ce 100644
--- a/policyengine_us_data/db/etl_snap.py
+++ b/policyengine_us_data/db/etl_snap.py
@@ -1,3 +1,4 @@
+import argparse
 import logging
 import requests
 import zipfile
@@ -10,6 +11,8 @@
 
 from policyengine_us_data.storage import STORAGE_FOLDER
 
+DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5"
+
 from policyengine_us_data.db.create_database_tables import (
     Stratum,
     StratumConstraint,
@@ -363,7 +366,27 @@ def load_survey_snap_data(survey_df, year, snap_stratum_lookup):
 
 
 def main():
-    year = 2023
+    parser = argparse.ArgumentParser(
+        description="ETL for SNAP calibration targets"
+    )
+    parser.add_argument(
+        "--dataset",
+        default=DEFAULT_DATASET,
+        help=(
+            "Source dataset (local path or HuggingFace URL). "
+            "The year for targets is derived from the dataset's "
+            "default_calculation_period. Default: %(default)s"
+        ),
+    )
+    args = parser.parse_args()
+
+    # Derive year from dataset
+    from policyengine_us import Microsimulation
+
+    print(f"Loading dataset: {args.dataset}")
+    sim = Microsimulation(dataset=args.dataset)
+    year = int(sim.default_calculation_period)
+    print(f"Derived year from dataset: {year}")
 
     # Extract ---------
     zip_file_admin = extract_administrative_snap_data()
diff --git a/policyengine_us_data/db/etl_state_income_tax.py b/policyengine_us_data/db/etl_state_income_tax.py
index df0f40a6c..9da8d8390 100644
--- a/policyengine_us_data/db/etl_state_income_tax.py
+++ b/policyengine_us_data/db/etl_state_income_tax.py
@@ -10,12 +10,15 @@
 Stratum Group ID: 7 (State Income Tax)
 """
 
+import argparse
 import logging
 import pandas as pd
 import numpy as np
 from sqlmodel import Session, create_engine, select
 
 from policyengine_us_data.storage import STORAGE_FOLDER
+
+DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5"
 from policyengine_us_data.db.create_database_tables import (
     Stratum,
     StratumConstraint,
@@ -342,12 +345,32 @@ def load_state_income_tax_data(df: pd.DataFrame, year: int) -> dict:
 
 def main():
     """Run the full ETL pipeline for state income tax targets."""
+    parser = argparse.ArgumentParser(
+        description="ETL for state income tax calibration targets"
+    )
+    parser.add_argument(
+        "--dataset",
+        default=DEFAULT_DATASET,
+        help=(
+            "Source dataset (local path or HuggingFace URL). "
+            "The year for targets is derived from the dataset's "
+            "default_calculation_period. Default: %(default)s"
+        ),
+    )
+    args = parser.parse_args()
+
     logging.basicConfig(
         level=logging.INFO,
         format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
     )
 
-    year = 2023
+    # Derive year from dataset
+    from policyengine_us import Microsimulation
+
+    logger.info(f"Loading dataset: {args.dataset}")
+    sim = Microsimulation(dataset=args.dataset)
+    year = int(sim.default_calculation_period)
+    logger.info(f"Derived year from dataset: {year}")
 
     logger.info(f"Extracting Census STC data for FY{year}...")
     raw_df = extract_state_income_tax_data(year)

From bc139995717629b7eca5d63785f496a38b300b67 Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com" <baogorek@gmail.com>
Date: Mon, 2 Feb 2026 14:54:40 -0500
Subject: [PATCH 4/5] Add 119th Congress district code support for 2024 ACS
 data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Update parse_ucgid to recognize both 5001800US (118th Congress) and 5001900US (119th Congress)
- Expand Puerto Rico and territory filters to handle both Congress code formats
- Update TERRITORY_UCGIDS and NON_VOTING_GEO_IDS with 119th Congress codes

This ensures consistent redistricting alignment: 2024 ACS data uses 119th Congress
codes natively, and IRS SOI data is converted via the 116th→119th mapping matrix.

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
---
 changelog_entry.yaml                                       | 2 +-
 policyengine_us_data/db/etl_age.py                         | 7 +++++--
 policyengine_us_data/db/etl_snap.py                        | 3 ++-
 .../storage/calibration_targets/pull_soi_targets.py        | 7 +++++++
 policyengine_us_data/utils/census.py                       | 7 +++++++
 policyengine_us_data/utils/db.py                           | 5 ++++-
 6 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/changelog_entry.yaml b/changelog_entry.yaml
index 4bbfcf6f0..1d930f19e 100644
--- a/changelog_entry.yaml
+++ b/changelog_entry.yaml
@@ -1,3 +1,3 @@
 - date: 2026-02-02
   type: fixed
-  description: Fix stale calibration targets by deriving time_period from dataset across all ETL scripts and using income_tax_positive for CBO calibration
+  description: Fix stale calibration targets by deriving time_period from dataset across all ETL scripts, using income_tax_positive for CBO calibration, and adding 119th Congress district code support for consistent redistricting alignment
diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py
index 13853ca44..2e213d92b 100644
--- a/policyengine_us_data/db/etl_age.py
+++ b/policyengine_us_data/db/etl_age.py
@@ -70,9 +70,12 @@ def transform_age_data(age_data, docs):
     df = df.rename({"GEO_ID": "ucgid_str"}, axis=1)
     df_data = df.rename(columns=rename_mapping)[["ucgid_str"] + list(AGE_COLS)]
 
-    # Filter out Puerto Rico's district and state records, if needed
+    # Filter out Puerto Rico's district and state records
+    # 5001800US7298 = 118th Congress, 5001900US7298 = 119th Congress
     df_geos = df_data[
-        ~df_data["ucgid_str"].isin(["5001800US7298", "0400000US72"])
+        ~df_data["ucgid_str"].isin(
+            ["5001800US7298", "5001900US7298", "0400000US72"]
+        )
     ].copy()
 
     df = df_geos[["ucgid_str"] + AGE_COLS]
diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py
index a8a80f5ce..554f67ec1 100644
--- a/policyengine_us_data/db/etl_snap.py
+++ b/policyengine_us_data/db/etl_snap.py
@@ -152,9 +152,10 @@ def transform_survey_snap_data(raw_df):
         {"GEO_ID": "ucgid_str", "S2201_C03_001E": "snap_household_ct"}, axis=1
     )[
         ~df["GEO_ID"].isin(
-            [  # Puerto Rico's state and district
+            [  # Puerto Rico's state and district (118th and 119th Congress)
                 "0400000US72",
                 "5001800US7298",
+                "5001900US7298",
             ]
         )
     ]
diff --git a/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py b/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py
index 0b1f3dcb6..c3f159191 100644
--- a/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py
+++ b/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py
@@ -41,11 +41,18 @@
 NON_VOTING_STATES = {"US", "AS", "GU", "MP", "PR", "VI", "OA"}
 NON_VOTING_GEO_IDS = {
     "0400000US72",  # Puerto Rico (state level)
+    # 118th Congress codes
     "5001800US7298",  # Puerto Rico
     "5001800US6098",  # American Samoa
     "5001800US6698",  # Guam
     "5001800US6998",  # Northern Mariana Islands
     "5001800US7898",  # U.S. Virgin Islands
+    # 119th Congress codes
+    "5001900US7298",  # Puerto Rico
+    "5001900US6098",  # American Samoa
+    "5001900US6698",  # Guam
+    "5001900US6998",  # Northern Mariana Islands
+    "5001900US7898",  # U.S. Virgin Islands
 }
 
 # after skipping the first 7 rows, the national SOI file has targets as row indices [COUNT_INDEX, AMOUNT_INDEX]
diff --git a/policyengine_us_data/utils/census.py b/policyengine_us_data/utils/census.py
index cb9d0b5d8..c61cc166d 100644
--- a/policyengine_us_data/utils/census.py
+++ b/policyengine_us_data/utils/census.py
@@ -123,11 +123,18 @@
 
 TERRITORY_UCGIDS = {
     "0400000US72",  # Puerto Rico (state level)
+    # 118th Congress codes
     "5001800US7298",  # Puerto Rico
     "5001800US6098",  # American Samoa
     "5001800US6698",  # Guam
     "5001800US6998",  # Northern Mariana Islands
     "5001800US7898",  # U.S. Virgin Islands
+    # 119th Congress codes
+    "5001900US7298",  # Puerto Rico
+    "5001900US6098",  # American Samoa
+    "5001900US6698",  # Guam
+    "5001900US6998",  # Northern Mariana Islands
+    "5001900US7898",  # U.S. Virgin Islands
 }
 
 
diff --git a/policyengine_us_data/utils/db.py b/policyengine_us_data/utils/db.py
index 6c7b1a4ed..4de79c44f 100644
--- a/policyengine_us_data/utils/db.py
+++ b/policyengine_us_data/utils/db.py
@@ -82,7 +82,10 @@ def parse_ucgid(ucgid_str: str) -> Dict:
     elif ucgid_str.startswith("0400000US"):
         state_fips = int(ucgid_str[9:])
         return {"type": "state", "state_fips": state_fips}
-    elif ucgid_str.startswith("5001800US"):
+    elif ucgid_str.startswith("5001800US") or ucgid_str.startswith(
+        "5001900US"
+    ):
+        # 5001800US = 118th Congress, 5001900US = 119th Congress
         state_and_district = ucgid_str[9:]
         state_fips = int(state_and_district[:2])
         district_number = int(state_and_district[2:])

From 634a75d047232961302d62466b79138dc7175511 Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com" <baogorek@gmail.com>
Date: Tue, 3 Feb 2026 08:36:14 -0500
Subject: [PATCH 5/5] Use deterministic hash for medicaid_take_up_seed

---
 policyengine_us_data/datasets/cps/cps.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
index 84f01a8bb..9fdea7978 100644
--- a/policyengine_us_data/datasets/cps/cps.py
+++ b/policyengine_us_data/datasets/cps/cps.py
@@ -208,11 +208,20 @@ def add_takeup(self):
     data["takes_up_dc_ptc"] = (
         generator.random(len(data["tax_unit_id"])) < dc_ptc_takeup_rate
     )
+    # Deterministic Medicaid seed: hash each person_id into the [0, 1) range
+    # using a Knuth-style multiplicative hash (multiply, keep low 32 bits).
+    # The same person_id always yields the same seed, so the enrollment
+    # determination is reproducible across dataset rebuilds.
+    HASH_MULTIPLIER = 2654435761  # Knuth's constant
+    HASH_MODULUS = 2**32
+    data["medicaid_take_up_seed"] = (
+        (data["person_id"].astype(np.uint64) * HASH_MULTIPLIER) % HASH_MODULUS
+    ) / HASH_MODULUS
+
+    # SNAP and ACA seeds still use the seeded RNG; hash above is the template
     generator = np.random.default_rng(seed=100)
-
     data["snap_take_up_seed"] = generator.random(len(data["spm_unit_id"]))
     data["aca_take_up_seed"] = generator.random(len(data["tax_unit_id"]))
-    data["medicaid_take_up_seed"] = generator.random(len(data["person_id"]))
 
     self.save_dataset(data)