diff --git a/Makefile b/Makefile
index b03e23d55..47e3c5a74 100644
--- a/Makefile
+++ b/Makefile
@@ -60,6 +60,7 @@ database:
 	python policyengine_us_data/db/etl_medicaid.py
 	python policyengine_us_data/db/etl_snap.py
 	python policyengine_us_data/db/etl_irs_soi.py
+	python policyengine_us_data/db/scale_irs_soi_to_cbo.py
 	python policyengine_us_data/db/validate_database.py

 data:
diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
index f932e0d5e..ae9e37904 100644
--- a/policyengine_us_data/datasets/cps/cps.py
+++ b/policyengine_us_data/datasets/cps/cps.py
@@ -15,7 +15,6 @@
 from microimpute.models.qrf import QRF

 import logging

-
 test_lite = os.environ.get("TEST_LITE") == "true"
 print(f"TEST_LITE == {test_lite}")
diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py
index 8bbe67bcc..4eb0a660b 100644
--- a/policyengine_us_data/datasets/cps/enhanced_cps.py
+++ b/policyengine_us_data/datasets/cps/enhanced_cps.py
@@ -22,7 +22,6 @@
 from pathlib import Path
 import logging

-
 try:
     import torch
 except ImportError:
diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py
index cac9ad61a..7650d9e7f 100644
--- a/policyengine_us_data/datasets/puf/puf.py
+++ b/policyengine_us_data/datasets/puf/puf.py
@@ -15,7 +15,6 @@
     create_policyengine_uprating_factors_table,
 )

-
 rng = np.random.default_rng(seed=64)

 # Get Qualified Business Income simulation parameters
diff --git a/policyengine_us_data/datasets/puf/uprate_puf.py b/policyengine_us_data/datasets/puf/uprate_puf.py
index 1cf0eb9c6..961446156 100644
--- a/policyengine_us_data/datasets/puf/uprate_puf.py
+++ b/policyengine_us_data/datasets/puf/uprate_puf.py
@@ -2,7 +2,6 @@
 import numpy as np

 from policyengine_us_data.storage import STORAGE_FOLDER

-
 ITMDED_GROW_RATE = 0.02  # annual growth rate in itemized deduction amounts
 USE_VARIABLE_SPECIFIC_POPULATION_GROWTH_DIVISORS = False
diff --git a/policyengine_us_data/db/create_database_tables.py b/policyengine_us_data/db/create_database_tables.py
index df03772d0..920d1449e 100644
--- a/policyengine_us_data/db/create_database_tables.py
+++ b/policyengine_us_data/db/create_database_tables.py
@@ -15,7 +15,6 @@

 from policyengine_us_data.storage import STORAGE_FOLDER

-
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py
index bb83067c4..2f8b4b3df 100644
--- a/policyengine_us_data/db/etl_age.py
+++ b/policyengine_us_data/db/etl_age.py
@@ -11,7 +11,6 @@
 )
 from policyengine_us_data.utils.census import get_census_docs, pull_acs_table

-
 LABEL_TO_SHORT = {
     "Estimate!!Total!!Total population!!AGE!!Under 5 years": "0-4",
     "Estimate!!Total!!Total population!!AGE!!5 to 9 years": "5-9",
@@ -185,7 +184,7 @@ def load_age_data(df_long, geo, year, stratum_lookup=None):

 if __name__ == "__main__":
     # --- ETL: Extract, Transform, Load ----
-    year = 2023
+    year = 2024

     # ---- Extract ----------
     docs = get_census_docs(year)
diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py
index 786abb1cc..6607a5dd6 100644
--- a/policyengine_us_data/db/etl_irs_soi.py
+++ b/policyengine_us_data/db/etl_irs_soi.py
@@ -24,7 +24,6 @@
     get_district_mapping,
 )

-
 """See the 22incddocguide.docx manual from the IRS SOI"""
 # Let's make this work with strict inequalities
 # Language in the doc: '$10,000 under $25,000'
diff --git a/policyengine_us_data/db/etl_medicaid.py b/policyengine_us_data/db/etl_medicaid.py
index 926a0d88c..e3d46d5c7 100644
--- a/policyengine_us_data/db/etl_medicaid.py
+++ b/policyengine_us_data/db/etl_medicaid.py
@@ -194,7 +194,7 @@ def load_medicaid_data(long_state, long_cd, year):


 if __name__ == "__main__":
-    year = 2023
+    year = 2024

     # Extract ------------------------------
     cd_survey_df, state_admin_df = extract_medicaid_data(year)
diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py
index 1fba44a46..a39411ef5 100644
--- a/policyengine_us_data/db/etl_snap.py
+++ b/policyengine_us_data/db/etl_snap.py
@@ -20,7 +20,7 @@
 )


-def extract_administrative_snap_data(year=2023):
+def extract_administrative_snap_data(year=2024):
     """
     Downloads and extracts annual state-level SNAP data from the USDA FNS zip file.
     """
@@ -280,7 +280,7 @@ def load_survey_snap_data(survey_df, year, stratum_lookup=None):


 def main():
-    year = 2023
+    year = 2024

     # Extract ---------
     zip_file_admin = extract_administrative_snap_data()
diff --git a/policyengine_us_data/db/scale_irs_soi_to_cbo.py b/policyengine_us_data/db/scale_irs_soi_to_cbo.py
new file mode 100644
index 000000000..6d8cdccc9
--- /dev/null
+++ b/policyengine_us_data/db/scale_irs_soi_to_cbo.py
@@ -0,0 +1,230 @@
+"""Scale IRS SOI 2022 database targets to 2024 using CBO projections.
+
+The IRS SOI congressional-district data (22incd.csv) is the most recent
+available; 2023/2024 CD data has not been published yet. To align the
+DB targets with the 2024 simulation year we scale each variable's
+aggregate to match the corresponding CBO / Treasury projection, using
+the same parameter values that the enhanced CPS calibration uses in
+loss.py.
+
+Only targets with source_id=5 (IRS SOI ETL) are affected. Census,
+Medicaid, and SNAP targets already pull 2024 data directly from their
+administrative sources.
+
+See: https://github.com/PolicyEngine/policyengine-us-data/issues/503
+"""
+
+import logging
+from typing import Dict, Tuple
+
+from sqlalchemy import text
+from sqlmodel import Session, create_engine, select
+
+from policyengine_us.system import system
+from policyengine_us_data.storage import STORAGE_FOLDER
+from policyengine_us_data.db.create_database_tables import Target
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+)
+logger = logging.getLogger(__name__)
+
+SOI_SOURCE_ID = 5
+SOI_YEAR = 2022
+TARGET_YEAR = 2024
+
+
+def _get_cbo_targets(year: int) -> Dict[str, float]:
+    """CBO / Treasury national totals for IRS SOI variables.
+
+    Mirrors the targets used in loss.py for enhanced CPS calibration.
+
+    Args:
+        year: Simulation year.
+
+    Returns:
+        Mapping of DB variable name to national total.
+ """ + p = system.parameters(year) + cbo = p.calibration.gov.cbo + cbo_inc = cbo.income_by_source + soi = p.calibration.gov.irs.soi + + # IRS SOI total returns (sum by filing status) + soi_total_returns = sum( + v + for v in soi.returns_by_filing_status._children.values() + if isinstance(v, (int, float)) + ) + + return { + # CBO budget projections + "income_tax": cbo._children["income_tax"], + "unemployment_compensation": cbo._children[ + "unemployment_compensation" + ], + # Treasury + "eitc": ( + system.parameters.calibration.gov.treasury.tax_expenditures.eitc( + year + ) + ), + # CBO income-by-source + "adjusted_gross_income": cbo_inc._children["adjusted_gross_income"], + "taxable_social_security": cbo_inc._children[ + "taxable_social_security" + ], + "taxable_pension_income": cbo_inc._children["taxable_pension_income"], + "net_capital_gain": cbo_inc._children["net_capital_gain"], + # IRS SOI projections + "qualified_dividend_income": soi._children[ + "qualified_dividend_income" + ], + "taxable_interest_income": soi._children["taxable_interest_income"], + "tax_exempt_interest_income": soi._children[ + "tax_exempt_interest_income" + ], + "tax_unit_partnership_s_corp_income": soi._children[ + "partnership_s_corp_income" + ], + "dividend_income": ( + soi._children["qualified_dividend_income"] + + soi._children["non_qualified_dividend_income"] + ), + # Return counts + "person_count": soi_total_returns, + } + + +def _compute_state_aggregate( + session: Session, variable: str +) -> Tuple[float, int]: + """Sum state-level IRS SOI targets for *variable*. + + Uses raw SQL to avoid the USVariable enum deserialization issue. + + Args: + session: Active database session. + variable: Target variable name. + + Returns: + (sum of state-level values, row count). + """ + result = session.execute( + text(""" + SELECT COALESCE(SUM(t.value), 0) AS total, + COUNT(*) AS cnt + FROM targets t + JOIN stratum_constraints sc + ON sc.stratum_id = t.stratum_id + WHERE t.variable = :variable + AND t.source_id = :source_id + AND t.active = 1 + AND sc.constraint_variable = 'ucgid_str' + AND sc.value LIKE '0400000US%' + """), + {"variable": variable, "source_id": SOI_SOURCE_ID}, + ) + row = result.one() + return float(row.total), int(row.cnt) + + +def _scale_targets( + session: Session, + variable: str, + scale_factor: float, + target_year: int, +) -> int: + """Scale all IRS SOI targets for *variable* and update period. + + Args: + session: Active database session. + variable: Target variable name. + scale_factor: Multiplicative adjustment. + target_year: New period value. + + Returns: + Number of rows updated. + """ + stmt = ( + select(Target) + .where(Target.variable == variable) + .where(Target.source_id == SOI_SOURCE_ID) + ) + all_targets = session.exec(stmt).all() + + updated = 0 + for t in all_targets: + if t.value is not None: + t.value *= scale_factor + t.period = target_year + session.add(t) + updated += 1 + + return updated + + +def scale_soi_to_cbo( + session: Session, + target_year: int = TARGET_YEAR, +) -> Dict[str, float]: + """Scale IRS SOI DB targets to CBO 2024 projections. + + For each variable with a CBO/Treasury projection: + 1. Sum current state-level DB targets. + 2. Compute scale factor = CBO target / DB aggregate. + 3. Apply proportionally to all geographic levels. + + Args: + session: Active database session. + target_year: Simulation year. + + Returns: + Mapping of variable name to scale factor applied. 
+ """ + cbo_targets = _get_cbo_targets(target_year) + scale_factors: Dict[str, float] = {} + + for variable, cbo_value in cbo_targets.items(): + state_sum, state_count = _compute_state_aggregate(session, variable) + + if state_sum == 0: + logger.warning( + "Skipping '%s': no state-level SOI targets", + variable, + ) + continue + + scale = cbo_value / state_sum + pct = (scale - 1) * 100 + + logger.info( + "%-35s %4d states %.3g -> %.3g " "(x%.4f, %+.1f%%)", + variable, + state_count, + state_sum, + cbo_value, + scale, + pct, + ) + + n = _scale_targets(session, variable, scale, target_year) + logger.info(" Updated %d rows", n) + scale_factors[variable] = scale + + session.commit() + logger.info("IRS SOI -> CBO scaling complete.") + return scale_factors + + +def main() -> None: + db_url = f"sqlite:///{STORAGE_FOLDER / 'policy_data.db'}" + engine = create_engine(db_url) + + with Session(engine) as session: + scale_soi_to_cbo(session) + + +if __name__ == "__main__": + main() diff --git a/policyengine_us_data/db/validate_database.py b/policyengine_us_data/db/validate_database.py index fee6a49dc..53ac09852 100644 --- a/policyengine_us_data/db/validate_database.py +++ b/policyengine_us_data/db/validate_database.py @@ -9,7 +9,6 @@ import pandas as pd from policyengine_us.system import system - conn = sqlite3.connect("policyengine_us_data/storage/policy_data.db") stratum_constraints_df = pd.read_sql("SELECT * FROM stratum_constraints", conn) diff --git a/policyengine_us_data/storage/calibration_targets/pull_snap_targets.py b/policyengine_us_data/storage/calibration_targets/pull_snap_targets.py index 349e6fbdd..1830bdb3a 100644 --- a/policyengine_us_data/storage/calibration_targets/pull_snap_targets.py +++ b/policyengine_us_data/storage/calibration_targets/pull_snap_targets.py @@ -9,7 +9,6 @@ STATE_NAME_TO_ABBREV, ) - STATE_NAME_TO_FIPS = { "Alabama": "01", "Alaska": "02", diff --git a/policyengine_us_data/tests/test_datasets/test_county_fips.py b/policyengine_us_data/tests/test_datasets/test_county_fips.py index ad1f10c5c..d692cf559 100644 --- a/policyengine_us_data/tests/test_datasets/test_county_fips.py +++ b/policyengine_us_data/tests/test_datasets/test_county_fips.py @@ -10,7 +10,6 @@ LOCAL_FOLDER, ) - # Sample data that mimics the format from census.gov SAMPLE_CENSUS_DATA = """STATE|STATEFP|COUNTYFP|COUNTYNAME AL|01|001|Autauga County diff --git a/policyengine_us_data/tests/test_scale_irs_soi_to_cbo.py b/policyengine_us_data/tests/test_scale_irs_soi_to_cbo.py new file mode 100644 index 000000000..b947723e1 --- /dev/null +++ b/policyengine_us_data/tests/test_scale_irs_soi_to_cbo.py @@ -0,0 +1,137 @@ +"""Tests for CBO-based scaling of IRS SOI database targets.""" + +import pytest +from sqlmodel import Session, select + +from policyengine_us_data.db.create_database_tables import ( + Stratum, + StratumConstraint, + Target, + create_database, +) +from policyengine_us_data.db.scale_irs_soi_to_cbo import ( + SOI_SOURCE_ID, + TARGET_YEAR, + _compute_state_aggregate, + _get_cbo_targets, + _scale_targets, + scale_soi_to_cbo, +) + + +@pytest.fixture +def engine(tmp_path): + db_uri = f"sqlite:///{tmp_path / 'test.db'}" + return create_database(db_uri) + + +def _make_stratum(session, ucgid): + """Helper: create a stratum with a ucgid_str constraint.""" + s = Stratum(stratum_group_id=0, notes=f"Geo: {ucgid}") + s.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=ucgid, + ) + ] + session.add(s) + session.flush() + return s + + +CBO_VARIABLES = [ + 
"income_tax", + "unemployment_compensation", + "eitc", + "adjusted_gross_income", + "taxable_social_security", + "taxable_pension_income", + "net_capital_gain", + "qualified_dividend_income", + "taxable_interest_income", + "tax_exempt_interest_income", + "tax_unit_partnership_s_corp_income", + "dividend_income", + "person_count", +] + + +def test_cbo_targets_are_positive(): + """All CBO targets return positive 2024 values.""" + targets = _get_cbo_targets(TARGET_YEAR) + for name in CBO_VARIABLES: + assert name in targets, f"'{name}' missing" + assert targets[name] > 0, f"{name} = {targets[name]}" + + +def test_only_soi_targets_are_scaled(engine): + """Scaling only affects source_id=5, not other sources.""" + with Session(engine) as session: + st = _make_stratum(session, "0400000US06") + st.targets_rel = [ + Target( + variable="income_tax", + period=2022, + value=300e9, + source_id=SOI_SOURCE_ID, + active=True, + ), + ] + st2 = _make_stratum(session, "0400000US48") + st2.targets_rel = [ + Target( + variable="person_count", + period=2023, + value=30_000_000, + source_id=1, # Census age, not SOI + active=True, + ), + ] + session.commit() + + _scale_targets(session, "income_tax", 1.5, 2024) + session.commit() + + soi_t = session.exec( + select(Target) + .where(Target.variable == "income_tax") + .where(Target.source_id == SOI_SOURCE_ID) + ).one() + assert soi_t.value == pytest.approx(450e9) + assert soi_t.period == 2024 + + census_t = session.exec( + select(Target) + .where(Target.variable == "person_count") + .where(Target.source_id == 1) + ).one() + assert census_t.value == pytest.approx(30_000_000) + assert census_t.period == 2023 + + +def test_end_to_end_scaling(engine): + """After scaling, state aggregate matches CBO target.""" + cbo = _get_cbo_targets(TARGET_YEAR) + cbo_income_tax = cbo["income_tax"] + + with Session(engine) as session: + for fips, share in [("01", 0.02), ("06", 0.15), ("48", 0.10)]: + val = cbo_income_tax * share * 0.5 # intentionally stale + st = _make_stratum(session, f"0400000US{fips}") + st.targets_rel = [ + Target( + variable="income_tax", + period=2022, + value=val, + source_id=SOI_SOURCE_ID, + active=True, + ) + ] + session.commit() + + factors = scale_soi_to_cbo(session, TARGET_YEAR) + + assert "income_tax" in factors + new_total, _ = _compute_state_aggregate(session, "income_tax") + assert new_total == pytest.approx(cbo_income_tax, rel=1e-6) diff --git a/policyengine_us_data/utils/census.py b/policyengine_us_data/utils/census.py index 2f424ccb8..8081b6162 100644 --- a/policyengine_us_data/utils/census.py +++ b/policyengine_us_data/utils/census.py @@ -4,7 +4,6 @@ import pandas as pd import numpy as np - STATE_NAME_TO_FIPS = { "Alabama": "01", "Alaska": "02", diff --git a/policyengine_us_data/utils/huggingface.py b/policyengine_us_data/utils/huggingface.py index 7b8c07928..d9b2f8a51 100644 --- a/policyengine_us_data/utils/huggingface.py +++ b/policyengine_us_data/utils/huggingface.py @@ -1,7 +1,6 @@ from huggingface_hub import hf_hub_download, login, HfApi import os - TOKEN = os.environ.get("HUGGING_FACE_TOKEN") if not TOKEN: raise ValueError( diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index cbea6dabb..e368d5048 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -9,7 +9,6 @@ from policyengine_core.reforms import Reform from policyengine_us_data.utils.soi import pe_to_soi, get_soi - # CPS-derived statistics # Medical expenses, sum of spm thresholds # Child support expenses