diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..1d930f19 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,3 @@ +- date: 2026-02-02 + type: fixed + description: Fix stale calibration targets by deriving time_period from dataset across all ETL scripts, using income_tax_positive for CBO calibration, and adding 119th Congress district code support for consistent redistricting alignment diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 84f01a8b..9fdea797 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -208,11 +208,20 @@ def add_takeup(self): data["takes_up_dc_ptc"] = ( generator.random(len(data["tax_unit_id"])) < dc_ptc_takeup_rate ) + # Deterministic seed for medicaid: hash person_id to [0, 1) range + # Uses Knuth multiplicative hash for good distribution + # This ensures same person_id always yields same seed, making + # enrollment determination reproducible across dataset rebuilds + HASH_MULTIPLIER = 2654435761 # Knuth's constant + HASH_MODULUS = 2**32 + data["medicaid_take_up_seed"] = ( + (data["person_id"].astype(np.uint64) * HASH_MULTIPLIER) % HASH_MODULUS + ) / HASH_MODULUS + + # SNAP and ACA seeds remain random for now (template above for conversion) generator = np.random.default_rng(seed=100) - data["snap_take_up_seed"] = generator.random(len(data["spm_unit_id"])) data["aca_take_up_seed"] = generator.random(len(data["tax_unit_id"])) - data["medicaid_take_up_seed"] = generator.random(len(data["person_id"])) self.save_dataset(data) diff --git a/policyengine_us_data/db/create_initial_strata.py b/policyengine_us_data/db/create_initial_strata.py index f3edb1b4..8dda76e2 100644 --- a/policyengine_us_data/db/create_initial_strata.py +++ b/policyengine_us_data/db/create_initial_strata.py @@ -1,3 +1,4 @@ +import argparse import logging from typing import Dict @@ -6,6 +7,8 @@ from sqlmodel import Session, create_engine from policyengine_us_data.storage import STORAGE_FOLDER + +DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -68,6 +71,28 @@ def fetch_congressional_districts(year): def main(): + parser = argparse.ArgumentParser( + description="Create initial geographic strata for calibration" + ) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The year for Census API calls is derived from the dataset's " + "default_calculation_period. Default: %(default)s" + ), + ) + args = parser.parse_args() + + # Derive year from dataset + from policyengine_us import Microsimulation + + print(f"Loading dataset: {args.dataset}") + sim = Microsimulation(dataset=args.dataset) + year = int(sim.default_calculation_period) + print(f"Derived year from dataset: {year}") + # State FIPS to name/abbreviation mapping STATE_NAMES = { 1: "Alabama (AL)", @@ -123,8 +148,7 @@ def main(): 56: "Wyoming (WY)", } - # Fetch congressional district data for year 2023 - year = 2023 + # Fetch congressional district data cd_df = fetch_congressional_districts(year) DATABASE_URL = ( diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py index 39ffedf2..2e213d92 100644 --- a/policyengine_us_data/db/etl_age.py +++ b/policyengine_us_data/db/etl_age.py @@ -1,9 +1,13 @@ +import argparse + import pandas as pd import numpy as np from sqlmodel import Session, create_engine, select from policyengine_us_data.storage import STORAGE_FOLDER +DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" + from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -66,9 +70,12 @@ def transform_age_data(age_data, docs): df = df.rename({"GEO_ID": "ucgid_str"}, axis=1) df_data = df.rename(columns=rename_mapping)[["ucgid_str"] + list(AGE_COLS)] - # Filter out Puerto Rico's district and state records, if needed + # Filter out Puerto Rico's district and state records + # 5001800US7298 = 118th Congress, 5001900US7298 = 119th Congress df_geos = df_data[ - ~df_data["ucgid_str"].isin(["5001800US7298", "0400000US72"]) + ~df_data["ucgid_str"].isin( + ["5001800US7298", "5001900US7298", "0400000US72"] + ) ].copy() df = df_geos[["ucgid_str"] + AGE_COLS] @@ -279,10 +286,30 @@ def load_age_data(df_long, geo, year): session.commit() -if __name__ == "__main__": +def main(): + parser = argparse.ArgumentParser( + description="ETL for age calibration targets" + ) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The year for Census API calls is derived from the dataset's " + "default_calculation_period. Default: %(default)s" + ), + ) + args = parser.parse_args() + + # Derive year from dataset + from policyengine_us import Microsimulation + + print(f"Loading dataset: {args.dataset}") + sim = Microsimulation(dataset=args.dataset) + year = int(sim.default_calculation_period) + print(f"Derived year from dataset: {year}") # --- ETL: Extract, Transform, Load ---- - year = 2023 # ---- Extract ---------- docs = get_census_docs(year) @@ -301,3 +328,7 @@ def load_age_data(df_long, geo, year): load_age_data(long_national_df, "National", year) load_age_data(long_state_df, "State", year) load_age_data(long_district_df, "District", year) + + +if __name__ == "__main__": + main() diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index ed4da4e5..873d7a07 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -1,3 +1,4 @@ +import argparse import logging from typing import Optional @@ -7,6 +8,11 @@ from sqlmodel import Session, create_engine, select from policyengine_us_data.storage import STORAGE_FOLDER + +DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" + +# IRS SOI data is typically available ~2 years after the tax year +IRS_SOI_LAG_YEARS = 2 from policyengine_us_data.utils.raw_cache import ( is_cached, cache_path, @@ -1207,9 +1213,40 @@ def load_soi_data(long_dfs, year): def main(): - # NOTE: predates the finalization of the 2020 Census redistricting - # and there is district mapping in the Transform step - year = 2022 + parser = argparse.ArgumentParser( + description="ETL for IRS SOI calibration targets" + ) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The year for IRS SOI data is derived from the dataset's " + "default_calculation_period minus IRS_SOI_LAG_YEARS. " + "Default: %(default)s" + ), + ) + parser.add_argument( + "--lag", + type=int, + default=IRS_SOI_LAG_YEARS, + help=( + "Years to subtract from dataset year for IRS SOI data " + "(default: %(default)s, since IRS data is ~2 years behind)" + ), + ) + args = parser.parse_args() + + # Derive year from dataset with lag applied + from policyengine_us import Microsimulation + + print(f"Loading dataset: {args.dataset}") + sim = Microsimulation(dataset=args.dataset) + dataset_year = int(sim.default_calculation_period) + year = dataset_year - args.lag + print( + f"Dataset year: {dataset_year}, IRS SOI year: {year} (lag={args.lag})" + ) # Extract ----------------------- raw_df = extract_soi_data() diff --git a/policyengine_us_data/db/etl_medicaid.py b/policyengine_us_data/db/etl_medicaid.py index ed184144..435ccd42 100644 --- a/policyengine_us_data/db/etl_medicaid.py +++ b/policyengine_us_data/db/etl_medicaid.py @@ -1,3 +1,4 @@ +import argparse import logging import requests @@ -7,6 +8,8 @@ from policyengine_us_data.storage import STORAGE_FOLDER +DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" + from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -325,7 +328,27 @@ def load_medicaid_data(long_state, long_cd, year): def main(): - year = 2024 + parser = argparse.ArgumentParser( + description="ETL for Medicaid calibration targets" + ) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The year for targets is derived from the dataset's " + "default_calculation_period. Default: %(default)s" + ), + ) + args = parser.parse_args() + + # Derive year from dataset + from policyengine_us import Microsimulation + + print(f"Loading dataset: {args.dataset}") + sim = Microsimulation(dataset=args.dataset) + year = int(sim.default_calculation_period) + print(f"Derived year from dataset: {year}") # Extract ------------------------------ state_admin_df = extract_administrative_medicaid_data(year) diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py index 7e02d6f0..fd97b83f 100644 --- a/policyengine_us_data/db/etl_national_targets.py +++ b/policyengine_us_data/db/etl_national_targets.py @@ -1,3 +1,5 @@ +import argparse + from sqlmodel import Session, create_engine import pandas as pd @@ -12,11 +14,19 @@ get_or_create_source, ) +DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" + -def extract_national_targets(): +def extract_national_targets(dataset: str = DEFAULT_DATASET): """ Extract national calibration targets from various sources. + Parameters + ---------- + dataset : str + Path to the calibration dataset (local path or HuggingFace URL). + The time period is derived from the dataset's default_calculation_period. + Returns ------- dict @@ -26,18 +36,17 @@ def extract_national_targets(): - conditional_count_targets: Enrollment counts requiring constraints - cbo_targets: List of CBO projection targets - treasury_targets: List of Treasury/JCT targets + - time_period: The year derived from the dataset """ - - # Initialize PolicyEngine for parameter access from policyengine_us import Microsimulation - sim = Microsimulation( - dataset="hf://policyengine/policyengine-us-data/cps_2023.h5" - ) + print(f"Loading dataset: {dataset}") + sim = Microsimulation(dataset=dataset) + + time_period = int(sim.default_calculation_period) + print(f"Derived time_period from dataset: {time_period}") - # Direct sum targets - these are regular variables that can be summed - # Store with their actual source year (2024 for hardcoded values from loss.py) - HARDCODED_YEAR = 2024 + # Direct sum targets - use the time_period derived from the dataset # Separate tax-related targets that need filer constraint tax_filer_targets = [ @@ -46,35 +55,35 @@ def extract_national_targets(): "value": 21.247e9, "source": "Joint Committee on Taxation", "notes": "SALT deduction tax expenditure", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "medical_expense_deduction", "value": 11.4e9, "source": "Joint Committee on Taxation", "notes": "Medical expense deduction tax expenditure", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "charitable_deduction", "value": 65.301e9, "source": "Joint Committee on Taxation", "notes": "Charitable deduction tax expenditure", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "interest_deduction", "value": 24.8e9, "source": "Joint Committee on Taxation", "notes": "Mortgage interest deduction tax expenditure", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "qualified_business_income_deduction", "value": 63.1e9, "source": "Joint Committee on Taxation", "notes": "QBI deduction tax expenditure", - "year": HARDCODED_YEAR, + "year": time_period, }, ] @@ -84,112 +93,112 @@ def extract_national_targets(): "value": 13e9, "source": "Survey-reported (post-TCJA grandfathered)", "notes": "Alimony received - survey reported, not tax-filer restricted", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "alimony_expense", "value": 13e9, "source": "Survey-reported (post-TCJA grandfathered)", "notes": "Alimony paid - survey reported, not tax-filer restricted", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "medicaid", "value": 871.7e9, "source": "https://www.cms.gov/files/document/highlights.pdf", "notes": "CMS 2023 highlights document - total Medicaid spending", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "net_worth", "value": 160e12, "source": "Federal Reserve SCF", "notes": "Total household net worth", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "health_insurance_premiums_without_medicare_part_b", "value": 385e9, "source": "MEPS/NHEA", "notes": "Health insurance premiums excluding Medicare Part B", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "other_medical_expenses", "value": 278e9, "source": "MEPS/NHEA", "notes": "Out-of-pocket medical expenses", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "medicare_part_b_premiums", "value": 112e9, "source": "CMS Medicare data", "notes": "Medicare Part B premium payments", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "over_the_counter_health_expenses", "value": 72e9, "source": "Consumer Expenditure Survey", "notes": "OTC health products and supplies", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "child_support_expense", "value": 33e9, "source": "Census Bureau", "notes": "Child support payments", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "child_support_received", "value": 33e9, "source": "Census Bureau", "notes": "Child support received", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "spm_unit_capped_work_childcare_expenses", "value": 348e9, "source": "Census Bureau SPM", "notes": "Work and childcare expenses for SPM", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "spm_unit_capped_housing_subsidy", "value": 35e9, "source": "HUD/Census", "notes": "Housing subsidies", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "tanf", "value": 9e9, "source": "HHS/ACF", "notes": "TANF cash assistance", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "real_estate_taxes", "value": 500e9, "source": "Census Bureau", "notes": "Property taxes paid", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "rent", "value": 735e9, "source": "Census Bureau/BLS", "notes": "Rental payments", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "tip_income", "value": 53.2e9, "source": "IRS Form W-2 Box 7 statistics", "notes": "Social security tips uprated 40% to account for underreporting", - "year": HARDCODED_YEAR, + "year": time_period, }, # SSA benefit-type totals derived from trust fund data and # SSA fact sheet type shares @@ -198,28 +207,28 @@ def extract_national_targets(): "value": 1_060e9, "source": "https://www.ssa.gov/OACT/STATS/table4a3.html", "notes": "~73% of total OASDI ($1,452B CBO projection)", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "social_security_disability", "value": 148e9, "source": "https://www.ssa.gov/OACT/STATS/table4a3.html", "notes": "~10.2% of total OASDI (disabled workers)", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "social_security_survivors", "value": 160e9, "source": "https://www.ssa.gov/OACT/FACTS/", "notes": "~11.0% of total OASDI (widows, children of deceased)", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "social_security_dependents", "value": 84e9, "source": "https://www.ssa.gov/OACT/FACTS/", "notes": "~5.8% of total OASDI (spouses/children of retired+disabled)", - "year": HARDCODED_YEAR, + "year": time_period, }, # IRA contribution totals from IRS SOI accumulation tables { @@ -227,14 +236,14 @@ def extract_national_targets(): "value": 25e9, "source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements", "notes": "Tax year 2022 (~5M x $4,510 avg) uprated ~12% to 2024", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "roth_ira_contributions", "value": 39e9, "source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements", "notes": "Tax year 2022 (~10M x $3,482 avg) uprated ~12% to 2024", - "year": HARDCODED_YEAR, + "year": time_period, }, ] @@ -247,7 +256,7 @@ def extract_national_targets(): "person_count": 72_429_055, "source": "CMS/HHS administrative data", "notes": "Medicaid enrollment count", - "year": HARDCODED_YEAR, + "year": time_period, }, { "constraint_variable": "aca_ptc", @@ -255,7 +264,7 @@ def extract_national_targets(): "person_count": 19_743_689, "source": "CMS marketplace data", "notes": "ACA Premium Tax Credit recipients", - "year": HARDCODED_YEAR, + "year": time_period, }, ] @@ -302,8 +311,7 @@ def extract_national_targets(): conditional_count_targets.extend(ssn_none_targets_by_year) - # CBO projection targets - get for a specific year - CBO_YEAR = 2023 # Year the CBO projections are for + # CBO projection targets - use time_period derived from dataset cbo_vars = [ # Note: income_tax_positive matches CBO's receipts definition # where refundable credit payments in excess of liability are @@ -326,7 +334,7 @@ def extract_national_targets(): param_name = cbo_param_name_map.get(variable_name, variable_name) try: value = sim.tax_benefit_system.parameters( - CBO_YEAR + time_period ).calibration.gov.cbo._children[param_name] cbo_targets.append( { @@ -334,7 +342,7 @@ def extract_national_targets(): "value": float(value), "source": "CBO Budget Projections", "notes": f"CBO projection for {variable_name}", - "year": CBO_YEAR, + "year": time_period, } ) except (KeyError, AttributeError) as e: @@ -343,11 +351,10 @@ def extract_national_targets(): f"{variable_name} (param: {param_name}): {e}" ) - # Treasury/JCT targets (EITC) - get for a specific year - TREASURY_YEAR = 2023 + # Treasury/JCT targets (EITC) - use time_period derived from dataset try: eitc_value = sim.tax_benefit_system.parameters.calibration.gov.treasury.tax_expenditures.eitc( - TREASURY_YEAR + time_period ) treasury_targets = [ { @@ -355,7 +362,7 @@ def extract_national_targets(): "value": float(eitc_value), "source": "Treasury/JCT Tax Expenditures", "notes": "EITC tax expenditure", - "year": TREASURY_YEAR, + "year": time_period, } ] except (KeyError, AttributeError) as e: @@ -368,6 +375,7 @@ def extract_national_targets(): "conditional_count_targets": conditional_count_targets, "cbo_targets": cbo_targets, "treasury_targets": treasury_targets, + "time_period": time_period, } @@ -707,10 +715,25 @@ def load_national_targets( def main(): """Main ETL pipeline for national targets.""" + parser = argparse.ArgumentParser( + description="ETL for national calibration targets" + ) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The time_period for targets is derived from the dataset's " + "default_calculation_period. Default: %(default)s" + ), + ) + args = parser.parse_args() # Extract print("Extracting national targets...") - raw_targets = extract_national_targets() + raw_targets = extract_national_targets(dataset=args.dataset) + time_period = raw_targets["time_period"] + print(f"Using time_period={time_period} for CBO/Treasury targets") # Transform print("Transforming targets...") diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py index 48c1eb83..554f67ec 100644 --- a/policyengine_us_data/db/etl_snap.py +++ b/policyengine_us_data/db/etl_snap.py @@ -1,3 +1,4 @@ +import argparse import logging import requests import zipfile @@ -10,6 +11,8 @@ from policyengine_us_data.storage import STORAGE_FOLDER +DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" + from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -149,9 +152,10 @@ def transform_survey_snap_data(raw_df): {"GEO_ID": "ucgid_str", "S2201_C03_001E": "snap_household_ct"}, axis=1 )[ ~df["GEO_ID"].isin( - [ # Puerto Rico's state and district + [ # Puerto Rico's state and district (118th and 119th Congress) "0400000US72", "5001800US7298", + "5001900US7298", ] ) ] @@ -363,7 +367,27 @@ def load_survey_snap_data(survey_df, year, snap_stratum_lookup): def main(): - year = 2023 + parser = argparse.ArgumentParser( + description="ETL for SNAP calibration targets" + ) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The year for targets is derived from the dataset's " + "default_calculation_period. Default: %(default)s" + ), + ) + args = parser.parse_args() + + # Derive year from dataset + from policyengine_us import Microsimulation + + print(f"Loading dataset: {args.dataset}") + sim = Microsimulation(dataset=args.dataset) + year = int(sim.default_calculation_period) + print(f"Derived year from dataset: {year}") # Extract --------- zip_file_admin = extract_administrative_snap_data() diff --git a/policyengine_us_data/db/etl_state_income_tax.py b/policyengine_us_data/db/etl_state_income_tax.py index df0f40a6..9da8d839 100644 --- a/policyengine_us_data/db/etl_state_income_tax.py +++ b/policyengine_us_data/db/etl_state_income_tax.py @@ -10,12 +10,15 @@ Stratum Group ID: 7 (State Income Tax) """ +import argparse import logging import pandas as pd import numpy as np from sqlmodel import Session, create_engine, select from policyengine_us_data.storage import STORAGE_FOLDER + +DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -342,12 +345,32 @@ def load_state_income_tax_data(df: pd.DataFrame, year: int) -> dict: def main(): """Run the full ETL pipeline for state income tax targets.""" + parser = argparse.ArgumentParser( + description="ETL for state income tax calibration targets" + ) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The year for targets is derived from the dataset's " + "default_calculation_period. Default: %(default)s" + ), + ) + args = parser.parse_args() + logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", ) - year = 2023 + # Derive year from dataset + from policyengine_us import Microsimulation + + logger.info(f"Loading dataset: {args.dataset}") + sim = Microsimulation(dataset=args.dataset) + year = int(sim.default_calculation_period) + logger.info(f"Derived year from dataset: {year}") logger.info(f"Extracting Census STC data for FY{year}...") raw_df = extract_state_income_tax_data(year) diff --git a/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py b/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py index 0b1f3dcb..c3f15919 100644 --- a/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py +++ b/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py @@ -41,11 +41,18 @@ NON_VOTING_STATES = {"US", "AS", "GU", "MP", "PR", "VI", "OA"} NON_VOTING_GEO_IDS = { "0400000US72", # Puerto Rico (state level) + # 118th Congress codes "5001800US7298", # Puerto Rico "5001800US6098", # American Samoa "5001800US6698", # Guam "5001800US6998", # Northern Mariana Islands "5001800US7898", # U.S. Virgin Islands + # 119th Congress codes + "5001900US7298", # Puerto Rico + "5001900US6098", # American Samoa + "5001900US6698", # Guam + "5001900US6998", # Northern Mariana Islands + "5001900US7898", # U.S. Virgin Islands } # after skipping the first 7 rows, the national SOI file has targets as row indices [COUNT_INDEX, AMOUNT_INDEX] diff --git a/policyengine_us_data/utils/census.py b/policyengine_us_data/utils/census.py index cb9d0b5d..c61cc166 100644 --- a/policyengine_us_data/utils/census.py +++ b/policyengine_us_data/utils/census.py @@ -123,11 +123,18 @@ TERRITORY_UCGIDS = { "0400000US72", # Puerto Rico (state level) + # 118th Congress codes "5001800US7298", # Puerto Rico "5001800US6098", # American Samoa "5001800US6698", # Guam "5001800US6998", # Northern Mariana Islands "5001800US7898", # U.S. Virgin Islands + # 119th Congress codes + "5001900US7298", # Puerto Rico + "5001900US6098", # American Samoa + "5001900US6698", # Guam + "5001900US6998", # Northern Mariana Islands + "5001900US7898", # U.S. Virgin Islands } diff --git a/policyengine_us_data/utils/db.py b/policyengine_us_data/utils/db.py index 6c7b1a4e..4de79c44 100644 --- a/policyengine_us_data/utils/db.py +++ b/policyengine_us_data/utils/db.py @@ -82,7 +82,10 @@ def parse_ucgid(ucgid_str: str) -> Dict: elif ucgid_str.startswith("0400000US"): state_fips = int(ucgid_str[9:]) return {"type": "state", "state_fips": state_fips} - elif ucgid_str.startswith("5001800US"): + elif ucgid_str.startswith("5001800US") or ucgid_str.startswith( + "5001900US" + ): + # 5001800US = 118th Congress, 5001900US = 119th Congress state_and_district = ucgid_str[9:] state_fips = int(state_and_district[:2]) district_number = int(state_and_district[2:]) diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index f798c0dc..e9916641 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -220,26 +220,35 @@ def build_loss_matrix(dataset: type, time_period): targets_array.append(populations[year]) # CBO projections + # Note: income_tax_positive matches CBO's receipts definition where + # refundable credit payments in excess of liability are classified as + # outlays, not negative receipts. See: https://www.cbo.gov/publication/43767 - PROGRAMS = [ - "income_tax", + CBO_PROGRAMS = [ + "income_tax_positive", "snap", "social_security", "ssi", "unemployment_compensation", ] - for variable_name in PROGRAMS: + # Mapping from variable name to CBO parameter name (when different) + CBO_PARAM_NAME_MAP = { + "income_tax_positive": "income_tax", + } + + for variable_name in CBO_PROGRAMS: label = f"nation/cbo/{variable_name}" loss_matrix[label] = sim.calculate( variable_name, map_to="household" ).values if any(loss_matrix[label].isna()): raise ValueError(f"Missing values for {label}") + param_name = CBO_PARAM_NAME_MAP.get(variable_name, variable_name) targets_array.append( sim.tax_benefit_system.parameters( time_period - ).calibration.gov.cbo._children[variable_name] + ).calibration.gov.cbo._children[param_name] ) # 1. Medicaid Spending