diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl new file mode 100644 index 00000000..a9dbe60f --- /dev/null +++ b/.beads/issues.jsonl @@ -0,0 +1,4 @@ +{"id":"policyengine-us-data-apq","title":"Add age and demographics to pre-tax contribution QRF imputation","description":"The QRF in puf.py that imputes pre_tax_contributions from CPS to PUF uses only employment_income as a predictor. Age, filing status, and number of dependents are strong predictors of 401(k) participation and contribution rates. Adding these should improve the distributional accuracy.","status":"closed","priority":2,"issue_type":"feature","created_at":"2026-01-31T08:01:22.72749-05:00","updated_at":"2026-01-31T08:08:02.675063-05:00","closed_at":"2026-01-31T08:08:02.675063-05:00"} +{"id":"policyengine-us-data-jhh","title":"Parameterize retirement contribution limits by year","description":"The contribution waterfall in cps.py hardcodes 2022 limits ($20,500 401k, $6,500 catch-up, $6,000 IRA, $1,000 IRA catch-up). These should be pulled from PolicyEngine parameters or a year-indexed lookup so the dataset builds correctly for any year.","status":"closed","priority":2,"issue_type":"bug","created_at":"2026-01-31T08:01:18.941246-05:00","updated_at":"2026-01-31T08:08:02.614396-05:00","closed_at":"2026-01-31T08:08:02.614396-05:00"} +{"id":"policyengine-us-data-mnw","title":"Use SS_SC source code for Social Security retirement/disability split","description":"Currently cps.py uses a hard age-62 cutoff to split SS into retirement vs disability. The CPS ASEC has SS_SC (Social Security source codes) that distinguish retirement, disability, and survivor benefits. 
Use these codes instead of the age heuristic.","status":"closed","priority":2,"issue_type":"bug","created_at":"2026-01-31T08:01:21.01419-05:00","updated_at":"2026-01-31T08:08:02.644611-05:00","closed_at":"2026-01-31T08:08:02.644611-05:00"} +{"id":"policyengine-us-data-x4q","title":"Calibrate taxable pension fraction from SOI data","description":"imputation_parameters.yaml sets taxable_pension_fraction to 1.0 with the comment 'no SOI data, so arbitrary assumption.' But the SOI targets CSV includes both total_pension_income and taxable_pension_income by AGI bracket. Use the ratio of these to set a data-driven fraction instead of assuming 100% taxable.","status":"closed","priority":2,"issue_type":"bug","created_at":"2026-01-31T08:01:24.590331-05:00","updated_at":"2026-01-31T08:08:02.70425-05:00","closed_at":"2026-01-31T08:08:02.70425-05:00"} diff --git a/Makefile b/Makefile index 27a7b356..c538a7bc 100644 --- a/Makefile +++ b/Makefile @@ -61,6 +61,7 @@ database: python policyengine_us_data/db/etl_age.py python policyengine_us_data/db/etl_medicaid.py python policyengine_us_data/db/etl_snap.py + python policyengine_us_data/db/etl_state_income_tax.py python policyengine_us_data/db/etl_irs_soi.py python policyengine_us_data/db/validate_database.py diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..0b339145 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Add state income tax calibration targets from Census STC FY2023 data diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py b/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py index 2cb153c2..b22b8eb4 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py @@ -105,10 +105,11 @@ targets_df, X_sparse, household_id_mapping = 
builder.build_matrix( sim, target_filter={ - "stratum_group_ids": [4], + "stratum_group_ids": [4, 7], # 4=SNAP households, 7=state income tax "variables": [ "health_insurance_premiums_without_medicare_part_b", "snap", + "state_income_tax", # Census STC state income tax collections ], }, ) diff --git a/policyengine_us_data/db/DATABASE_GUIDE.md b/policyengine_us_data/db/DATABASE_GUIDE.md index 8cd0002a..ac038cb7 100644 --- a/policyengine_us_data/db/DATABASE_GUIDE.md +++ b/policyengine_us_data/db/DATABASE_GUIDE.md @@ -30,8 +30,9 @@ make promote-database # Copy DB + raw inputs to HuggingFace clone | 4 | `etl_age.py` | Census ACS 1-year | Age distribution: 18 bins x 488 geographies | | 5 | `etl_medicaid.py` | Census ACS + CMS | Medicaid enrollment (admin state-level, survey district-level) | | 6 | `etl_snap.py` | USDA FNS + Census ACS | SNAP participation (admin state-level, survey district-level) | -| 7 | `etl_irs_soi.py` | IRS | Tax variables, EITC by child count, AGI brackets, conditional strata | -| 8 | `validate_database.py` | No | Checks all target variables exist in policyengine-us | +| 7 | `etl_state_income_tax.py` | No | State income tax collections (Census STC FY2023, hardcoded) | +| 8 | `etl_irs_soi.py` | IRS | Tax variables, EITC by child count, AGI brackets, conditional strata | +| 9 | `validate_database.py` | No | Checks all target variables exist in policyengine-us | ### Raw Input Caching @@ -108,6 +109,7 @@ The `stratum_group_id` field categorizes strata: | 4 | SNAP | SNAP recipient strata | | 5 | Medicaid | Medicaid enrollment strata | | 6 | EITC | EITC recipients by qualifying children | +| 7 | State Income Tax | State-level income tax collections (Census STC) | | 100-118 | IRS Conditional | Each IRS variable paired with conditional count constraints | ### Conditional Strata (IRS SOI) @@ -216,6 +218,7 @@ SELECT WHEN 4 THEN 'SNAP' WHEN 5 THEN 'Medicaid' WHEN 6 THEN 'EITC' + WHEN 7 THEN 'State Income Tax' END AS group_name, COUNT(*) AS stratum_count FROM 
"""
ETL for state income tax calibration targets.

Loads state individual income tax collections, transcribed from the
Census Bureau's Annual Survey of State Government Tax Collections (STC),
into the calibration database. The figures are hardcoded in this module
(nothing is downloaded); only the fiscal years present in
``_STC_INDIVIDUAL_INCOME_TAX_BY_YEAR`` are supported.

Data source: https://www.census.gov/programs-surveys/stc/data/datasets.html

Stratum Group ID: 7 (State Income Tax)
"""

import logging
import pandas as pd
import numpy as np
from sqlmodel import Session, create_engine, select

from policyengine_us_data.storage import STORAGE_FOLDER
from policyengine_us_data.db.create_database_tables import (
    Stratum,
    StratumConstraint,
    Target,
    Source,
    SourceType,
    VariableGroup,
    VariableMetadata,
)
from policyengine_us_data.utils.db import get_geographic_strata
from policyengine_us_data.utils.db_metadata import (
    get_or_create_source,
    get_or_create_variable_group,
    get_or_create_variable_metadata,
)
from policyengine_us_data.utils.raw_cache import (
    is_cached,
    save_json,
    load_json,
)

logger = logging.getLogger(__name__)

# Stratum group ID for state income tax targets
STRATUM_GROUP_ID_STATE_INCOME_TAX = 7

# States without individual income tax (these will have $0 target)
NO_INCOME_TAX_STATES = {
    "AK",  # Alaska
    "FL",  # Florida
    "NV",  # Nevada
    "SD",  # South Dakota
    "TX",  # Texas
    "WA",  # Washington (has capital gains tax only, modeled separately)
    "WY",  # Wyoming
    # NOTE(review): confirm that the NH interest/dividends tax produced no
    # STC individual-income-tax collections in the fiscal years loaded here.
    "NH",  # New Hampshire (phased out interest/dividends tax)
    "TN",  # Tennessee (phased out Hall income tax)
}

# Two-digit, zero-padded state FIPS code -> postal abbreviation (50 states + DC)
STATE_FIPS_TO_ABBREV = {
    "01": "AL",
    "02": "AK",
    "04": "AZ",
    "05": "AR",
    "06": "CA",
    "08": "CO",
    "09": "CT",
    "10": "DE",
    "11": "DC",
    "12": "FL",
    "13": "GA",
    "15": "HI",
    "16": "ID",
    "17": "IL",
    "18": "IN",
    "19": "IA",
    "20": "KS",
    "21": "KY",
    "22": "LA",
    "23": "ME",
    "24": "MD",
    "25": "MA",
    "26": "MI",
    "27": "MN",
    "28": "MS",
    "29": "MO",
    "30": "MT",
    "31": "NE",
    "32": "NV",
    "33": "NH",
    "34": "NJ",
    "35": "NM",
    "36": "NY",
    "37": "NC",
    "38": "ND",
    "39": "OH",
    "40": "OK",
    "41": "OR",
    "42": "PA",
    "44": "RI",
    "45": "SC",
    "46": "SD",
    "47": "TN",
    "48": "TX",
    "49": "UT",
    "50": "VT",
    "51": "VA",
    "53": "WA",
    "54": "WV",
    "55": "WI",
    "56": "WY",
}

STATE_ABBREV_TO_FIPS = {v: k for k, v in STATE_FIPS_TO_ABBREV.items()}

# Individual income tax collections in dollars, keyed by fiscal year and then
# by state abbreviation. Add a new fiscal year here to make it loadable.
#
# FY2023 source: Census STC Table 1 - State Government Tax Collections by
# Category, https://www.census.gov/data/tables/2023/econ/stc/2023-annual.html
_STC_INDIVIDUAL_INCOME_TAX_BY_YEAR = {
    2023: {
        "AL": 5_881_000_000,
        "AK": 0,
        "AZ": 5_424_000_000,
        "AR": 4_352_000_000,
        "CA": 115_845_000_000,
        "CO": 13_671_000_000,
        "CT": 10_716_000_000,
        "DE": 1_747_000_000,
        "DC": 3_456_000_000,
        "FL": 0,
        "GA": 15_297_000_000,
        "HI": 2_725_000_000,
        "ID": 2_593_000_000,
        "IL": 21_453_000_000,
        "IN": 8_098_000_000,
        "IA": 5_243_000_000,
        "KS": 4_304_000_000,
        "KY": 6_163_000_000,
        "LA": 4_088_000_000,
        "ME": 2_246_000_000,
        "MD": 11_635_000_000,
        "MA": 18_645_000_000,
        "MI": 12_139_000_000,
        "MN": 14_239_000_000,
        "MS": 2_477_000_000,
        "MO": 9_006_000_000,
        "MT": 1_718_000_000,
        "NE": 3_248_000_000,
        "NV": 0,
        "NH": 0,
        "NJ": 17_947_000_000,
        "NM": 2_224_000_000,
        "NY": 63_247_000_000,
        "NC": 17_171_000_000,
        "ND": 534_000_000,
        "OH": 9_520_000_000,  # Confirmed with Policy Matters Ohio
        "OK": 4_253_000_000,
        "OR": 11_583_000_000,
        "PA": 16_898_000_000,
        "RI": 1_739_000_000,
        "SC": 6_367_000_000,
        "SD": 0,
        "TN": 0,
        "TX": 0,
        "UT": 5_464_000_000,
        "VT": 1_035_000_000,
        "VA": 17_934_000_000,
        "WA": 0,  # WA has capital gains tax but no broad income tax
        "WV": 2_163_000_000,
        "WI": 10_396_000_000,
        "WY": 0,
    },
}


def extract_state_income_tax_data(year: int = 2023) -> pd.DataFrame:
    """
    Extract state individual income tax collections from Census STC.

    Uses the hardcoded values in ``_STC_INDIVIDUAL_INCOME_TAX_BY_YEAR``.
    The result is written to the raw-input cache on first use and read
    back from it on later calls.

    Args:
        year: Fiscal year for the data (currently only 2023 supported)

    Returns:
        DataFrame with state_fips, state_abbrev, and income_tax_collections

    Raises:
        ValueError: If no figures are available for ``year``. Previously
            any year silently returned (and cached) the FY2023 figures
            under the requested year's label.
    """
    if year not in _STC_INDIVIDUAL_INCOME_TAX_BY_YEAR:
        supported = ", ".join(
            str(y) for y in sorted(_STC_INDIVIDUAL_INCOME_TAX_BY_YEAR)
        )
        raise ValueError(
            f"No Census STC individual income tax data for FY{year}; "
            f"supported fiscal years: {supported}"
        )

    cache_file = f"census_stc_individual_income_tax_{year}.json"

    if is_cached(cache_file):
        logger.info(f"Using cached {cache_file}")
        return pd.DataFrame(load_json(cache_file))

    logger.info(f"Building Census STC individual income tax data for FY{year}")

    collections_by_state = _STC_INDIVIDUAL_INCOME_TAX_BY_YEAR[year]
    df = pd.DataFrame(
        [
            {
                "state_fips": STATE_ABBREV_TO_FIPS[abbrev],
                "state_abbrev": abbrev,
                "income_tax_collections": value,
            }
            for abbrev, value in collections_by_state.items()
        ]
    )

    # Cache for future use
    save_json(cache_file, df.to_dict(orient="records"))

    return df


def transform_state_income_tax_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform the raw Census STC data for loading.

    Coerces ``income_tax_collections`` to numeric, replaces values that are
    missing or cannot be parsed with 0 (logging a warning that names the
    affected states), and sorts the rows by ``state_fips``.

    Args:
        df: Raw DataFrame from extract step

    Returns:
        Transformed copy of ``df`` ready for loading; the input is not
        modified.
    """
    result = df.copy()

    collections = pd.to_numeric(
        result["income_tax_collections"], errors="coerce"
    )

    # A $0 target is a strong statement for calibration, so never fall back
    # to it without telling the operator which states were affected.
    unusable = collections.isna()
    if unusable.any():
        affected = ", ".join(result.loc[unusable, "state_fips"].astype(str))
        logger.warning(
            "Missing or non-numeric income tax collections for state FIPS "
            "%s; using a $0 target",
            affected,
        )

    result["income_tax_collections"] = collections.fillna(0)

    # Sort by FIPS for consistent ordering
    return result.sort_values("state_fips").reset_index(drop=True)


def load_state_income_tax_data(df: pd.DataFrame, year: int) -> dict:
    """
    Load state income tax targets into the calibration database.

    For every row of ``df`` a stratum is created under the matching state
    stratum returned by ``get_geographic_strata``, constrained on
    ``state_fips`` and carrying one ``state_income_tax`` target. Rows whose
    state has no geographic stratum are skipped with a warning.

    NOTE(review): nothing here checks for strata created by an earlier run,
    so running this twice against the same database presumably inserts
    again (or trips a uniqueness constraint) -- confirm the database is
    rebuilt from scratch before this step.

    Args:
        df: Transformed DataFrame with state income tax data
        year: Year for the targets

    Returns:
        Dictionary mapping state_fips to stratum_id
    """
    db_path = STORAGE_FOLDER / "calibration" / "policy_data.db"
    DATABASE_URL = f"sqlite:///{db_path}"
    engine = create_engine(DATABASE_URL)

    stratum_lookup = {}

    with Session(engine) as session:
        # Get or create the Census STC source
        source = get_or_create_source(
            session,
            name="Census Bureau Annual Survey of State Tax Collections",
            type=SourceType.administrative,
            url="https://www.census.gov/programs-surveys/stc.html",
            notes="Individual income tax collections by state",
        )

        # Get or create variable group for state income tax
        var_group = get_or_create_variable_group(
            session,
            name="state_income_tax",
            display_name="State Income Tax",
            description="State-level individual income tax collections",
        )

        # Get or create variable metadata
        get_or_create_variable_metadata(
            session,
            variable="state_income_tax",
            display_name="State Income Tax",
            variable_group_id=var_group.variable_group_id,
            units="USD",
            description="Total state individual income tax collections",
        )

        # Get geographic strata to use as parents
        geo_strata = get_geographic_strata(session)
        state_strata = geo_strata.get("state", {})

        # Create state-level strata for income tax
        for _, row in df.iterrows():
            state_fips = row["state_fips"]
            state_abbrev = row["state_abbrev"]

            # The lookup is queried with the integer FIPS code, while the
            # frame carries zero-padded strings such as "06".
            parent_stratum_id = state_strata.get(int(state_fips))
            if parent_stratum_id is None:
                logger.warning(
                    f"No geographic stratum found for state {state_abbrev} "
                    f"(FIPS {state_fips}), skipping"
                )
                continue

            note = f"State Income Tax: {state_abbrev}"

            # Create stratum with state_fips constraint
            new_stratum = Stratum(
                parent_stratum_id=parent_stratum_id,
                stratum_group_id=STRATUM_GROUP_ID_STATE_INCOME_TAX,
                notes=note,
            )
            new_stratum.constraints_rel = [
                StratumConstraint(
                    constraint_variable="state_fips",
                    operation="==",
                    value=state_fips,
                ),
            ]

            # Add target for state_income_tax total. Cast to a plain Python
            # float so the database driver never receives a NumPy scalar.
            new_stratum.targets_rel.append(
                Target(
                    variable="state_income_tax",
                    period=year,
                    value=float(row["income_tax_collections"]),
                    source_id=source.source_id,
                    active=True,
                    notes=f"Census STC FY{year}",
                )
            )

            session.add(new_stratum)
            # Flush so the new stratum's primary key is populated before
            # it is recorded in the lookup.
            session.flush()
            stratum_lookup[state_fips] = new_stratum.stratum_id

        session.commit()

    logger.info(f"Loaded {len(stratum_lookup)} state income tax targets")
    return stratum_lookup


def main():
    """Run the full ETL pipeline for state income tax targets."""
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    year = 2023

    logger.info(f"Extracting Census STC data for FY{year}...")
    raw_df = extract_state_income_tax_data(year)

    logger.info("Transforming data...")
    transformed_df = transform_state_income_tax_data(raw_df)

    logger.info(f"Loading {len(transformed_df)} state income tax targets...")
    stratum_lookup = load_state_income_tax_data(transformed_df, year)

    # Print summary. Both counts come from the data that was processed, so
    # they always add up to the number of rows.
    total_collections = transformed_df["income_tax_collections"].sum()
    untaxed = transformed_df["state_abbrev"].isin(NO_INCOME_TAX_STATES)
    states_with_tax = int((~untaxed).sum())
    states_without_tax = int(untaxed.sum())

    logger.info(
        f"State Income Tax Targets Summary:\n"
        f" Total states loaded: {len(stratum_lookup)}\n"
        f" States with income tax: {states_with_tax}\n"
        f" States without income tax: {states_without_tax}\n"
        f" Total collections: ${total_collections / 1e9:.1f}B"
    )

    # Print Ohio specifically (for the issue reference)
    ohio_rows = transformed_df[transformed_df["state_abbrev"] == "OH"]
    if not ohio_rows.empty:
        ohio_row = ohio_rows.iloc[0]
        logger.info(
            f" Ohio (OH): ${ohio_row['income_tax_collections'] / 1e9:.2f}B"
        )


if __name__ == "__main__":
    main()