diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl new file mode 100644 index 00000000..a9dbe60f --- /dev/null +++ b/.beads/issues.jsonl @@ -0,0 +1,4 @@ +{"id":"policyengine-us-data-apq","title":"Add age and demographics to pre-tax contribution QRF imputation","description":"The QRF in puf.py that imputes pre_tax_contributions from CPS to PUF uses only employment_income as a predictor. Age, filing status, and number of dependents are strong predictors of 401(k) participation and contribution rates. Adding these should improve the distributional accuracy.","status":"closed","priority":2,"issue_type":"feature","created_at":"2026-01-31T08:01:22.72749-05:00","updated_at":"2026-01-31T08:08:02.675063-05:00","closed_at":"2026-01-31T08:08:02.675063-05:00"} +{"id":"policyengine-us-data-jhh","title":"Parameterize retirement contribution limits by year","description":"The contribution waterfall in cps.py hardcodes 2022 limits ($20,500 401k, $6,500 catch-up, $6,000 IRA, $1,000 IRA catch-up). These should be pulled from PolicyEngine parameters or a year-indexed lookup so the dataset builds correctly for any year.","status":"closed","priority":2,"issue_type":"bug","created_at":"2026-01-31T08:01:18.941246-05:00","updated_at":"2026-01-31T08:08:02.614396-05:00","closed_at":"2026-01-31T08:08:02.614396-05:00"} +{"id":"policyengine-us-data-mnw","title":"Use SS_SC source code for Social Security retirement/disability split","description":"Currently cps.py uses a hard age-62 cutoff to split SS into retirement vs disability. The CPS ASEC has SS_SC (Social Security source codes) that distinguish retirement, disability, and survivor benefits. Use these codes instead of the age heuristic.","status":"closed","priority":2,"issue_type":"bug","created_at":"2026-01-31T08:01:21.01419-05:00","updated_at":"2026-01-31T08:08:02.644611-05:00","closed_at":"2026-01-31T08:08:02.644611-05:00"} +{"id":"policyengine-us-data-x4q","title":"Calibrate taxable pension fraction from SOI data","description":"imputation_parameters.yaml sets taxable_pension_fraction to 1.0 with the comment 'no SOI data, so arbitrary assumption.' But the SOI targets CSV includes both total_pension_income and taxable_pension_income by AGI bracket. Use the ratio of these to set a data-driven fraction instead of assuming 100% taxable.","status":"closed","priority":2,"issue_type":"bug","created_at":"2026-01-31T08:01:24.590331-05:00","updated_at":"2026-01-31T08:08:02.70425-05:00","closed_at":"2026-01-31T08:08:02.70425-05:00"} diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..5af090db 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,10 @@ +- bump: minor + changes: + changed: + - Use CPS ASEC RESNSS1/RESNSS2 source codes to classify Social Security income into retirement, disability, survivors, and dependents (replacing age-62 heuristic) + - Parameterize retirement contribution limits by year (2020-2025) instead of hardcoded 2022 values + - Update taxable pension fraction from 1.0 to 0.590 based on SOI 2015 Table 1.4 + - Add age and is_male as QRF predictors for pension contribution imputation + added: + - SSA benefit-type calibration targets for social_security_retirement, social_security_disability, social_security_survivors, and social_security_dependents + - IRA contribution calibration targets for traditional_ira_contributions and roth_ira_contributions from IRS SOI data diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 15e8733b..84f01a8b 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -405,22 +405,64 @@ def add_personal_income_variables( 1 - p["qualified_dividend_fraction"] ) cps["rental_income"] = person.RNT_VAL - # Assign Social Security retirement benefits if at least 62. - MINIMUM_RETIREMENT_AGE = 62 + + # Classify Social Security income using CPS ASEC reason codes + # (RESNSS1 and RESNSS2). Reason code values: + # 1 = Retired + # 2 = Disabled (adult or child) + # 3 = Widowed + # 4 = Spouse + # 5 = Surviving child + # 6 = Dependent child + # 7 = On behalf of surviving/dependent/disabled child(ren) + # 8 = Other + is_retirement = (person.RESNSS1 == 1) | (person.RESNSS2 == 1) + is_disability = (person.RESNSS1 == 2) | (person.RESNSS2 == 2) + is_survivor = np.isin(person.RESNSS1, [3, 5]) | np.isin( + person.RESNSS2, [3, 5] + ) + is_dependent = np.isin(person.RESNSS1, [4, 6, 7]) | np.isin( + person.RESNSS2, [4, 6, 7] + ) + + # Primary classification: assign full SS_VAL to the highest- + # priority category when someone has multiple source codes. cps["social_security_retirement"] = np.where( - person.A_AGE >= MINIMUM_RETIREMENT_AGE, person.SS_VAL, 0 + is_retirement, person.SS_VAL, 0 ) - # Otherwise assign them to Social Security disability benefits. - cps["social_security_disability"] = ( - person.SS_VAL - cps["social_security_retirement"] + cps["social_security_disability"] = np.where( + is_disability & ~is_retirement, person.SS_VAL, 0 ) - # Provide placeholders for other Social Security inputs to avoid creating - # NaNs as they're uprated. - cps["social_security_dependents"] = np.zeros_like( - cps["social_security_retirement"] + cps["social_security_survivors"] = np.where( + is_survivor & ~is_retirement & ~is_disability, + person.SS_VAL, + 0, ) - cps["social_security_survivors"] = np.zeros_like( - cps["social_security_retirement"] + cps["social_security_dependents"] = np.where( + is_dependent & ~is_retirement & ~is_disability & ~is_survivor, + person.SS_VAL, + 0, + ) + + # Fallback for records with SS income but no informative source + # code: use the age-62 heuristic (retirement vs. disability). + MINIMUM_RETIREMENT_AGE = 62 + unclassified = ( + (person.SS_VAL > 0) + & ~is_retirement + & ~is_disability + & ~is_survivor + & ~is_dependent + ) + cps["social_security_retirement"] += np.where( + unclassified & (person.A_AGE >= MINIMUM_RETIREMENT_AGE), + person.SS_VAL, + 0, + ) + cps["social_security_disability"] += np.where( + unclassified & (person.A_AGE < MINIMUM_RETIREMENT_AGE), + person.SS_VAL, + 0, ) cps["unemployment_compensation"] = person.UC_VAL # Weeks looking for work during the year (Census variable LKWEEKS) @@ -496,11 +538,56 @@ def add_personal_income_variables( # Disregard reported pension contributions from people who report neither wage and salary # nor self-employment income. # Assume no 403(b) or 457 contributions for now. - LIMIT_401K_2022 = 20_500 - LIMIT_401K_CATCH_UP_2022 = 6_500 - LIMIT_IRA_2022 = 6_000 - LIMIT_IRA_CATCH_UP_2022 = 1_000 - CATCH_UP_AGE_2022 = 50 + # IRS retirement contribution limits by year. + RETIREMENT_LIMITS = { + 2020: { + "401k": 19_500, + "401k_catch_up": 6_500, + "ira": 6_000, + "ira_catch_up": 1_000, + }, + 2021: { + "401k": 19_500, + "401k_catch_up": 6_500, + "ira": 6_000, + "ira_catch_up": 1_000, + }, + 2022: { + "401k": 20_500, + "401k_catch_up": 6_500, + "ira": 6_000, + "ira_catch_up": 1_000, + }, + 2023: { + "401k": 22_500, + "401k_catch_up": 7_500, + "ira": 6_500, + "ira_catch_up": 1_000, + }, + 2024: { + "401k": 23_000, + "401k_catch_up": 7_500, + "ira": 7_000, + "ira_catch_up": 1_000, + }, + 2025: { + "401k": 23_500, + "401k_catch_up": 7_500, + "ira": 7_000, + "ira_catch_up": 1_000, + }, + } + # Clamp to the nearest available year for out-of-range values. + clamped_year = max( + min(year, max(RETIREMENT_LIMITS)), + min(RETIREMENT_LIMITS), + ) + limits = RETIREMENT_LIMITS[clamped_year] + LIMIT_401K = limits["401k"] + LIMIT_401K_CATCH_UP = limits["401k_catch_up"] + LIMIT_IRA = limits["ira"] + LIMIT_IRA_CATCH_UP = limits["ira_catch_up"] + CATCH_UP_AGE = 50 retirement_contributions = person.RETCB_VAL cps["self_employed_pension_contributions"] = np.where( person.SEMP_VAL > 0, retirement_contributions, 0 @@ -510,9 +597,9 @@ def add_personal_income_variables( 0, ) # Compute the 401(k) limit for the person's age. - catch_up_eligible = person.A_AGE >= CATCH_UP_AGE_2022 - limit_401k = LIMIT_401K_2022 + catch_up_eligible * LIMIT_401K_CATCH_UP_2022 - limit_ira = LIMIT_IRA_2022 + catch_up_eligible * LIMIT_IRA_CATCH_UP_2022 + catch_up_eligible = person.A_AGE >= CATCH_UP_AGE + limit_401k = LIMIT_401K + catch_up_eligible * LIMIT_401K_CATCH_UP + limit_ira = LIMIT_IRA + catch_up_eligible * LIMIT_IRA_CATCH_UP cps["traditional_401k_contributions"] = np.where( person.WSAL_VAL > 0, np.minimum(remaining_retirement_contributions, limit_401k), diff --git a/policyengine_us_data/datasets/cps/imputation_parameters.yaml b/policyengine_us_data/datasets/cps/imputation_parameters.yaml index 895edf0a..132b52af 100644 --- a/policyengine_us_data/datasets/cps/imputation_parameters.yaml +++ b/policyengine_us_data/datasets/cps/imputation_parameters.yaml @@ -7,8 +7,8 @@ taxable_interest_fraction: 0.680 # SOI 2018 data qualified_dividend_fraction: 0.448 -# no SOI data, so arbitrary assumption -taxable_pension_fraction: 1.0 +# SOI 2015 data (Table 1.4: taxable / total pension income) +taxable_pension_fraction: 0.590 taxable_401k_distribution_fraction: 1.0 taxable_403b_distribution_fraction: 1.0 taxable_ira_distribution_fraction: 1.0 diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py index 38afcbea..f52153e3 100644 --- a/policyengine_us_data/datasets/puf/puf.py +++ b/policyengine_us_data/datasets/puf/puf.py @@ -168,8 +168,15 @@ def impute_pension_contributions_to_puf(puf_df): cps = Microsimulation(dataset=CPS_2021) cps.subsample(10_000) + + predictors = [ + "employment_income", + "age", + "is_male", + ] + cps_df = cps.calculate_dataframe( - ["employment_income", "household_weight", "pre_tax_contributions"] + predictors + ["household_weight", "pre_tax_contributions"] ) from microimpute.models.qrf import QRF @@ -177,16 +184,16 @@ def impute_pension_contributions_to_puf(puf_df): qrf = QRF() # Combine predictors and target into single DataFrame for models.QRF - cps_train = cps_df[["employment_income", "pre_tax_contributions"]] + cps_train = cps_df[predictors + ["pre_tax_contributions"]] fitted_model = qrf.fit( X_train=cps_train, - predictors=["employment_income"], + predictors=predictors, imputed_variables=["pre_tax_contributions"], ) # Predict using the fitted model - predictions = fitted_model.predict(X_test=puf_df[["employment_income"]]) + predictions = fitted_model.predict(X_test=puf_df[predictors]) return predictions["pre_tax_contributions"] @@ -559,8 +566,11 @@ def generate(self): original_recid = puf.RECID.values.copy() puf = preprocess_puf(puf) puf = impute_missing_demographics(puf, demographics) + # Derive age and is_male for pension imputation predictors + puf["age"] = puf["AGERANGE"].apply(decode_age_filer) + puf["is_male"] = (puf["GENDER"] == 1).astype(float) puf["pre_tax_contributions"] = impute_pension_contributions_to_puf( - puf[["employment_income"]] + puf[["employment_income", "age", "is_male"]] ) # Sort in original PUF order diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py index 5cb910d5..8626ddc7 100644 --- a/policyengine_us_data/db/etl_national_targets.py +++ b/policyengine_us_data/db/etl_national_targets.py @@ -191,6 +191,51 @@ def extract_national_targets(): "notes": "Social security tips uprated 40% to account for underreporting", "year": HARDCODED_YEAR, }, + # SSA benefit-type totals derived from trust fund data and + # SSA fact sheet type shares + { + "variable": "social_security_retirement", + "value": 1_060e9, + "source": "https://www.ssa.gov/OACT/STATS/table4a3.html", + "notes": "~73% of total OASDI ($1,452B CBO projection)", + "year": HARDCODED_YEAR, + }, + { + "variable": "social_security_disability", + "value": 148e9, + "source": "https://www.ssa.gov/OACT/STATS/table4a3.html", + "notes": "~10.2% of total OASDI (disabled workers)", + "year": HARDCODED_YEAR, + }, + { + "variable": "social_security_survivors", + "value": 160e9, + "source": "https://www.ssa.gov/OACT/FACTS/", + "notes": "~11.0% of total OASDI (widows, children of deceased)", + "year": HARDCODED_YEAR, + }, + { + "variable": "social_security_dependents", + "value": 84e9, + "source": "https://www.ssa.gov/OACT/FACTS/", + "notes": "~5.8% of total OASDI (spouses/children of retired+disabled)", + "year": HARDCODED_YEAR, + }, + # IRA contribution totals from IRS SOI accumulation tables + { + "variable": "traditional_ira_contributions", + "value": 25e9, + "source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements", + "notes": "Tax year 2022 (~5M x $4,510 avg) uprated ~12% to 2024", + "year": HARDCODED_YEAR, + }, + { + "variable": "roth_ira_contributions", + "value": 39e9, + "source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements", + "notes": "Tax year 2022 (~10M x $3,482 avg) uprated ~12% to 2024", + "year": HARDCODED_YEAR, + }, ] # Conditional count targets - these need strata with constraints diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index e368d504..f798c0dc 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -9,9 +9,10 @@ from policyengine_core.reforms import Reform from policyengine_us_data.utils.soi import pe_to_soi, get_soi -# CPS-derived statistics -# Medical expenses, sum of spm thresholds -# Child support expenses +# National calibration targets consumed by build_loss_matrix(). +# These are duplicated in db/etl_national_targets.py which loads them +# into policy_data.db. A future PR should wire build_loss_matrix() +# to read from the database so this dict can be deleted. See PR #488. HARD_CODED_TOTALS = { "health_insurance_premiums_without_medicare_part_b": 385e9, @@ -35,6 +36,29 @@ # Wages and salaries grew 32% from 2018 to 2023: https://fred.stlouisfed.org/graph/?g=1J0CC # Assume 40% through 2024 "tip_income": 38e9 * 1.4, + # SSA benefit-type totals for 2024, derived from: + # - Total OASDI: $1,452B (CBO projection) + # - OASI trust fund: $1,227.4B in 2023 + # https://www.ssa.gov/OACT/STATS/table4a3.html + # - DI trust fund: $151.9B in 2023 + # https://www.ssa.gov/OACT/STATS/table4a3.html + # - SSA 2024 fact sheet type shares: retired+deps=78.5%, + # survivors=11.0%, disabled+deps=10.5% + # https://www.ssa.gov/OACT/FACTS/ + # - SSA Annual Statistical Supplement Table 5.A1 + # https://www.ssa.gov/policy/docs/statcomps/supplement/2024/5a.html + "social_security_retirement": 1_060e9, # ~73% of total + "social_security_disability": 148e9, # ~10.2% (disabled workers) + "social_security_survivors": 160e9, # ~11.0% (widows, children of deceased) + "social_security_dependents": 84e9, # ~5.8% (spouses/children of retired+disabled) + # IRA contribution totals from IRS SOI IRA accumulation tables. + # Tax year 2022: ~5M taxpayers x $4,510 avg = ~$22.5B traditional; + # ~10M taxpayers x $3,482 avg = ~$34.8B Roth. + # Uprated ~12% to 2024 for limit increases ($6k->$7k) and + # wage growth. + # https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements + "traditional_ira_contributions": 25e9, + "roth_ira_contributions": 39e9, }