From dc7e289f2aa3321d69483ec72e96aafb845b8d2c Mon Sep 17 00:00:00 2001 From: = Date: Fri, 2 Aug 2024 12:13:31 -0400 Subject: [PATCH 01/12] Added data processing workflow and (unimplemented) processing script --- .github/workflows/schedule.yaml | 24 ++++++++++++++++++++++++ reweight/logic/process_data.py | 1 + 2 files changed, 25 insertions(+) create mode 100644 .github/workflows/schedule.yaml create mode 100644 reweight/logic/process_data.py diff --git a/.github/workflows/schedule.yaml b/.github/workflows/schedule.yaml new file mode 100644 index 0000000..d57cb6f --- /dev/null +++ b/.github/workflows/schedule.yaml @@ -0,0 +1,24 @@ +name: Scheduled Data Processing + +on: + schedule: + - cron: "0 0 1 * *" # Runs at 00:00 on the first day of every month + push: + branches: [main] # Runs on pushes to the main branch + pull_request: + branches: [main] # Runs on pull requests to the main branch + +jobs: + process_data: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Install dependencies + run: make install + - name: Run data processing script + run: python reweight/logic/process_data.py diff --git a/reweight/logic/process_data.py b/reweight/logic/process_data.py new file mode 100644 index 0000000..e843899 --- /dev/null +++ b/reweight/logic/process_data.py @@ -0,0 +1 @@ +raise NotImplementedError("Data processing function still in development") \ No newline at end of file From 2bd2a5421eb52f6c99ddd85ac7c8e74cc80ccbf7 Mon Sep 17 00:00:00 2001 From: = Date: Fri, 2 Aug 2024 18:10:09 -0400 Subject: [PATCH 02/12] Fixed setup.py installation issues with torch --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index db55313..8bdd301 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ install_requires=[ "numpy<2.0", "pandas", - "torch+cpu", + "torch", "tensorboard", "jupyter-book", "pytest", From 6367ec8aca81c81072689b52c4871b07aa9ec4b3 Mon Sep 17 00:00:00 2001 From: = Date: Wed, 7 Aug 2024 11:26:18 -0400 Subject: [PATCH 03/12] Added a gitignore to exclude items in root starting with the string test_ --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 712df78..1cf11d0 100644 --- a/.gitignore +++ b/.gitignore @@ -20,4 +20,5 @@ docs/_build # Testing notebooks # ##################### -/*.ipynb \ No newline at end of file +/*.ipynb +/test_* \ No newline at end of file From 8b6e3e73255de7f1248f49dbe77463b0c60eeda3 Mon Sep 17 00:00:00 2001 From: = Date: Wed, 7 Aug 2024 11:27:15 -0400 Subject: [PATCH 04/12] Now ignores CSV files in root --- .gitignore | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 1cf11d0..9bbaef0 100644 --- a/.gitignore +++ b/.gitignore @@ -21,4 +21,8 @@ docs/_build # Testing notebooks # ##################### /*.ipynb -/test_* \ No newline at end of file +/test_* + +# Temporary CSV files # +####################### +/*.csv \ No newline at end of file From e310f491f1015bac81dcfe9b97958bec8f9452f7 Mon Sep 17 00:00:00 2001 From: = Date: Wed, 7 Aug 2024 11:35:53 -0400 Subject: [PATCH 05/12] Wrote a script to process data and post it to the reweight repo --- reweight/logic/process_data.py | 105 ++++++++++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 1 deletion(-) diff --git a/reweight/logic/process_data.py b/reweight/logic/process_data.py index e843899..90f4443 100644 --- a/reweight/logic/process_data.py +++ b/reweight/logic/process_data.py @@ -1 +1,104 @@ -raise NotImplementedError("Data processing function still in development") \ No newline at end of file +import pandas as pd +import numpy as np +import torch +from torch.utils.tensorboard import SummaryWriter +import os +import requests +import base64 + +import policyengine_uk +from policyengine_uk.data import RawFRS_2021_22 +from policyengine_uk.data.datasets.frs.calibration.calibrate import generate_model_variables + +from reweight import reweight + +#UK dataframe generation. + +RawFRS_2021_22().download() + +uk_weights_df = pd.DataFrame() + +for year in range(2024, 2029): + ( + household_weights, + weight_adjustment, + values_df, + targets, + targets_array, + equivalisation_factors_array + ) = generate_model_variables("frs_2021", year) + sim_matrix = torch.tensor(values_df.to_numpy(), dtype=torch.float32) + uk_final_weights = reweight(household_weights, sim_matrix, targets, targets_array, epochs=1_000) + uk_weight_series = pd.Series(uk_final_weights.numpy()) + uk_weights_df[str(year)] = uk_weight_series + + +csv_filename = "updated_uk_weights.csv" +uk_weights_df.to_csv(csv_filename) + + +#US dataframe generation. + +import policyengine_us +from policyengine_us.data.datasets.cps.enhanced_cps.loss import generate_model_variables + +us_weights_df = pd.DataFrame() + +for year in range(2024, 2029): + ( + household_weights, + weight_adjustment, + values_df, + targets, + targets_array, + equivalisation_factors_array + ) = generate_model_variables("cps_2021", year) + sim_matrix = torch.tensor(values_df.to_numpy(), dtype=torch.float32) + initial_weights = torch.tensor(household_weights, dtype=torch.float32) + targets_tensor = torch.tensor(targets_array, dtype=torch.float32) + us_final_weights = reweight(initial_weights, sim_matrix, targets, targets_tensor, epochs=1_000) + us_weight_series = pd.Series(us_final_weights.numpy()) + us_weights_df[str(year)] = us_weight_series + +#Now, for testing, save these dataframes as CSV. + +csv_filename = "updated_us_weights.csv" +us_weights_df.to_csv(csv_filename) + +#Now, create a GitHub release + +api_url = 'https://api.github.com/repos/PolicyEngine/reweight/releases' + +owner = 'pmberg' +repo = 'reweight' +token = os.environ.get('GITHUB_TOKEN') + +# Create release +headers = { + 'Authorization': f'token {token}', + 'Accept': 'application/vnd.github.v3+json' +} +release_data = { + 'tag_name': f'v{pd.Timestamp.now().strftime("%Y.%m.%d.%H.%M.%S")}', + 'name': f'Data Release {pd.Timestamp.now().strftime("%Y.%m.%d.%H.%M.%S")}', + 'body': 'Automated data release with updated weights' +} +response = requests.post(api_url.format(owner=owner, repo=repo), headers=headers, json=release_data) +release = response.json() + +# Upload assets +upload_url = release['upload_url'].split('{')[0] + +def upload_file(file_name): + with open(file_name, 'rb') as file: + content = file.read() + headers['Content-Type'] = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' + params = {'name': os.path.basename(file_name)} + response = requests.post(upload_url, headers=headers, params=params, data=content) + if response.status_code == 201: + print(f"File added successfully: {release['html_url']}") + else: + print(f"Failed to add file: {response.content}") + +for file_name in ["updated_uk_weights.csv", "updated_us_weights.csv"]: + upload_file(file_name) \ No newline at end of file From 1849351dfed9e700284ffbafcb6454a9cd541d93 Mon Sep 17 00:00:00 2001 From: = Date: Wed, 7 Aug 2024 11:41:07 -0400 Subject: [PATCH 06/12] Reformatted code --- reweight/logic/process_data.py | 68 +++++++++++++++++++++------------- reweight/logic/reweight.py | 2 +- 2 files changed, 43 insertions(+), 27 deletions(-) diff --git a/reweight/logic/process_data.py b/reweight/logic/process_data.py index 90f4443..57e3151 100644 --- a/reweight/logic/process_data.py +++ b/reweight/logic/process_data.py @@ -8,11 +8,13 @@ import policyengine_uk from policyengine_uk.data import RawFRS_2021_22 -from policyengine_uk.data.datasets.frs.calibration.calibrate import generate_model_variables +from policyengine_uk.data.datasets.frs.calibration.calibrate import ( + generate_model_variables, +) from reweight import reweight -#UK dataframe generation. +# UK dataframe generation. RawFRS_2021_22().download() @@ -25,10 +27,12 @@ values_df, targets, targets_array, - equivalisation_factors_array + equivalisation_factors_array, ) = generate_model_variables("frs_2021", year) sim_matrix = torch.tensor(values_df.to_numpy(), dtype=torch.float32) - uk_final_weights = reweight(household_weights, sim_matrix, targets, targets_array, epochs=1_000) + uk_final_weights = reweight( + household_weights, sim_matrix, targets, targets_array, epochs=1_000 + ) uk_weight_series = pd.Series(uk_final_weights.numpy()) uk_weights_df[str(year)] = uk_weight_series @@ -37,10 +41,12 @@ uk_weights_df.to_csv(csv_filename) -#US dataframe generation. +# US dataframe generation. import policyengine_us -from policyengine_us.data.datasets.cps.enhanced_cps.loss import generate_model_variables +from policyengine_us.data.datasets.cps.enhanced_cps.loss import ( + generate_model_variables, +) us_weights_df = pd.DataFrame() @@ -51,54 +57,64 @@ values_df, targets, targets_array, - equivalisation_factors_array + equivalisation_factors_array, ) = generate_model_variables("cps_2021", year) sim_matrix = torch.tensor(values_df.to_numpy(), dtype=torch.float32) initial_weights = torch.tensor(household_weights, dtype=torch.float32) targets_tensor = torch.tensor(targets_array, dtype=torch.float32) - us_final_weights = reweight(initial_weights, sim_matrix, targets, targets_tensor, epochs=1_000) + us_final_weights = reweight( + initial_weights, sim_matrix, targets, targets_tensor, epochs=1_000 + ) us_weight_series = pd.Series(us_final_weights.numpy()) us_weights_df[str(year)] = us_weight_series -#Now, for testing, save these dataframes as CSV. +# Now, for testing, save these dataframes as CSV. csv_filename = "updated_us_weights.csv" us_weights_df.to_csv(csv_filename) -#Now, create a GitHub release +# Now, create a GitHub release -api_url = 'https://api.github.com/repos/PolicyEngine/reweight/releases' +api_url = "https://api.github.com/repos/PolicyEngine/reweight/releases" -owner = 'pmberg' -repo = 'reweight' -token = os.environ.get('GITHUB_TOKEN') +owner = "pmberg" +repo = "reweight" +token = os.environ.get("GITHUB_TOKEN") # Create release headers = { - 'Authorization': f'token {token}', - 'Accept': 'application/vnd.github.v3+json' + "Authorization": f"token {token}", + "Accept": "application/vnd.github.v3+json", } release_data = { - 'tag_name': f'v{pd.Timestamp.now().strftime("%Y.%m.%d.%H.%M.%S")}', - 'name': f'Data Release {pd.Timestamp.now().strftime("%Y.%m.%d.%H.%M.%S")}', - 'body': 'Automated data release with updated weights' + "tag_name": f'v{pd.Timestamp.now().strftime("%Y.%m.%d.%H.%M.%S")}', + "name": f'Data Release {pd.Timestamp.now().strftime("%Y.%m.%d.%H.%M.%S")}', + "body": "Automated data release with updated weights", } -response = requests.post(api_url.format(owner=owner, repo=repo), headers=headers, json=release_data) +response = requests.post( + api_url.format(owner=owner, repo=repo), headers=headers, json=release_data +) release = response.json() # Upload assets -upload_url = release['upload_url'].split('{')[0] +upload_url = release["upload_url"].split("{")[0] + def upload_file(file_name): - with open(file_name, 'rb') as file: + with open(file_name, "rb") as file: content = file.read() - headers['Content-Type'] = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' - params = {'name': os.path.basename(file_name)} - response = requests.post(upload_url, headers=headers, params=params, data=content) + headers["Content-Type"] = ( + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ) + params = {"name": os.path.basename(file_name)} + response = requests.post( + upload_url, headers=headers, params=params, data=content + ) if response.status_code == 201: print(f"File added successfully: {release['html_url']}") else: print(f"Failed to add file: {response.content}") + for file_name in ["updated_uk_weights.csv", "updated_us_weights.csv"]: - upload_file(file_name) \ No newline at end of file + upload_file(file_name) diff --git a/reweight/logic/reweight.py b/reweight/logic/reweight.py index 4227c29..1e48f66 100644 --- a/reweight/logic/reweight.py +++ b/reweight/logic/reweight.py @@ -43,7 +43,7 @@ def reweight( optimizer = torch.optim.Adam([log_weights]) - #Report the initial loss: + # Report the initial loss: targets_estimate = torch.exp(log_weights) @ estimate_matrix # Calculate the loss loss = torch.mean( From 5513fbe487bbde84f26fdddd8dadb5feb3b81984 Mon Sep 17 00:00:00 2001 From: = Date: Wed, 7 Aug 2024 11:50:05 -0400 Subject: [PATCH 07/12] Added Microsimulation lines to process_data --- reweight/logic/process_data.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/reweight/logic/process_data.py b/reweight/logic/process_data.py index 57e3151..57fb7f4 100644 --- a/reweight/logic/process_data.py +++ b/reweight/logic/process_data.py @@ -7,6 +7,7 @@ import base64 import policyengine_uk +from policyengine_uk import Microsimulation from policyengine_uk.data import RawFRS_2021_22 from policyengine_uk.data.datasets.frs.calibration.calibrate import ( generate_model_variables, @@ -15,6 +16,7 @@ from reweight import reweight # UK dataframe generation. +sim = Microsimulation() RawFRS_2021_22().download() From 6bfab0220c9a24d52e542cd2558c63402e730fdd Mon Sep 17 00:00:00 2001 From: = Date: Wed, 7 Aug 2024 12:02:02 -0400 Subject: [PATCH 08/12] Reworked env in YAML file --- .github/workflows/schedule.yaml | 4 ++++ reweight/logic/process_data.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/schedule.yaml b/.github/workflows/schedule.yaml index d57cb6f..49bd335 100644 --- a/.github/workflows/schedule.yaml +++ b/.github/workflows/schedule.yaml @@ -22,3 +22,7 @@ jobs: run: make install - name: Run data processing script run: python reweight/logic/process_data.py + env: + POVERTYTRACKER_RAW_URL: ${{ secrets.POVERTYTRACKER_RAW_URL }} + POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN: ${{ secrets.POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN}} + API_GITHUB_TOKEN: ${{ secrets.API_GITHUB_TOKEN }} diff --git a/reweight/logic/process_data.py b/reweight/logic/process_data.py index 57fb7f4..5226a46 100644 --- a/reweight/logic/process_data.py +++ b/reweight/logic/process_data.py @@ -81,7 +81,7 @@ owner = "pmberg" repo = "reweight" -token = os.environ.get("GITHUB_TOKEN") +token = os.environ.get("API_GITHUB_TOKEN") # Create release headers = { From bb40b673bfd5b27046c849197f8fcc9b37e2eedf Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Tue, 13 Aug 2024 14:09:21 +0100 Subject: [PATCH 09/12] Add sketch of condensed code --- reweight/logic/process_data.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/reweight/logic/process_data.py b/reweight/logic/process_data.py index 5226a46..9369ab4 100644 --- a/reweight/logic/process_data.py +++ b/reweight/logic/process_data.py @@ -13,6 +13,18 @@ generate_model_variables, ) +def calibrate_country_weights( + household_weights, loss_matrix, target_labels, target_values, epochs +) -> pd.DataFrame: + pass + + +uk_inputs = ... +us_inputs = ... + +calibrate_country_weights(*uk_inputs) +calibrate_country_weights(*us_inputs) + from reweight import reweight # UK dataframe generation. From a96ac6a0797ff51f362d7fa2038b6d3c78562336 Mon Sep 17 00:00:00 2001 From: = Date: Tue, 13 Aug 2024 10:40:55 -0400 Subject: [PATCH 10/12] Refactored process_data, splitting repeated code into two functions. --- reweight/logic/process_data.py | 98 +++++++++++++--------------------- 1 file changed, 38 insertions(+), 60 deletions(-) diff --git a/reweight/logic/process_data.py b/reweight/logic/process_data.py index 9369ab4..3a463f0 100644 --- a/reweight/logic/process_data.py +++ b/reweight/logic/process_data.py @@ -9,32 +9,23 @@ import policyengine_uk from policyengine_uk import Microsimulation from policyengine_uk.data import RawFRS_2021_22 -from policyengine_uk.data.datasets.frs.calibration.calibrate import ( - generate_model_variables, -) - -def calibrate_country_weights( - household_weights, loss_matrix, target_labels, target_values, epochs -) -> pd.DataFrame: - pass - - -uk_inputs = ... -us_inputs = ... +from policyengine_uk.data.datasets.frs.calibration.calibrate import generate_model_variables as uk_generate -calibrate_country_weights(*uk_inputs) -calibrate_country_weights(*us_inputs) +import policyengine_us +from policyengine_us.data.datasets.cps.enhanced_cps.loss import generate_model_variables as us_generate from reweight import reweight -# UK dataframe generation. -sim = Microsimulation() - -RawFRS_2021_22().download() - -uk_weights_df = pd.DataFrame() +def generate_country_weights(year, data_source, generate_func): + """ + Parameters: + year (int): The year for which these country values are generated. + data_source (str): The name of the data source for that country. + generate_func (function): The function used to generate the initial values. -for year in range(2024, 2029): + Returns: + final_weights (torch.Tensor): a PyTorch tensor of final reweighted weights. + """ ( household_weights, weight_adjustment, @@ -42,50 +33,37 @@ def calibrate_country_weights( targets, targets_array, equivalisation_factors_array, - ) = generate_model_variables("frs_2021", year) - sim_matrix = torch.tensor(values_df.to_numpy(), dtype=torch.float32) - uk_final_weights = reweight( - household_weights, sim_matrix, targets, targets_array, epochs=1_000 - ) - uk_weight_series = pd.Series(uk_final_weights.numpy()) - uk_weights_df[str(year)] = uk_weight_series - - -csv_filename = "updated_uk_weights.csv" -uk_weights_df.to_csv(csv_filename) - - -# US dataframe generation. - -import policyengine_us -from policyengine_us.data.datasets.cps.enhanced_cps.loss import ( - generate_model_variables, -) - -us_weights_df = pd.DataFrame() - -for year in range(2024, 2029): - ( - household_weights, - weight_adjustment, - values_df, - targets, - targets_array, - equivalisation_factors_array, - ) = generate_model_variables("cps_2021", year) + ) = generate_func(data_source, year) sim_matrix = torch.tensor(values_df.to_numpy(), dtype=torch.float32) initial_weights = torch.tensor(household_weights, dtype=torch.float32) targets_tensor = torch.tensor(targets_array, dtype=torch.float32) - us_final_weights = reweight( + final_weights = reweight( initial_weights, sim_matrix, targets, targets_tensor, epochs=1_000 ) - us_weight_series = pd.Series(us_final_weights.numpy()) - us_weights_df[str(year)] = us_weight_series + return final_weights + +def generate_country_csv(start_year, end_year, data_source, generate_func, csv_filename): + """ + Parameters: + start_year (int): The year for which these country values start generating (inclusive). + end_year (int): The year for which these country values stop generating (non-inclusive). + data_source (str): The name of the data source for that country. + generate_func (function): The function used to generate the initial values. + csv_filename (str): The name of the file which the generated data are saved under. + + Returns: + None. Generates and saves a CSV file of reweighted weights. + """ + weights_df = pd.DataFrame() + for year in range(start_year, end_year): + final_weights = generate_country_weights(year, data_source, generate_func) + weight_series = pd.Series(final_weights.numpy()) + weights_df[str(year)] = weight_series + weights_df.to_csv(csv_filename) -# Now, for testing, save these dataframes as CSV. - -csv_filename = "updated_us_weights.csv" -us_weights_df.to_csv(csv_filename) +RawFRS_2021_22().download() +generate_country_csv(2024, 2029, "frs_2021", uk_generate, "updated_uk_weights.csv") +generate_country_csv(2024, 2029, "cps_2021", us_generate, "updated_us_weights.csv") # Now, create a GitHub release @@ -109,7 +87,7 @@ def calibrate_country_weights( api_url.format(owner=owner, repo=repo), headers=headers, json=release_data ) release = response.json() - +print(release) # Upload assets upload_url = release["upload_url"].split("{")[0] From 78be5c37b39f3c0b65ae84200985e468810d37bf Mon Sep 17 00:00:00 2001 From: = Date: Tue, 13 Aug 2024 11:19:51 -0400 Subject: [PATCH 11/12] Reformatted process_data --- reweight/logic/process_data.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/reweight/logic/process_data.py b/reweight/logic/process_data.py index 3a463f0..ec71c5e 100644 --- a/reweight/logic/process_data.py +++ b/reweight/logic/process_data.py @@ -9,13 +9,18 @@ import policyengine_uk from policyengine_uk import Microsimulation from policyengine_uk.data import RawFRS_2021_22 -from policyengine_uk.data.datasets.frs.calibration.calibrate import generate_model_variables as uk_generate +from policyengine_uk.data.datasets.frs.calibration.calibrate import ( + generate_model_variables as uk_generate, +) import policyengine_us -from policyengine_us.data.datasets.cps.enhanced_cps.loss import generate_model_variables as us_generate +from policyengine_us.data.datasets.cps.enhanced_cps.loss import ( + generate_model_variables as us_generate, +) from reweight import reweight + def generate_country_weights(year, data_source, generate_func): """ Parameters: @@ -42,7 +47,10 @@ def generate_country_weights(year, data_source, generate_func): ) return final_weights -def generate_country_csv(start_year, end_year, data_source, generate_func, csv_filename): + +def generate_country_csv( + start_year, end_year, data_source, generate_func, csv_filename +): """ Parameters: start_year (int): The year for which these country values start generating (inclusive). @@ -56,14 +64,21 @@ def generate_country_csv(start_year, end_year, data_source, generate_func, csv_f """ weights_df = pd.DataFrame() for year in range(start_year, end_year): - final_weights = generate_country_weights(year, data_source, generate_func) + final_weights = generate_country_weights( + year, data_source, generate_func + ) weight_series = pd.Series(final_weights.numpy()) weights_df[str(year)] = weight_series weights_df.to_csv(csv_filename) + RawFRS_2021_22().download() -generate_country_csv(2024, 2029, "frs_2021", uk_generate, "updated_uk_weights.csv") -generate_country_csv(2024, 2029, "cps_2021", us_generate, "updated_us_weights.csv") +generate_country_csv( + 2024, 2029, "frs_2021", uk_generate, "updated_uk_weights.csv" +) +generate_country_csv( + 2024, 2029, "cps_2021", us_generate, "updated_us_weights.csv" +) # Now, create a GitHub release From 9e1120d9a0ea61cb0893d630ee80afe5f7b2f763 Mon Sep 17 00:00:00 2001 From: = Date: Tue, 13 Aug 2024 14:00:18 -0400 Subject: [PATCH 12/12] Update reweight