Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -212,3 +212,4 @@ data/WORC Employment.xlsx
data/ARC Enrollments.xlsx
data/ARC Application.xlsx
data/All demographics and programs.xlsx
data/WORC_Employment.xlsx
34 changes: 23 additions & 11 deletions src/Carmen_WORCEmployment.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,29 @@
import pandas as pd

file_path = "data/WORC_Employment.xlsx"
worc = pd.read_excel(file_path)
def load_and_clean(file_path="../../data/WORC_Employment.xlsx", salary_corrections=None):
    """Load the WORC Employment workbook and return a cleaned DataFrame.

    Cleaning steps, in order:

    1. Drop the ``Employment History Name`` column.
    2. Parse ``Start Date`` with ``pd.to_datetime``.
    3. Convert ``Salary`` to numeric; values that cannot be parsed become NaN.
    4. Replace known mis-entered ``Salary`` values (see ``salary_corrections``).

    Rows that contain nulls are kept; no ``dropna`` is applied.

    Parameters:
        file_path (str): Path to the Excel file. The default is a relative
            path, so it only resolves when the working directory sits two
            levels below the folder that contains ``data/``.
        salary_corrections (dict | None): Mapping of exact ``Salary`` values
            to the value that should replace them. ``None`` (the default)
            means ``{60000: 28.84}``. Pass an empty dict to skip this step.

    Returns:
        pd.DataFrame: Cleaned DataFrame.
    """
    if salary_corrections is None:
        # One salary is recorded as 60,000 while the others are hourly rates.
        # 60,000 / 2,080 hours is roughly 28.84 per hour.
        salary_corrections = {60000: 28.84}

    # Load data
    worc = pd.read_excel(file_path)

    # Drop columns we don't need. `drop` returns a new frame, so the column
    # assignments below never touch the frame that was read from disk.
    worc_clean = worc.drop(columns=['Employment History Name'])

    # Clean up data types
    worc_clean['Start Date'] = pd.to_datetime(worc_clean['Start Date'])
    worc_clean['Salary'] = pd.to_numeric(worc_clean['Salary'], errors='coerce')

    # Bring outlier salary entries in line with the rest of the column.
    if salary_corrections:
        worc_clean['Salary'] = worc_clean['Salary'].replace(salary_corrections)

    return worc_clean
59 changes: 59 additions & 0 deletions src/notebooks/Carmen_WORCEmployment_Plots.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import sys
import os

# Add the parent directory (src) to sys.path
sys.path.append(os.path.abspath(".."))

from Carmen_WORCEmployment import load_and_clean
import matplotlib.pyplot as plt
import seaborn as sns


def plot_salary_by_gender(data):
    """Display a box plot of ``Salary`` split by ``Gender``.

    Args:
        data: Table with ``Gender`` and ``Salary`` columns, passed straight
            to seaborn's ``boxplot``.
    """
    _, axes = plt.subplots(figsize=(8, 5))
    sns.boxplot(x='Gender', y='Salary', data=data, ax=axes)
    axes.set_title("Salary Distribution by Gender")
    plt.show()


def plot_avg_salary_by_city(data):
    """Display a horizontal bar chart of mean ``Salary`` per ``Mailing City``.

    Bars are ordered by ascending mean salary.

    Args:
        data: DataFrame with ``Mailing City`` and ``Salary`` columns.
    """
    mean_by_city = data.groupby('Mailing City')['Salary'].mean()
    ordered = mean_by_city.sort_values()
    # NOTE(review): the title says "KY Region" but the grouping key is the
    # mailing city - confirm which label is intended.
    ordered.plot.barh(figsize=(8, 5), title="Average Salary by KY Region")
    plt.xlabel("Average Salary")
    plt.show()


def plot_placements_over_time(data):
    """Display a line chart of how many rows fall in each month.

    Rows are binned by ``Start Date`` using the ``'M'`` resample frequency
    and counted.

    Args:
        data: DataFrame with a ``Start Date`` column. Presumably already
            converted to datetimes, since ``resample`` needs a datetime-like
            index - verify against the caller.
    """
    by_start_date = data.set_index('Start Date')
    monthly_counts = by_start_date.resample('M').size()
    monthly_counts.plot(kind='line', marker='o', figsize=(10, 4))
    plt.title("Number of Placements Over Time")
    plt.ylabel("Placements")
    plt.show()


def plot_placement_type_by_program(data):
    """Display a count plot of ``ATP Placement Type`` coloured by program.

    Args:
        data: Table with ``ATP Placement Type`` and ``Program: Program Name``
            columns, passed straight to seaborn's ``countplot``.
    """
    _, axes = plt.subplots(figsize=(10, 6))
    sns.countplot(
        x='ATP Placement Type',
        hue='Program: Program Name',
        data=data,
        ax=axes,
    )
    # Tilt the category labels so they are less likely to overlap.
    plt.xticks(rotation=45)
    axes.set_title("Placement Type by Program")
    plt.show()


def plot_top_cities(data):
    """Display a bar chart of the ten most frequent ``Mailing City`` values.

    Args:
        data: DataFrame with a ``Mailing City`` column.
    """
    # value_counts() sorts by descending frequency, so the first ten entries
    # are the ten most common cities.
    per_city = data['Mailing City'].value_counts()
    top_ten = per_city.iloc[:10]
    top_ten.plot.bar(title='Top Cities by Participant Count', figsize=(8, 4))
    plt.ylabel("Count")
    plt.show()


def main():
    """Load the cleaned WORC data and show every plot, one after another."""
    cleaned = load_and_clean()

    # Order matters only for the sequence in which the figures appear.
    plotters = (
        plot_salary_by_gender,
        plot_avg_salary_by_city,
        plot_placements_over_time,
        plot_placement_type_by_program,
        plot_top_cities,
    )
    for draw in plotters:
        draw(cleaned)


# Render the plots only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()
76 changes: 52 additions & 24 deletions src/notebooks/worc_cleaning.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -32,24 +32,33 @@
"outputs": [],
"source": [
"import pandas as pd\n",
"import os\n"
"\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Read in file\n",
"\n",
"file_path = \"WORC_Employment.xlsx\"\n",
"file_path = \"../../data/WORC_Employment.xlsx\"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -59,7 +68,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -238,7 +247,7 @@
"4 25.48 Female White SOAR "
]
},
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -250,7 +259,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"outputs": [
{
Expand All @@ -259,7 +268,7 @@
"(25, 16)"
]
},
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -280,7 +289,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -303,7 +312,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"outputs": [
{
Expand All @@ -324,7 +333,7 @@
" 'First ATP Placement - Already in Tech'], dtype=object)"
]
},
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -337,7 +346,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"metadata": {},
"outputs": [
{
Expand All @@ -346,7 +355,7 @@
"np.True_"
]
},
"execution_count": 8,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -358,7 +367,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"outputs": [
{
Expand All @@ -383,7 +392,7 @@
"dtype: int64"
]
},
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -404,7 +413,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -416,34 +425,34 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"worc_cols_dropped_nulls = worc_cols_dropped.dropna()"
"# worc_cols_dropped_nulls = worc_cols_dropped.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"worc_cleaned = worc_cols_dropped_nulls"
"worc_cleaned = worc_cols_dropped"
]
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(22, 14)"
"(25, 14)"
]
},
"execution_count": 13,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -454,7 +463,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 15,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -621,14 +630,33 @@
"4 25.48 Female White SOAR "
]
},
"execution_count": 14,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"worc_cleaned.head()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"worc_cleaned.to_excel(\"worc_employment.xlsx\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"worc_cleaned['Start Date'] = pd.to_datetime(worc_cleaned['Start Date'])\n",
"worc_cleaned['Salary'] = pd.to_numeric(worc_cleaned['Salary'], errors='coerce')\n"
]
}
],
"metadata": {
Expand Down
Binary file added src/notebooks/worc_employment.xlsx
Binary file not shown.
Binary file added src/notebooks/worc_employment_clean.xlsx
Binary file not shown.
Loading
Loading