diff --git a/src/Carmen_WORCEmployment.py b/src/Carmen_WORCEmployment.py
index 8da6613..e8b671d 100644
--- a/src/Carmen_WORCEmployment.py
+++ b/src/Carmen_WORCEmployment.py
@@ -1,5 +1,6 @@
import pandas as pd
+
def load_and_clean(file_path="data/WORC_Employment.xlsx"):
"""
Loads and cleans the WORC Employment dataset.
@@ -18,12 +19,15 @@ def load_and_clean(file_path="data/WORC_Employment.xlsx"):
worc_cols_dropped = worc.drop(columns=cols_to_drop, axis=1)
# Clean up data types
- worc_cols_dropped['Start Date'] = pd.to_datetime(worc_cols_dropped['Start Date'])
- worc_cols_dropped['Salary'] = pd.to_numeric(worc_cols_dropped['Salary'], errors='coerce')
+ worc_cols_dropped['Start Date'] = pd.to_datetime(worc_cols_dropped['Start Date']) # noqa
+ worc_cols_dropped['Salary'] = pd.to_numeric(worc_cols_dropped['Salary'],
+ errors='coerce')
- # Adjust salary that is listed as 60,000 to 28.84 for consistency with other salaries
+ # Adjust salary that is listed as 60,000 to 28.84 for
+ # consistency with other salaries
# Took 60,000 / 2080hrs - 28.84
- worc_cols_dropped['Salary'] = worc_cols_dropped['Salary'].replace(60000, 28.84)
+ worc_cols_dropped['Salary'] = worc_cols_dropped['Salary'].replace(60000, 28.84) # noqa
worc_clean = worc_cols_dropped
+
return worc_clean
diff --git a/src/Carmen_WORCEmployment_Plots.py b/src/Carmen_WORCEmployment_Plots.py
index 3c99d1e..9d4fa01 100644
--- a/src/Carmen_WORCEmployment_Plots.py
+++ b/src/Carmen_WORCEmployment_Plots.py
@@ -1,5 +1,3 @@
-import sys
-import os
import pandas as pd
from Carmen_WORCEmployment import load_and_clean
import matplotlib.pyplot as plt
@@ -8,20 +6,26 @@
def plot_salary_by_gender(data):
plt.figure(figsize=(8, 5))
- sns.boxplot(data=data, x='Gender', y='Salary')
+ sns.boxplot(data=data,
+ x='Gender',
+ y='Salary')
plt.title("Salary Distribution by Gender")
plt.show()
def plot_avg_salary_by_city(data):
region_salary = data.groupby('Mailing City')['Salary'].mean().sort_values()
- region_salary.plot(kind='barh', figsize=(8, 5), title="Average Salary by KY Region")
+ region_salary.plot(kind='barh',
+ figsize=(8, 5),
+ title="Average Salary by KY Region")
plt.xlabel("Average Salary")
plt.show()
def plot_placements_over_time(data):
- data.set_index('Start Date').resample('M').size().plot(kind='line', marker='o', figsize=(10, 4))
+ data.set_index('Start Date').resample('M').size().plot(kind='line',
+ marker='o',
+ figsize=(10, 4))
plt.title("Number of Placements Over Time")
plt.ylabel("Placements")
plt.show()
@@ -29,7 +33,9 @@ def plot_placements_over_time(data):
def plot_placement_type_by_program(data):
plt.figure(figsize=(10, 6))
- sns.countplot(data=data, x='ATP Placement Type', hue='Program: Program Name')
+ sns.countplot(data=data,
+ x='ATP Placement Type',
+ hue='Program: Program Name')
plt.xticks(rotation=45)
plt.title("Placement Type by Program")
plt.show()
@@ -37,7 +43,9 @@ def plot_placement_type_by_program(data):
def plot_top_cities(data):
city_counts = data['Mailing City'].value_counts().head(10)
- city_counts.plot(kind='bar', title='Top Cities by Participant Count', figsize=(8, 4))
+ city_counts.plot(kind='bar',
+ title='Top Cities by Participant Count',
+ figsize=(8, 4))
plt.ylabel("Count")
plt.show()
diff --git a/src/cleaning_enrollments_data.py b/src/cleaning_enrollments_data.py
new file mode 100644
index 0000000..0c2d9fb
--- /dev/null
+++ b/src/cleaning_enrollments_data.py
@@ -0,0 +1,186 @@
+import pandas as pd
+import numpy as np
+
+
+class EnrollmentsCleaning:
+    def __init__(self, raw_data: pd.DataFrame) -> None:
+        self.raw_data = raw_data  # cleaned on demand by Get_clean_data()
+
+    def __Drop_columns(self, df):
+        """
+        Return a copy of ``df`` without the columns the analysis ignores.
+
+        Extend ``unused_columns`` below to drop additional columns.
+
+        Args:
+            df: pandas.DataFrame
+
+        Return:
+            pandas.DataFrame
+        """
+        unused_columns = ['Full Name']
+        # drop() leaves the caller's frame untouched and returns a new one
+        return df.drop(unused_columns, axis='columns')
+
+    def __Fix_nan_values(self, df):
+        """
+        Replace missing values in the date and outcome columns with 'NA'.
+        The columns are overwritten on ``df`` itself, which is also returned.
+
+        Args:
+            df: pandas.DataFrame
+
+        Return:
+            pandas.DataFrame
+        """
+        placeholder = 'NA'
+        # 'ATP Cohort' is left out on purpose: its missing values are
+        # filled by a separate function.
+        nullable_columns = (
+            'Projected Start Date',
+            'Actual Start Date',
+            'Projected End Date',
+            'Actual End Date',
+            'Outcome',
+        )
+        for name in nullable_columns:
+            df[name] = df[name].fillna(placeholder)
+        return df
+
+    def __Rename_values(self, df):
+        """
+        Normalizes the 'Data Analytics 2' service name to 'Data Analysis 2'.
+
+        Args:
+            df: pandas.DataFrame
+
+        Return:
+            pandas.DataFrame
+        """
+        is_misnamed = df['Service'] == 'Data Analytics 2'
+        df.loc[is_misnamed, 'Service'] = 'Data Analysis 2'
+        return df
+
+    def __Delete_values(self, df):
+        """
+        Removes unneeded rows; extend 'VALUES_NOT_NEEDED' to delete more.
+        Returns an independent copy, so assigning columns on the result
+        does not trigger pandas' SettingWithCopyWarning.
+
+        Args:
+            df: pandas.DataFrame
+
+        Return:
+            pandas.DataFrame
+        """
+        # 'Referral to External Service', 'Supportive Services Referral',
+        # are deleted because they do not have a "Projected Start Date"
+        VALUES_NOT_NEEDED = {
+            'Service': ['Software Development 1',
+                        'Software Development 2',
+                        'Web Development 1', 'Web Development 2',
+                        'Data Analysis 1', 'Data Analysis 2',
+                        'Referral to External Service',
+                        'Supportive Services Referral']
+        }
+        for column, value in VALUES_NOT_NEEDED.items():
+            df = df[~df[column].isin(value)]
+        return df.copy()
+
+    def __Set_data_types(self, df):
+        """
+        Casts every known column of ``df`` to its expected dtype, in place.
+
+        Args:
+            df: pandas.DataFrame
+
+        Return:
+            pandas.DataFrame
+        """
+        # TODO: 'Projected Start Date', 'Actual Start Date',
+        # 'Projected End Date' and 'Actual End Date' are really dates, but
+        # they stay strings because their missing values were set to 'NA'.
+
+        text_columns = [
+            'Auto Id', 'KY Region', 'Assessment ID', 'EnrollmentId',
+            'Enrollment Service Name', 'Service',
+            'Projected Start Date', 'Actual Start Date',
+            'Projected End Date', 'Actual End Date',
+            'Outcome',
+        ]
+        dtype_by_column = {name: str for name in text_columns}
+        dtype_by_column['ATP Cohort'] = 'datetime64[ns]'
+
+        # ``dtype`` instead of ``type`` keeps the builtin usable here
+        for name, dtype in dtype_by_column.items():
+            df[name] = df[name].astype(dtype)
+        return df
+
+    def __Find_cohort(self, id: str,
+                      projected_start_date: str,
+                      cohort_to_find: str,
+                      df_to_clean: pd.DataFrame):
+        """
+        Fills one missing 'ATP Cohort' value; meant for DataFrame.apply().
+        A student with exactly one known cohort gets that cohort. Otherwise
+        the latest known cohort on or before the projected start date is
+        used, falling back to the projected start date itself.
+
+        Args:
+            id: str
+            projected_start_date: str
+            cohort_to_find: str
+            df_to_clean: pandas.DataFrame
+
+        Return:
+            numpy.datetime64
+        """
+        # Q: What to do with Service: ['Referral to External Service',
+        # 'Supportive Services Referral']
+        # TODO: Clean the NaTType before this function runs
+        if not pd.isna(cohort_to_find):
+            return np.datetime64(cohort_to_find)
+
+        # A student can have several rows; rows without a cohort are
+        # ignored. np.unique() also returns the cohorts sorted.
+        same_student = df_to_clean['Auto Id'] == id
+        known_cohorts = np.unique(
+            df_to_clean.loc[same_student, 'ATP Cohort']
+            .dropna().astype('datetime64[ns]').to_numpy())
+        if len(known_cohorts) == 1:
+            return known_cohorts[0]
+
+        estimated_module_date = np.datetime64(projected_start_date)
+        # Use the closest cohort that is not after the module date, not
+        # merely the earliest one the student ever took part in.
+        earlier_cohorts = known_cohorts[
+            known_cohorts <= estimated_module_date]
+        if len(earlier_cohorts) == 0:
+            # Nothing precedes this module (or no cohort is known at
+            # all): fall back to the projected start date itself.
+            return estimated_module_date
+
+        return earlier_cohorts[-1]
+
+    def Get_clean_data(self):
+        """
+        Runs every cleaning step on the raw data, then fills the missing
+        'ATP Cohort' values row by row.
+
+        Return:
+            pandas.DataFrame
+        """
+        cleaned = self.raw_data
+        steps = (self.__Drop_columns, self.__Fix_nan_values,
+                 self.__Rename_values, self.__Delete_values,
+                 self.__Set_data_types)
+        for step in steps:
+            cleaned = step(cleaned)
+
+        def fill_cohort(row):
+            return self.__Find_cohort(row['Auto Id'],
+                                      row['Projected Start Date'],
+                                      row['ATP Cohort'],
+                                      cleaned)
+        cleaned['ATP Cohort'] = cleaned.apply(fill_cohort, axis=1)
+        return cleaned
diff --git a/src/completion_rate_data.py b/src/completion_rate_data.py
new file mode 100644
index 0000000..095c447
--- /dev/null
+++ b/src/completion_rate_data.py
@@ -0,0 +1,75 @@
+import pandas as pd
+
+
+class Completion_rate_data:
+    def __init__(self, data):
+        """
+        Keeps the enrollments data and lists the modules to report on.
+
+        Args:
+            data: pandas.DataFrame
+        """
+        self.data = data
+
+        # Every pathway has four modules, M1 to M4. The entries are
+        # ordered pathway by pathway with the modules ascending, e.g.
+        # 'Web Development M1', ..., 'Web Development M4'.
+        pathway_names = (
+            'Web Development',
+            'Data Analysis',
+            'Software Development',
+            'Quality Assurance',
+            'User Experience',
+        )
+        self.__pathways = [
+            f'{pathway} M{module}'
+            for pathway in pathway_names
+            for module in range(1, 5)
+        ]
+
+ # Not the best Pandas way to do it:
+    def Get_completion_percentages(self,
+                                   cohort: str = 'All cohorts') -> pd.DataFrame:  # noqa
+        """
+        Creates a pandas.DataFrame that contains, for each module, the
+        share (0 to 1) of enrollments that ended in each outcome.
+
+        Args:
+            cohort: str
+
+        Return:
+            pandas.DataFrame
+        """
+        data = self.data
+        if cohort != 'All cohorts':
+            data = data[data['ATP Cohort'] == pd.Timestamp(cohort)]
+
+        # {module: {outcome: share}}. to_dict() avoids depending on the
+        # 'proportion' column name, which only pandas >= 2.0 produces.
+        completion_dictionary = {}
+        for path in self.__pathways:
+            outcomes = data.loc[data['Service'] == path, 'Outcome']
+            completion_dictionary[path] = outcomes.value_counts(
+                normalize=True).to_dict()
+
+        # Modules become rows; an outcome that never occurred for a
+        # module gets a share of 0.
+        result_df = pd.DataFrame(completion_dictionary).transpose().fillna(
+            0).rename_axis('Module').reset_index()
+
+        # 'Web Development M1' -> 'Web Development', to sort by pathway
+        result_df['Pathway'] = result_df['Module'].str.rsplit(' ', n=1).str[0]
+        return result_df
+
+    def Get_pathways_name(self, df: pd.DataFrame) -> list:
+        """
+        Collects the distinct 'Pathway' values, in order of appearance,
+        from a frame built by self.Get_completion_percentages().
+
+        Args:
+            df: pandas.DataFrame
+
+        Return:
+            list
+        """
+        return list(pd.unique(df['Pathway']))
diff --git a/src/most_common_pathways_taken_data.py b/src/most_common_pathways_taken_data.py
new file mode 100644
index 0000000..dc99483
--- /dev/null
+++ b/src/most_common_pathways_taken_data.py
@@ -0,0 +1,62 @@
+import pandas as pd
+
+
+class Most_common_pathways_taken_data:
+    def __init__(self, data: pd.DataFrame) -> None:
+        self.data = data
+        self.__starter_pathways = [  # first module (M1) of every pathway
+            'Web Development M1',
+            'Data Analysis M1',
+            'Software Development M1',
+            'Quality Assurance M1',
+            'User Experience M1',
+        ]
+        self.starter_only_df = self.Get_starting_pathways()  # M1 rows only
+
+    def Get_starting_pathways(self):
+        """
+        Returns the rows of self.data whose 'Service' is the first
+        module (M1) of a pathway.
+
+        Return:
+            pandas.DataFrame
+        """
+        enrollments = self.data
+        starts_a_pathway = enrollments['Service'].isin(
+            self.__starter_pathways)
+        # a boolean mask keeps the original index and column order
+        return enrollments[starts_a_pathway]
+
+    def Get_cohorts_list(self):
+        """
+        List of cohorts from starting pathways, oldest first, preceded
+        by the 'All cohorts' option. Rows without a cohort are skipped.
+
+        Return:
+            list
+        """
+        cohorts = self.starter_only_df['ATP Cohort']
+        # Skip literal 'NA' strings; dropna() also removes real NaN/NaT
+        # so that no 'NaT' entry reaches the list.
+        dates = pd.to_datetime(cohorts[cohorts != 'NA']).dropna()
+        labels = dates.sort_values().astype(str).unique()
+        return ['All cohorts', *labels]
+
+    def Get_data_by_cohort(self, cohort: str = 'All cohorts') -> pd.DataFrame:
+        """
+        Counts the starting-pathway enrollments per 'Service', either
+        for every cohort or for the single cohort given.
+
+        Args:
+            cohort: str
+
+        Return:
+            pandas.DataFrame
+        """
+        selected = self.starter_only_df
+        if cohort != 'All cohorts':
+            # normalize the label to pandas' timestamp text before
+            # comparing it with the 'ATP Cohort' column
+            cohort_key = str(pd.to_datetime(cohort))
+            selected = selected[selected['ATP Cohort'] == cohort_key]
+        return selected.value_counts('Service').reset_index()
diff --git a/src/notebooks/visualization_examples.ipynb b/src/notebooks/visualization_examples.ipynb
new file mode 100644
index 0000000..c40b6af
--- /dev/null
+++ b/src/notebooks/visualization_examples.ipynb
@@ -0,0 +1,236 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "b0c6df40",
+ "metadata": {},
+ "source": [
+ "# Visualization examples\n",
+ "\n",
+ "Visualization was not turned into a class because the project will use Google Looker for dashboard creation; this notebook only showcases how to use the Data Manipulation classes."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fc151064",
+ "metadata": {},
+ "source": [
+ "## Imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "47cd23cd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import plotly.express as px\n",
+ "from dash import Dash, dcc, html, Input, Output\n",
+ "from most_common_pathways_taken_data import Most_common_pathways_taken_data\n",
+ "from completion_rate_data import Completion_rate_data\n",
+ "from cleaning_enrollments_data import EnrollmentsCleaning"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cc61af47",
+ "metadata": {},
+ "source": [
+ "## Cleaning data\n",
+ "\n",
+ "This step should be done before the use of any of the Data classes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "ba57e157",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cleaner = EnrollmentsCleaning(pd.read_excel('Data\\\\Raw\\\\ARC Enrollments.xlsx'))\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4225b677",
+ "metadata": {},
+ "source": [
+ "## Most common pathway taken:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "fa1b6e02",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "def Dash_most_selected_path_by_cohort() -> Dash: # Need to pass the dataframe argument because of how the Data is structure\n",
+ " app = Dash(__name__)\n",
+ " # Const\n",
+ " data_class = Most_common_pathways_taken_data(cleaner.Get_clean_data())\n",
+ "\n",
+ " dropdown_options = data_class.Get_cohorts_list()\n",
+ " pathway_color = {\n",
+ " 'Web Development M1': 'blue',\n",
+ " 'Data Analysis M1': 'red', \n",
+ " 'Software Development M1': 'green',\n",
+ " 'Quality Assurance M1': 'yellow', \n",
+ " 'User Experience M1': 'purple'\n",
+ " }\n",
+ "\n",
+ " # Display\n",
+ " app.layout = html.Div([\n",
+ " html.H2('Cohorts', style={'text-align': \"center\"}),\n",
+ " html.P('Select Cohort:'),\n",
+ " dcc.Dropdown(\n",
+ " id=\"dropdown\",\n",
+ " options=dropdown_options,\n",
+ " value=dropdown_options[0],\n",
+ " clearable=False,\n",
+ " ),\n",
+ " dcc.Graph(id=\"graph\")\n",
+ " \n",
+ " ], style={'backgroundColor':'white'})\n",
+ "\n",
+ " @app.callback(\n",
+ " Output(\"graph\", \"figure\"),\n",
+ " Input(\"dropdown\", \"value\"))\n",
+ "\n",
+ " # Graph\n",
+ " def tt(time):\n",
+ " df = data_class.Get_data_by_cohort(time)\n",
+ " fig = px.pie(df, names='Service', values='count', color='Service', color_discrete_map=pathway_color)\n",
+ " return fig\n",
+ "\n",
+ " return app\n",
+ "\n",
+ " # TODO: Add number of students per each cohort \n",
+ " # TODO: Fix the options on the selection \n",
+ " # TODO: make colors better\n",
+ "\n",
+ "Dash_most_selected_path_by_cohort().run(debug=True, port=8052)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6b5b514e",
+ "metadata": {},
+ "source": [
+ "## Completion rates:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c0b7d44e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "def Dash_completion_rates_by_path() -> Dash: # TODO: fix data structure so visualization doesn't use df\n",
+ " app2 = Dash(__name__)\n",
+ " # Const\n",
+ " data_class = Completion_rate_data(cleaner.Get_clean_data())\n",
+ " completion_df = data_class.Get_completion_percentages()\n",
+ " options = data_class.Get_pathways_name(completion_df)\n",
+ "\n",
+ " # Display\n",
+ " app2.layout = html.Div([\n",
+ " html.H2('Pathways Completion', style={'text-align': \"center\"}),\n",
+ " html.P('Select pathway:'),\n",
+ " dcc.Dropdown(\n",
+ " id=\"dropdown\",\n",
+ " options=options,\n",
+ " value=options[0],\n",
+ " clearable=False,\n",
+ " ),\n",
+ " dcc.Graph(id=\"graph\")\n",
+ " \n",
+ " ], style={'backgroundColor':'white'})\n",
+ "\n",
+ " @app2.callback(\n",
+ " Output(\"graph\", \"figure\"),\n",
+ " Input(\"dropdown\", \"value\"))\n",
+ "\n",
+ " # Graph\n",
+ " # TODO: Need to add an extra selection box with the cohorts\n",
+ " def Display_pathway_completion(p):\n",
+ " df = completion_df[completion_df['Pathway'] == p]\n",
+ " fig = px.bar(df, x='Module', y='Successfully Completed')\n",
+ " return fig\n",
+ "\n",
+ " return app2\n",
+ "\n",
+ "Dash_completion_rates_by_path().run(debug=True, port=8053)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}