diff --git a/src/Carmen_WORCEmployment.py b/src/Carmen_WORCEmployment.py index 8da6613..e8b671d 100644 --- a/src/Carmen_WORCEmployment.py +++ b/src/Carmen_WORCEmployment.py @@ -1,5 +1,6 @@ import pandas as pd + def load_and_clean(file_path="data/WORC_Employment.xlsx"): """ Loads and cleans the WORC Employment dataset. @@ -18,12 +19,15 @@ def load_and_clean(file_path="data/WORC_Employment.xlsx"): worc_cols_dropped = worc.drop(columns=cols_to_drop, axis=1) # Clean up data types - worc_cols_dropped['Start Date'] = pd.to_datetime(worc_cols_dropped['Start Date']) - worc_cols_dropped['Salary'] = pd.to_numeric(worc_cols_dropped['Salary'], errors='coerce') + worc_cols_dropped['Start Date'] = pd.to_datetime(worc_cols_dropped['Start Date']) # noqa + worc_cols_dropped['Salary'] = pd.to_numeric(worc_cols_dropped['Salary'], + errors='coerce') - # Adjust salary that is listed as 60,000 to 28.84 for consistency with other salaries + # Adjust salary that is listed as 60,000 to 28.84 for + # consistency with other salaries # Took 60,000 / 2080hrs - 28.84 - worc_cols_dropped['Salary'] = worc_cols_dropped['Salary'].replace(60000, 28.84) + worc_cols_dropped['Salary'] = worc_cols_dropped['Salary'].replace(60000, 28.84) # noqa worc_clean = worc_cols_dropped + return worc_clean diff --git a/src/Carmen_WORCEmployment_Plots.py b/src/Carmen_WORCEmployment_Plots.py index 3c99d1e..9d4fa01 100644 --- a/src/Carmen_WORCEmployment_Plots.py +++ b/src/Carmen_WORCEmployment_Plots.py @@ -1,5 +1,3 @@ -import sys -import os import pandas as pd from Carmen_WORCEmployment import load_and_clean import matplotlib.pyplot as plt @@ -8,20 +6,26 @@ def plot_salary_by_gender(data): plt.figure(figsize=(8, 5)) - sns.boxplot(data=data, x='Gender', y='Salary') + sns.boxplot(data=data, + x='Gender', + y='Salary') plt.title("Salary Distribution by Gender") plt.show() def plot_avg_salary_by_city(data): region_salary = data.groupby('Mailing City')['Salary'].mean().sort_values() - region_salary.plot(kind='barh', 
class EnrollmentsCleaning:
    """
    Cleans the raw enrollments table before it is used by the data classes.

    Get_clean_data() runs these steps in order: drop unused columns, replace
    missing dates/outcomes with a placeholder, normalise service names,
    remove services that are out of scope, set the column dtypes and fill in
    missing 'ATP Cohort' values.

    Args:
        raw_data: pandas.DataFrame holding the raw enrollments export.
            It is never modified; every step works on a new frame.
    """

    # Columns removed outright; extend this list to drop more.
    COLUMNS_TO_DROP = ['Full Name']

    # Placeholder written into the columns below wherever a value is missing.
    # 'ATP Cohort' is deliberately absent: it is resolved by __Find_cohort.
    NAN_VALUE_SUBSTITUTE = 'NA'
    COLUMNS_TO_FIX = {
        'Projected Start Date': NAN_VALUE_SUBSTITUTE,
        'Actual Start Date': NAN_VALUE_SUBSTITUTE,
        'Projected End Date': NAN_VALUE_SUBSTITUTE,
        'Actual End Date': NAN_VALUE_SUBSTITUTE,
        'Outcome': NAN_VALUE_SUBSTITUTE,
    }

    # Rows whose column value is listed here are discarded.
    # 'Referral to External Service' and 'Supportive Services Referral'
    # are removed because they do not have a "Projected Start Date".
    VALUES_NOT_NEEDED = {
        'Service': ['Software Development 1',
                    'Software Development 2',
                    'Web Development 1', 'Web Development 2',
                    'Data Analysis 1', 'Data Analysis 2',
                    'Referral to External Service',
                    'Supportive Services Referral'],
    }

    # TODO: 'Projected Start Date', 'Actual Start Date',
    # 'Projected End Date', 'Actual End Date' are all datetime
    # types but are kept as str because of the 'NA' placeholder.
    COLUMN_DATATYPES = {
        'Auto Id': str, 'KY Region': str,
        'Assessment ID': str, 'EnrollmentId': str,
        'Enrollment Service Name': str,
        'Service': str,
        'Projected Start Date': str,
        'Actual Start Date': str,
        'Projected End Date': str,
        'Actual End Date': str,
        'Outcome': str,
        'ATP Cohort': 'datetime64[ns]',
    }

    def __init__(self, raw_data):
        self.raw_data = raw_data

    def __Drop_columns(self, df):
        """
        Deletes the columns not needed for the analysis, if you want to
        add columns to delete change the class constant 'COLUMNS_TO_DROP'.

        Args:
            df: pandas.DataFrame

        Return:
            pandas.DataFrame (a new frame, the input is left untouched)
        """
        return df.drop(columns=self.COLUMNS_TO_DROP)

    def __Fix_nan_values(self, df):
        """
        Replaces NaN in the columns listed in 'COLUMNS_TO_FIX' with their
        substitute value.

        Args:
            df: pandas.DataFrame

        Return:
            pandas.DataFrame (a new frame, the input is left untouched)
        """
        fixed = df.copy()
        for column, substitute_value in self.COLUMNS_TO_FIX.items():
            fixed[column] = fixed[column].fillna(substitute_value)
        return fixed

    def __Rename_values(self, df):
        """
        Changes values for consistency ('Data Analytics 2' is stored as
        'Data Analysis 2').

        Args:
            df: pandas.DataFrame

        Return:
            pandas.DataFrame (a new frame, the input is left untouched)
        """
        renamed = df.copy()
        renamed.loc[renamed['Service'] == 'Data Analytics 2',
                    'Service'] = 'Data Analysis 2'
        return renamed

    def __Delete_values(self, df):
        """
        Deletes rows not needed, if you want to add values to delete
        change the class constant 'VALUES_NOT_NEEDED'.

        Args:
            df: pandas.DataFrame

        Return:
            pandas.DataFrame (a new frame, the input is left untouched)
        """
        for column, values in self.VALUES_NOT_NEEDED.items():
            df = df[~df[column].isin(values)]
        # Return an independent frame so later column assignments do not
        # write into a slice of the caller's data.
        return df.copy()

    def __Set_data_types(self, df):
        """
        Sets the data type of each column listed in 'COLUMN_DATATYPES'.

        Args:
            df: pandas.DataFrame

        Return:
            pandas.DataFrame (a new frame, the input is left untouched)
        """
        return df.astype(self.COLUMN_DATATYPES)

    def __Find_cohort(self, id: str,
                      projected_start_date: str,
                      cohort_to_find,
                      df_to_clean: pd.DataFrame):
        """
        Resolves the 'ATP Cohort' of a single row.

        Rules, applied in order:
          1. A cohort that is already present is returned unchanged.
          2. If the student has exactly one known cohort in their other
             rows, that cohort is used.
          3. Otherwise the latest known cohort that is on or before the
             projected start date is used.
          4. If no known cohort precedes the projected start date (or the
             student has no known cohort at all) the projected start date
             itself is used.
          5. If the projected start date cannot be parsed (for example the
             'NA' placeholder) the cohort stays unknown (NaT).

        Args:
            id: str, the student's 'Auto Id'
            projected_start_date: str
            cohort_to_find: current 'ATP Cohort' value of the row
            df_to_clean: pandas.DataFrame with every enrollment row

        Return:
            pandas.Timestamp or pandas.NaT
        """
        if not pd.isna(cohort_to_find):
            return pd.Timestamp(cohort_to_find)

        student_rows = df_to_clean[df_to_clean['Auto Id'] == id]
        # Known cohorts of this student, de-duplicated, oldest first.
        known_cohorts = (pd.to_datetime(student_rows['ATP Cohort'].dropna())
                         .drop_duplicates()
                         .sort_values())

        if len(known_cohorts) == 1:
            return known_cohorts.iloc[0]

        projected = pd.to_datetime(projected_start_date, errors='coerce')
        if pd.isna(projected):
            return pd.NaT

        earlier_cohorts = known_cohorts[known_cohorts <= projected]
        if earlier_cohorts.empty:
            return projected
        return earlier_cohorts.iloc[-1]

    def Get_clean_data(self):
        """
        Cleans the raw data given to the constructor.

        Return:
            pandas.DataFrame, a cleaned copy; 'raw_data' is not modified.
        """
        df = self.__Drop_columns(self.raw_data)
        df = self.__Fix_nan_values(df)
        df = self.__Rename_values(df)
        df = self.__Delete_values(df)
        df = self.__Set_data_types(df)

        # Resolve every row against the frame as it is *before* any cohort
        # is filled in, then assign the whole column in one go.
        cohorts = [
            self.__Find_cohort(auto_id, projected_start, cohort, df)
            for auto_id, projected_start, cohort in zip(
                df['Auto Id'], df['Projected Start Date'], df['ATP Cohort'])
        ]
        df['ATP Cohort'] = pd.to_datetime(
            pd.Series(cohorts, index=df.index, dtype='object'))
        return df
class Completion_rate_data:
    """
    Computes, for every pathway module, the share of each 'Outcome'.

    Args:
        data: pandas.DataFrame with at least the columns 'Service',
            'Outcome' and 'ATP Cohort'.
    """

    def __init__(self, data):
        self.data = data
        pathway_names = (
            'Web Development',
            'Data Analysis',
            'Software Development',
            'Quality Assurance',
            'User Experience',
        )
        # Every pathway has four modules, named '<pathway> M1' .. 'M4'.
        self.__pathways = [
            f'{pathway} M{module}'
            for pathway in pathway_names
            for module in range(1, 5)
        ]

    def Get_completion_percentages(self,
                                   cohort: str = 'All cohorts') -> pd.DataFrame:  # noqa
        """
        Creates a pandas.DataFrame that contains the proportion of each
        outcome for each pathway module.

        The result has one row per module, a 'Module' column, one column
        per outcome found in the data (0 where a module never had that
        outcome) and a 'Pathway' column with the module suffix removed.

        Args:
            cohort: str, 'All cohorts' or a date accepted by pd.Timestamp

        Return:
            pandas.DataFrame
        """
        if cohort == 'All cohorts':
            data = self.data
        else:
            data = self.data[self.data['ATP Cohort'] == pd.Timestamp(cohort)]

        # value_counts(...).to_dict() maps outcome -> proportion directly,
        # so the code does not depend on the column names that
        # reset_index() produces in a particular pandas version.
        completion_dictionary = {
            path: data.loc[data['Service'] == path, 'Outcome']
            .value_counts(normalize=True)
            .to_dict()
            for path in self.__pathways
        }

        result_df = (pd.DataFrame(completion_dictionary)
                     .transpose()
                     .fillna(0)
                     .rename_axis('Module')
                     .reset_index())

        # Strip the trailing ' M<n>' so rows can be grouped by pathway.
        result_df['Pathway'] = result_df['Module'].apply(
            lambda module: module.rsplit(' ', 1)[0])
        return result_df

    def Get_pathways_name(self, df: pd.DataFrame) -> list:
        """
        List of all the pathways in a pandas.DataFrame generated by
        self.Get_completion_percentages().

        Args:
            df: pandas.DataFrame

        Return:
            list
        """
        return list(df['Pathway'].unique())
class Most_common_pathways_taken_data:
    """
    Counts how often each starting module (M1) was taken, overall or for a
    single cohort.

    Args:
        data: pandas.DataFrame with at least the columns 'Service' and
            'ATP Cohort'.
    """

    def __init__(self, data):
        self.data = data
        self.__starter_pathways = [
            'Web Development M1',
            'Data Analysis M1',
            'Software Development M1',
            'Quality Assurance M1',
            'User Experience M1',
        ]
        self.starter_only_df = self.Get_starting_pathways()

    def Get_starting_pathways(self):
        """
        Returns the rows of the data whose 'Service' is one of the
        starting pathways.

        Return:
            pandas.DataFrame
        """
        is_starter = self.data['Service'].isin(self.__starter_pathways)
        return self.data[is_starter]

    def Get_cohorts_list(self):
        """
        List of cohorts found in the starting pathways, oldest first,
        preceded by the 'All cohorts' option.

        Values that are not dates (NaT, NaN or the 'NA' placeholder) are
        left out so they never show up as a selectable cohort.

        Return:
            list of str
        """
        cohort_dates = pd.to_datetime(self.starter_only_df['ATP Cohort'],
                                      errors='coerce').dropna()
        labels = cohort_dates.sort_values(ascending=True).astype(str).unique()
        return ['All cohorts', *labels]

    def Get_data_by_cohort(self, cohort: str = 'All cohorts') -> pd.DataFrame:
        """
        Returns the number of enrollments per starting pathway for a
        specific cohort or for all cohorts.

        Args:
            cohort: str, 'All cohorts' or one of the values returned by
                Get_cohorts_list()

        Return:
            pandas.DataFrame with the columns 'Service' and 'count',
            most frequent service first.
        """
        df = self.starter_only_df
        if cohort != 'All cohorts':
            # Compare dates with dates: the column is normalised first so
            # the filter works whether it holds datetimes or strings.
            cohort_dates = pd.to_datetime(df['ATP Cohort'], errors='coerce')
            df = df[cohort_dates == pd.to_datetime(cohort)]

        # Name the columns explicitly instead of relying on the defaults,
        # which differ between pandas versions.
        return (df['Service']
                .value_counts()
                .rename_axis('Service')
                .reset_index(name='count'))
+ + Args: + df: pandas.DataFrame + + Return: + list + """ + df = self.starter_only_df + cohorts = list(pd.to_datetime(df['ATP Cohort'][df['ATP Cohort'] != 'NA']).sort_values(ascending=True).astype(str).unique()) # noqa + cohorts.insert(0, 'All cohorts') + return cohorts + + def Get_data_by_cohort(self, cohort: str = 'All cohorts') -> pd.DataFrame: + """ + Returns a pandas.DataFrame for a specific cohort or all cohorts. + + Args: + df: pandas.DataFrame + cohort: str + + Return: + pandas.DataFrame + """ + df = self.starter_only_df + if cohort == 'All cohorts': + result = df.value_counts('Service').reset_index() + else: + result = df[df['ATP Cohort'] == str(pd.to_datetime(cohort))].value_counts('Service').reset_index() # noqa + + return result diff --git a/src/notebooks/visualization_examples.ipynb b/src/notebooks/visualization_examples.ipynb new file mode 100644 index 0000000..c40b6af --- /dev/null +++ b/src/notebooks/visualization_examples.ipynb @@ -0,0 +1,236 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b0c6df40", + "metadata": {}, + "source": [ + "# Visualization examples\n", + "\n", + "Visualization was not turned into a class because the project will use Google Looker for dashboard creation; this notebook only serves to showcase how to use the Data Manipulation classes." 
+ ] + }, + { + "cell_type": "markdown", + "id": "fc151064", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "47cd23cd", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import plotly.express as px\n", + "from dash import Dash, dcc, html, Input, Output\n", + "from most_common_pathways_taken_data import Most_common_pathways_taken_data\n", + "from completion_rate_data import Completion_rate_data\n", + "from cleaning_enrollments_data import EnrollmentsCleaning" + ] + }, + { + "cell_type": "markdown", + "id": "cc61af47", + "metadata": {}, + "source": [ + "## Cleaning data\n", + "\n", + "This step should be done before the use of any of the Data classes" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "ba57e157", + "metadata": {}, + "outputs": [], + "source": [ + "cleaner = EnrollmentsCleaning(pd.read_excel('Data\\\\Raw\\\\ARC Enrollments.xlsx'))\n" + ] + }, + { + "cell_type": "markdown", + "id": "4225b677", + "metadata": {}, + "source": [ + "## Most common pathway taken:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "fa1b6e02", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def Dash_most_selected_path_by_cohort() -> Dash: # Need to pass the dataframe argument because of how the Data is structure\n", + " app = Dash(__name__)\n", + " # Const\n", + " data_class = Most_common_pathways_taken_data(cleaner.Get_clean_data())\n", + "\n", + " dropdown_options = data_class.Get_cohorts_list()\n", + " pathway_color = {\n", + " 'Web Development M1': 'blue',\n", + " 'Data Analysis M1': 'red', \n", + " 'Software Development M1': 'green',\n", + " 'Quality Assurance M1': 'yellow', \n", + " 'User Experience M1': 'purple'\n", + " }\n", + "\n", + " # Display\n", + " app.layout = html.Div([\n", + " 
html.H2('Cohorts', style={'text-align': \"center\"}),\n", + " html.P('Select Cohort:'),\n", + " dcc.Dropdown(\n", + " id=\"dropdown\",\n", + " options=dropdown_options,\n", + " value=dropdown_options[0],\n", + " clearable=False,\n", + " ),\n", + " dcc.Graph(id=\"graph\")\n", + " \n", + " ], style={'backgroundColor':'white'})\n", + "\n", + " @app.callback(\n", + " Output(\"graph\", \"figure\"),\n", + " Input(\"dropdown\", \"value\"))\n", + "\n", + " # Graph\n", + " def tt(time):\n", + " df = data_class.Get_data_by_cohort(time)\n", + " fig = px.pie(df, names='Service', values='count', color='Service', color_discrete_map=pathway_color)\n", + " return fig\n", + "\n", + " return app\n", + "\n", + " # TODO: Add number of students per each cohort \n", + " # TODO: Fix the options on the selection \n", + " # TODO: make colors better\n", + "\n", + "Dash_most_selected_path_by_cohort().run(debug=True, port=8052)" + ] + }, + { + "cell_type": "markdown", + "id": "6b5b514e", + "metadata": {}, + "source": [ + "## Completion rates:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0b7d44e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def Dash_completion_rates_by_path() -> Dash: # TODO: fix data structure so visualization doesn't use df\n", + " app2 = Dash(__name__)\n", + " # Const\n", + " data_class = Completion_rate_data(cleaner.Get_clean_data())\n", + " completion_df = data_class.Get_completion_percentages()\n", + " options = data_class.Get_pathways_name(completion_df)\n", + "\n", + " # Display\n", + " app2.layout = html.Div([\n", + " html.H2('Pathways Completion', style={'text-align': \"center\"}),\n", + " html.P('Select pathway:'),\n", + " dcc.Dropdown(\n", + " id=\"dropdown\",\n", + " options=options,\n", + " value=options[0],\n", + " clearable=False,\n", + " ),\n", + " dcc.Graph(id=\"graph\")\n", 
+ " \n", + " ], style={'backgroundColor':'white'})\n", + "\n", + " @app2.callback(\n", + " Output(\"graph\", \"figure\"),\n", + " Input(\"dropdown\", \"value\"))\n", + "\n", + " # Graph\n", + " # TODO: Need to add an extra selection box with the cohorts\n", + " def Display_pathway_completion(p):\n", + " df = completion_df[completion_df['Pathway'] == p]\n", + " fig = px.bar(df, x='Module', y='Successfully Completed')\n", + " return fig\n", + "\n", + " return app2\n", + "\n", + "Dash_completion_rates_by_path().run(debug=True, port=8053)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}