12 changes: 8 additions & 4 deletions src/Carmen_WORCEmployment.py
@@ -1,5 +1,6 @@
import pandas as pd


def load_and_clean(file_path="data/WORC_Employment.xlsx"):
"""
Loads and cleans the WORC Employment dataset.
@@ -18,12 +19,15 @@ def load_and_clean(file_path="data/WORC_Employment.xlsx"):
worc_cols_dropped = worc.drop(columns=cols_to_drop, axis=1)

# Clean up data types
worc_cols_dropped['Start Date'] = pd.to_datetime(worc_cols_dropped['Start Date'])
worc_cols_dropped['Salary'] = pd.to_numeric(worc_cols_dropped['Salary'], errors='coerce')
worc_cols_dropped['Start Date'] = pd.to_datetime(worc_cols_dropped['Start Date']) # noqa
worc_cols_dropped['Salary'] = pd.to_numeric(worc_cols_dropped['Salary'],
errors='coerce')

# Adjust salary that is listed as 60,000 to 28.84 for consistency with other salaries
# Adjust salary that is listed as 60,000 to 28.84 for
# consistency with other salaries
# Took 60,000 / 2,080 hrs ≈ 28.84
worc_cols_dropped['Salary'] = worc_cols_dropped['Salary'].replace(60000, 28.84)
worc_cols_dropped['Salary'] = worc_cols_dropped['Salary'].replace(60000, 28.84) # noqa

worc_clean = worc_cols_dropped

return worc_clean
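
A minimal usage sketch for this loader; the call below is not part of the diff and assumes the workbook exists at the default path with the columns referenced above:

# Hypothetical usage of load_and_clean (illustrative only).
from Carmen_WORCEmployment import load_and_clean

worc = load_and_clean("data/WORC_Employment.xlsx")
print(worc.dtypes)                # 'Start Date' should now be datetime64[ns]
print(worc['Salary'].describe())  # hourly rates after the 60,000 -> 28.84 fix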
22 changes: 15 additions & 7 deletions src/Carmen_WORCEmployment_Plots.py
@@ -1,5 +1,3 @@
import sys
import os
import pandas as pd
from Carmen_WORCEmployment import load_and_clean
import matplotlib.pyplot as plt
@@ -8,36 +6,46 @@

def plot_salary_by_gender(data):
plt.figure(figsize=(8, 5))
sns.boxplot(data=data, x='Gender', y='Salary')
sns.boxplot(data=data,
x='Gender',
y='Salary')
plt.title("Salary Distribution by Gender")
plt.show()


def plot_avg_salary_by_city(data):
region_salary = data.groupby('Mailing City')['Salary'].mean().sort_values()
region_salary.plot(kind='barh', figsize=(8, 5), title="Average Salary by KY Region")
region_salary.plot(kind='barh',
figsize=(8, 5),
title="Average Salary by KY Region")
plt.xlabel("Average Salary")
plt.show()


def plot_placements_over_time(data):
data.set_index('Start Date').resample('M').size().plot(kind='line', marker='o', figsize=(10, 4))
data.set_index('Start Date').resample('M').size().plot(kind='line',
marker='o',
figsize=(10, 4))
plt.title("Number of Placements Over Time")
plt.ylabel("Placements")
plt.show()


def plot_placement_type_by_program(data):
plt.figure(figsize=(10, 6))
sns.countplot(data=data, x='ATP Placement Type', hue='Program: Program Name')
sns.countplot(data=data,
x='ATP Placement Type',
hue='Program: Program Name')
plt.xticks(rotation=45)
plt.title("Placement Type by Program")
plt.show()


def plot_top_cities(data):
city_counts = data['Mailing City'].value_counts().head(10)
city_counts.plot(kind='bar', title='Top Cities by Participant Count', figsize=(8, 4))
city_counts.plot(kind='bar',
title='Top Cities by Participant Count',
figsize=(8, 4))
plt.ylabel("Count")
plt.show()

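
A short driver sketch for the plotting helpers above; the call order is an assumption, and it presumes the cleaned frame carries the 'Gender', 'Salary', 'Mailing City' and 'Start Date' columns the plots reference:

# Hypothetical driver script (illustrative only, not part of this diff).
from Carmen_WORCEmployment import load_and_clean
from Carmen_WORCEmployment_Plots import (plot_salary_by_gender,
plot_avg_salary_by_city, plot_placements_over_time)

worc = load_and_clean()
plot_salary_by_gender(worc)      # boxplot of salary by gender
plot_avg_salary_by_city(worc)    # horizontal bar chart by mailing city
plot_placements_over_time(worc)  # monthly placement counts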
186 changes: 186 additions & 0 deletions src/cleaning_enrollments_data.py
@@ -0,0 +1,186 @@
import pandas as pd
import numpy as np


class EnrollmentsCleaning:
def __init__(self, raw_data):
self.raw_data = raw_data

def __Drop_columns(self, df):
"""
Deletes the columns not needed for the analysis;
to drop additional columns, extend the constant
'COLUMNS_TO_DROP'.

Args:
df: pandas.DataFrame

Return:
pandas.DataFrame
"""
COLUMNS_TO_DROP = ['Full Name']
result = df.drop(columns=COLUMNS_TO_DROP)
return result

def __Fix_nan_values(self, df):
"""
Replaces NaN values with a placeholder value.

Args:
df: pandas.DataFrame

Return:
pandas.DataFrame
"""
# Fix NaN values
NAN_VALUE_SUBSTITUTE = 'NA'
columns_to_fix = {
'Projected Start Date': NAN_VALUE_SUBSTITUTE,
'Actual Start Date': NAN_VALUE_SUBSTITUTE,
'Projected End Date': NAN_VALUE_SUBSTITUTE,
'Actual End Date': NAN_VALUE_SUBSTITUTE,
'Outcome': NAN_VALUE_SUBSTITUTE
}
# NaN values in 'ATP Cohort' are handled in a separate function
for column, substitute_value in columns_to_fix.items():
df[column] = df[column].fillna(substitute_value)

return df

def __Rename_values(self, df):
"""
Changes values for consistency.

Args:
df: pandas.DataFrame

Return:
pandas.DataFrame
"""
df.loc[df['Service'] == 'Data Analytics 2',
'Service'] = 'Data Analysis 2'
return df

def __Delete_values(self, df):
"""
Deletes rows whose values are not needed; to delete additional
values, extend the constant 'VALUES_NOT_NEEDED'.


Args:
df: pandas.DataFrame

Return:
pandas.DataFrame
"""
# 'Referral to External Service' and 'Supportive Services Referral'
# are deleted because they don't have a "Projected Start Date"
VALUES_NOT_NEEDED = {
'Service': ['Software Development 1',
'Software Development 2',
'Web Development 1', 'Web Development 2',
'Data Analysis 1', 'Data Analysis 2',
'Referral to External Service',
'Supportive Services Referral']
}
for column, value in VALUES_NOT_NEEDED.items():
df = df[~df[column].isin(value)]
return df

def __Set_data_types(self, df):
"""
Sets data type for each column.

Args:
df: pandas.DataFrame

Return:
pandas.DataFrame
"""
# DataTypes
column_datatype: dict = {'Auto Id': str, 'KY Region': str,
'Assessment ID': str, 'EnrollmentId': str,
'Enrollment Service Name': str,
'Service': str,
'Projected Start Date': str,
'Actual Start Date': str,
'Projected End Date': str,
'Actual End Date': str,
'Outcome': str,
'ATP Cohort': 'datetime64[ns]'}
# TODO: 'Projected Start Date', 'Actual Start Date',
# 'Projected End Date' and 'Actual End Date' are really datetime
# columns, but they are kept as str because their NaN values were
# replaced with the 'NA' placeholder

for column, dtype in column_datatype.items():
df[column] = df[column].astype(dtype)
return df

def __Find_cohort(self, auto_id: str,
projected_start_date: str,
cohort_to_find: str,
df_to_clean: pd.DataFrame):
"""
Finds a value for each NaN in the 'ATP Cohort' column.
This function is designed to be applied row-wise with
pandas.DataFrame.apply().

Args:
auto_id: str
projected_start_date: str
cohort_to_find: str
df_to_clean: pandas.DataFrame

Return:
numpy.datetime64
"""
# Q: What to do with Service: ['Referral to External Service',
# 'Supportive Services Referral']
# TODO: Clean the NaTType before this function runs
if pd.isna(cohort_to_find):
student_df = df_to_clean[df_to_clean['Auto Id'] == auto_id]
# remove 'ATP Cohort' NaN values; there can be more than one
student_df: pd.DataFrame = student_df[~student_df['ATP Cohort']
.isna()]
cohorts_participated = student_df['ATP Cohort'].astype(
'datetime64[ns]').unique()

if len(cohorts_participated) == 1:
return cohorts_participated[0]
else:
# Insert the projected start date among the known cohort dates
# and return the cohort date immediately preceding it.
estimated_module_date = np.datetime64(projected_start_date)
cohorts_participated = np.append(
cohorts_participated, estimated_module_date)
cohorts_participated.sort()
previous_date = cohorts_participated[0]
for cohort in cohorts_participated:
if estimated_module_date == cohort:
return previous_date
previous_date = cohort
else:
return np.datetime64(cohort_to_find)
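# A worked example of the fallback above, with assumed dates that are not
# taken from the real data:
#   known cohorts for the student -> [2023-09-01, 2024-01-15]
#   projected start date          -> 2023-11-01
#   after append + sort           -> [2023-09-01, 2023-11-01, 2024-01-15]
#   value returned for ATP Cohort -> 2023-09-01, the cohort immediately
#                                    before the projected start date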

def Get_clean_data(self):
"""
Cleans the raw data.

Return:
pandas.DataFrame
"""
df = self.raw_data
df = self.__Drop_columns(df)
df = self.__Fix_nan_values(df)
df = self.__Rename_values(df)
df = self.__Delete_values(df)
df = self.__Set_data_types(df)
df['ATP Cohort'] = df.apply(lambda row: self.__Find_cohort(
row['Auto Id'],
row['Projected Start Date'],
row['ATP Cohort'],
df), axis=1)
return df
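
A minimal sketch of how the cleaner might be driven end to end; the Excel file name is an assumption, since the class only requires a raw DataFrame:

# Hypothetical usage of EnrollmentsCleaning; 'data/Enrollments.xlsx' is an
# assumed path, not something referenced in this diff.
import pandas as pd
from cleaning_enrollments_data import EnrollmentsCleaning

raw = pd.read_excel("data/Enrollments.xlsx")
clean = EnrollmentsCleaning(raw).Get_clean_data()
print(clean['ATP Cohort'].isna().sum())  # cohort gaps left after __Find_cohort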
75 changes: 75 additions & 0 deletions src/completion_rate_data.py
@@ -0,0 +1,75 @@
import pandas as pd


class Completion_rate_data:
def __init__(self, data):
self.data = data
self.__pathways = [
'Web Development M1',
'Web Development M2',
'Web Development M3',
'Web Development M4',
'Data Analysis M1',
'Data Analysis M2',
'Data Analysis M3',
'Data Analysis M4',
'Software Development M1',
'Software Development M2',
'Software Development M3',
'Software Development M4',
'Quality Assurance M1',
'Quality Assurance M2',
'Quality Assurance M3',
'Quality Assurance M4',
'User Experience M1',
'User Experience M2',
'User Experience M3',
'User Experience M4',
]

# Not the best Pandas way to do it:
def Get_completion_percentages(self,
cohort: str = 'All cohorts') -> pd.DataFrame: # noqa
"""
Creates a pandas.DataFrame that contains the completion
percentages for each pathway module.

Args:
cohort: str

Return:
pandas.DataFrame
"""
if cohort == 'All cohorts':
data = self.data
else:
data = self.data[self.data['ATP Cohort'] == pd.Timestamp(cohort)]

completion_dictionary = {}

for path in self.__pathways:
outcome = data[data['Service'] == path]['Outcome'].value_counts(
normalize=True).reset_index()
completion_dictionary[path] = {
row.Outcome: row.proportion for row in outcome.itertuples(index=True)} # noqa

result_df = pd.DataFrame(completion_dictionary).transpose().fillna(
0).rename_axis('Module').reset_index()

result_df['Pathway'] = result_df['Module'].apply(
# intended to be able to sort by pathway
lambda x: x[:x.rfind(' ')])
return result_df

def Get_pathways_name(self, df: pd.DataFrame) -> list:
"""
List of all the pathways in a pandas.DataFrame generated by
self.Get_completion_percentages().

Args:
df: pandas.DataFrame

Return:
list
"""
return list(df['Pathway'].unique())
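
A sketch of how the completion-rate table might be consumed downstream; the source path and the cohort date passed in are assumptions:

# Hypothetical usage of Completion_rate_data on an already-cleaned frame.
import pandas as pd
from cleaning_enrollments_data import EnrollmentsCleaning
from completion_rate_data import Completion_rate_data

raw = pd.read_excel("data/Enrollments.xlsx")  # assumed path
clean = EnrollmentsCleaning(raw).Get_clean_data()
rates = Completion_rate_data(clean)

all_cohorts = rates.Get_completion_percentages()             # every cohort
one_cohort = rates.Get_completion_percentages('2024-01-15')  # assumed cohort date
print(rates.Get_pathways_name(all_cohorts))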