12 changes: 8 additions & 4 deletions src/Carmen_WORCEmployment.py
@@ -1,5 +1,6 @@
import pandas as pd


def load_and_clean(file_path="data/WORC_Employment.xlsx"):
"""
Loads and cleans the WORC Employment dataset.
@@ -18,12 +19,15 @@ def load_and_clean(file_path="data/WORC_Employment.xlsx"):
worc_cols_dropped = worc.drop(columns=cols_to_drop, axis=1)

# Clean up data types
worc_cols_dropped['Start Date'] = pd.to_datetime(worc_cols_dropped['Start Date'])
worc_cols_dropped['Salary'] = pd.to_numeric(worc_cols_dropped['Salary'], errors='coerce')
worc_cols_dropped['Start Date'] = pd.to_datetime(worc_cols_dropped['Start Date']) # noqa
worc_cols_dropped['Salary'] = pd.to_numeric(worc_cols_dropped['Salary'],
errors='coerce')

# Adjust salary that is listed as 60,000 to 28.84 for consistency with other salaries
# Adjust salary that is listed as 60,000 to 28.84 for
# consistency with other salaries
# Took 60,000 / 2,080 hrs ≈ 28.84
worc_cols_dropped['Salary'] = worc_cols_dropped['Salary'].replace(60000, 28.84)
worc_cols_dropped['Salary'] = worc_cols_dropped['Salary'].replace(60000, 28.84) # noqa

worc_clean = worc_cols_dropped

return worc_clean
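
A minimal usage sketch for this loader; the call below is not part of the diff and assumes the workbook exists at the default path with the columns referenced above:

# Hypothetical usage of load_and_clean (illustrative only).
from Carmen_WORCEmployment import load_and_clean

worc = load_and_clean("data/WORC_Employment.xlsx")
print(worc.dtypes)                # 'Start Date' should now be datetime64[ns]
print(worc['Salary'].describe())  # hourly rates after the 60,000 -> 28.84 fix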
22 changes: 15 additions & 7 deletions src/Carmen_WORCEmployment_Plots.py
@@ -1,5 +1,3 @@
import sys
import os
import pandas as pd
from Carmen_WORCEmployment import load_and_clean
import matplotlib.pyplot as plt
@@ -8,36 +6,46 @@

def plot_salary_by_gender(data):
plt.figure(figsize=(8, 5))
sns.boxplot(data=data, x='Gender', y='Salary')
sns.boxplot(data=data,
x='Gender',
y='Salary')
plt.title("Salary Distribution by Gender")
plt.show()


def plot_avg_salary_by_city(data):
region_salary = data.groupby('Mailing City')['Salary'].mean().sort_values()
region_salary.plot(kind='barh', figsize=(8, 5), title="Average Salary by KY Region")
region_salary.plot(kind='barh',
figsize=(8, 5),
title="Average Salary by KY Region")
plt.xlabel("Average Salary")
plt.show()


def plot_placements_over_time(data):
data.set_index('Start Date').resample('M').size().plot(kind='line', marker='o', figsize=(10, 4))
data.set_index('Start Date').resample('M').size().plot(kind='line',
marker='o',
figsize=(10, 4))
plt.title("Number of Placements Over Time")
plt.ylabel("Placements")
plt.show()


def plot_placement_type_by_program(data):
plt.figure(figsize=(10, 6))
sns.countplot(data=data, x='ATP Placement Type', hue='Program: Program Name')
sns.countplot(data=data,
x='ATP Placement Type',
hue='Program: Program Name')
plt.xticks(rotation=45)
plt.title("Placement Type by Program")
plt.show()


def plot_top_cities(data):
city_counts = data['Mailing City'].value_counts().head(10)
city_counts.plot(kind='bar', title='Top Cities by Participant Count', figsize=(8, 4))
city_counts.plot(kind='bar',
title='Top Cities by Participant Count',
figsize=(8, 4))
plt.ylabel("Count")
plt.show()

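
A short driver sketch for the plotting helpers above; the call order is an assumption, and it presumes the cleaned frame carries the 'Gender', 'Salary', 'Mailing City' and 'Start Date' columns the plots reference:

# Hypothetical driver script (illustrative only, not part of this diff).
from Carmen_WORCEmployment import load_and_clean
from Carmen_WORCEmployment_Plots import (plot_salary_by_gender,
plot_avg_salary_by_city, plot_placements_over_time)

worc = load_and_clean()
plot_salary_by_gender(worc)      # boxplot of salary by gender
plot_avg_salary_by_city(worc)    # horizontal bar chart by mailing city
plot_placements_over_time(worc)  # monthly placement counts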
186 changes: 186 additions & 0 deletions src/cleaning_enrollments_data.py
@@ -0,0 +1,186 @@
import pandas as pd
import numpy as np


class EnrollmentsCleaning:
def __init__(self, raw_data):
self.raw_data = raw_data

def __Drop_columns(self, df):
"""
Deletes the columns not needed for the analysis;
to drop additional columns, extend the constant
'COLUMNS_TO_DROP'.

Args:
df: pandas.DataFrame

Return:
pandas.DataFrame
"""
COLUMNS_TO_DROP = ['Full Name']
result = df.drop(columns=COLUMNS_TO_DROP)
return result

def __Fix_nan_values(self, df):
"""
Replaces NaN values with a placeholder value.

Args:
df: pandas.DataFrame

Return:
pandas.DataFrame
"""
# Fix NaN values
NAN_VALUE_SUBSTITUTE = 'NA'
columns_to_fix = {
'Projected Start Date': NAN_VALUE_SUBSTITUTE,
'Actual Start Date': NAN_VALUE_SUBSTITUTE,
'Projected End Date': NAN_VALUE_SUBSTITUTE,
'Actual End Date': NAN_VALUE_SUBSTITUTE,
'Outcome': NAN_VALUE_SUBSTITUTE
}
# NaN values in 'ATP Cohort' are handled in a separate function
for column, substitute_value in columns_to_fix.items():
df[column] = df[column].fillna(substitute_value)

return df

def __Rename_values(self, df):
"""
Changes values for consistency.

Args:
df: pandas.DataFrame

Return:
pandas.DataFrame
"""
df.loc[df['Service'] == 'Data Analytics 2',
'Service'] = 'Data Analysis 2'
return df

def __Delete_values(self, df):
"""
Deletes rows whose values are not needed; to delete additional
values, extend the constant 'VALUES_NOT_NEEDED'.


Args:
df: pandas.DataFrame

Return:
pandas.DataFrame
"""
# 'Referral to External Service' and 'Supportive Services Referral'
# are deleted because they don't have a "Projected Start Date"
VALUES_NOT_NEEDED = {
'Service': ['Software Development 1',
'Software Development 2',
'Web Development 1', 'Web Development 2',
'Data Analysis 1', 'Data Analysis 2',
'Referral to External Service',
'Supportive Services Referral']
}
for column, value in VALUES_NOT_NEEDED.items():
df = df[~df[column].isin(value)]
return df

def __Set_data_types(self, df):
"""
Sets data type for each column.

Args:
df: pandas.DataFrame

Return:
pandas.DataFrame
"""
# DataTypes
column_datatype: dict = {'Auto Id': str, 'KY Region': str,
'Assessment ID': str, 'EnrollmentId': str,
'Enrollment Service Name': str,
'Service': str,
'Projected Start Date': str,
'Actual Start Date': str,
'Projected End Date': str,
'Actual End Date': str,
'Outcome': str,
'ATP Cohort': 'datetime64[ns]'}
# TODO: 'Projected Start Date', 'Actual Start Date',
# 'Projected End Date' and 'Actual End Date' are really datetime
# columns, but they are kept as str because their NaN values were
# replaced with the 'NA' placeholder

for column, dtype in column_datatype.items():
df[column] = df[column].astype(dtype)
return df

def __Find_cohort(self, auto_id: str,
projected_start_date: str,
cohort_to_find: str,
df_to_clean: pd.DataFrame):
"""
Finds a value for each NaN in the 'ATP Cohort' column.
This function is designed to be applied row-wise with
pandas.DataFrame.apply().

Args:
auto_id: str
projected_start_date: str
cohort_to_find: str
df_to_clean: pandas.DataFrame

Return:
numpy.datetime64
"""
# Q: What to do with Service: ['Referral to External Service',
# 'Supportive Services Referral']
# TODO: Clean the NaTType before this function runs
if pd.isna(cohort_to_find):
student_df = df_to_clean[df_to_clean['Auto Id'] == auto_id]
# remove 'ATP Cohort' NaN values; there can be more than one
student_df: pd.DataFrame = student_df[~student_df['ATP Cohort']
.isna()]
cohorts_participated = student_df['ATP Cohort'].astype(
'datetime64[ns]').unique()

if len(cohorts_participated) == 1:
return cohorts_participated[0]
else:
# Insert the projected start date among the known cohort dates
# and return the cohort date immediately preceding it.
estimated_module_date = np.datetime64(projected_start_date)
cohorts_participated = np.append(
cohorts_participated, estimated_module_date)
cohorts_participated.sort()
previous_date = cohorts_participated[0]
for cohort in cohorts_participated:
if estimated_module_date == cohort:
return previous_date
previous_date = cohort
else:
return np.datetime64(cohort_to_find)
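# A worked example of the fallback above, with assumed dates that are not
# taken from the real data:
#   known cohorts for the student -> [2023-09-01, 2024-01-15]
#   projected start date          -> 2023-11-01
#   after append + sort           -> [2023-09-01, 2023-11-01, 2024-01-15]
#   value returned for ATP Cohort -> 2023-09-01, the cohort immediately
#                                    before the projected start date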

def Get_clean_data(self):
"""
Cleans the raw data.

Return:
pandas.DataFrame
"""
df = self.raw_data
df = self.__Drop_columns(df)
df = self.__Fix_nan_values(df)
df = self.__Rename_values(df)
df = self.__Delete_values(df)
df = self.__Set_data_types(df)
df['ATP Cohort'] = df.apply(lambda row: self.__Find_cohort(
row['Auto Id'],
row['Projected Start Date'],
row['ATP Cohort'],
df), axis=1)
return df
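
A minimal sketch of how the cleaner might be driven end to end; the Excel file name is an assumption, since the class only requires a raw DataFrame:

# Hypothetical usage of EnrollmentsCleaning; 'data/Enrollments.xlsx' is an
# assumed path, not something referenced in this diff.
import pandas as pd
from cleaning_enrollments_data import EnrollmentsCleaning

raw = pd.read_excel("data/Enrollments.xlsx")
clean = EnrollmentsCleaning(raw).Get_clean_data()
print(clean['ATP Cohort'].isna().sum())  # cohort gaps left after __Find_cohort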
75 changes: 75 additions & 0 deletions src/completion_rate_data.py
@@ -0,0 +1,75 @@
import pandas as pd


class Completion_rate_data:
def __init__(self, data):
self.data = data
self.__pathways = [
'Web Development M1',
'Web Development M2',
'Web Development M3',
'Web Development M4',
'Data Analysis M1',
'Data Analysis M2',
'Data Analysis M3',
'Data Analysis M4',
'Software Development M1',
'Software Development M2',
'Software Development M3',
'Software Development M4',
'Quality Assurance M1',
'Quality Assurance M2',
'Quality Assurance M3',
'Quality Assurance M4',
'User Experience M1',
'User Experience M2',
'User Experience M3',
'User Experience M4',
]

# Not the best Pandas way to do it:
def Get_completion_percentages(self,
cohort: str = 'All cohorts') -> pd.DataFrame: # noqa
"""
Creates a pandas.DataFrame that contains the completion
percentages for each pathway module.

Args:
cohort: str

Return:
pandas.DataFrame
"""
if cohort == 'All cohorts':
data = self.data
else:
data = self.data[self.data['ATP Cohort'] == pd.Timestamp(cohort)]

completion_dictionary = {}

for path in self.__pathways:
outcome = data[data['Service'] == path]['Outcome'].value_counts(
normalize=True).reset_index()
completion_dictionary[path] = {
row.Outcome: row.proportion for row in outcome.itertuples(index=True)} # noqa

result_df = pd.DataFrame(completion_dictionary).transpose().fillna(
0).rename_axis('Module').reset_index()

result_df['Pathway'] = result_df['Module'].apply(
# intended to be able to sort by pathway
lambda x: x[:x.rfind(' ')])
return result_df

def Get_pathways_name(self, df: pd.DataFrame) -> list:
"""
List of all the pathways in a pandas.DataFrame generated by
self.Get_completion_percentages().

Args:
df: pandas.DataFrame

Return:
list
"""
return list(df['Pathway'].unique())
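
A sketch of how the completion-rate table might be consumed downstream; the source path and the cohort date passed in are assumptions:

# Hypothetical usage of Completion_rate_data on an already-cleaned frame.
import pandas as pd
from cleaning_enrollments_data import EnrollmentsCleaning
from completion_rate_data import Completion_rate_data

raw = pd.read_excel("data/Enrollments.xlsx")  # assumed path
clean = EnrollmentsCleaning(raw).Get_clean_data()
rates = Completion_rate_data(clean)

all_cohorts = rates.Get_completion_percentages()             # every cohort
one_cohort = rates.Get_completion_percentages('2024-01-15')  # assumed cohort date
print(rates.Get_pathways_name(all_cohorts))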