From 707cc391d1b9dbccbfc1750e43813ed46261610b Mon Sep 17 00:00:00 2001 From: Euclides Date: Sun, 10 Aug 2025 18:54:59 -0400 Subject: [PATCH] Added DocStrings to all the methods and functions --- cleaning_enrollments_data.py | 100 ++++++++++++++++++++++++----- completion_rate_data.py | 19 +++++- most_common_pathways_taken_data.py | 21 +++++- visualization_examples.ipynb | 26 +++----- 4 files changed, 129 insertions(+), 37 deletions(-) diff --git a/cleaning_enrollments_data.py b/cleaning_enrollments_data.py index 870303d..62fa63b 100644 --- a/cleaning_enrollments_data.py +++ b/cleaning_enrollments_data.py @@ -5,12 +5,31 @@ class EnrollmentsCleaning: def __init__(self, raw_data): self.raw_data = raw_data - def Drop_columns(self, df): + def __Drop_columns(self, df): + """ + Deletes the columns not needed for the analysis, + if you want to add columns to delete change the const variable 'COLUMNS_TO_DROP'. + + Args: + df: pandas.DataFrame + + Return: + pandas.DataFrame + """ COLUMNS_TO_DROP = ['Full Name'] result = df.drop(columns=COLUMNS_TO_DROP) return result - def Fix_nan_values(self, df): + def __Fix_nan_values(self, df): + """ + Gives values to NaN. + + Args: + df: pandas.DataFrame + + Return: + pandas.DataFrame + """ # Fix NaN values NAN_VALUE_SUBSTITUTE = 'NA' columns_to_fix = { @@ -23,22 +42,48 @@ def Fix_nan_values(self, df): return df - def Rename_values(self, df): - # Fix change name Data Analitics 2 to Data Analysis 2 for consistency + def __Rename_values(self, df): + """ + Changes values for consistency. + + Args: + df: pandas.DataFrame + + Return: + pandas.DataFrame + """ df.loc[df['Service'] == 'Data Analytics 2', 'Service'] = 'Data Analysis 2' return df - def Delete_values(self, df): - # Delete values not needed + def __Delete_values(self, df): + """ + Deletes values not needed, if you want to add values to delete change the const variable 'VALUES_NOT_NEEDED'. 
+ + + Args: + df: pandas.DataFrame + + Return: + pandas.DataFrame + """ # 'Referral to External Service', 'Supportive Services Referral', are deleted because dont have a "Projected Start Date" - values_not_needed = { + VALUES_NOT_NEEDED = { 'Service': ['Software Development 1', 'Software Development 2', 'Web Development 1', 'Web Development 2', 'Data Analysis 1','Data Analysis 2', 'Referral to External Service', 'Supportive Services Referral'] } - for column, value in values_not_needed.items(): + for column, value in VALUES_NOT_NEEDED.items(): df = df[~df[column].isin(value)] return df - def Set_data_types(self, df): + def __Set_data_types(self, df): + """ + Sets data type for each column. + + Args: + df: pandas.DataFrame + + Return: + pandas.DataFrame + """ # DataTypes column_datatype: dict = {'Auto Id': str, 'KY Region': str, 'Assessment ID': str, 'EnrollmentId': str, 'Enrollment Service Name': str, 'Service': str, 'Projected Start Date': str, @@ -50,7 +95,21 @@ def Set_data_types(self, df): df[column] = df[column].astype(type) return df - def Find_cohort(self, id: str, projected_start_date: str, cohort_to_find: str, df_to_clean: pd.DataFrame): + def __Find_cohort(self, id: str, projected_start_date: str, cohort_to_find: str, df_to_clean: pd.DataFrame): + """ + Finds values for each NaN of 'ATP Cohort' column. + This function was created with the idea of using pandas.DataFrame.apply(). + + + Args: + id: str + projected_start_date: str + cohort_to_find: str + df_to_clean: pandas.DataFrame + + Return: + numpy.array + """ ## Q: What to do with Service: ['Referral to External Service', 'Supportive Services Referral'] ## TODO: Clean the NaTType before this function runs if pd.isna(cohort_to_find): @@ -75,11 +134,20 @@ def Find_cohort(self, id: str, projected_start_date: str, cohort_to_find: str, d return np.datetime64(cohort_to_find) def Get_clean_data(self): + """ + Cleans the raw data. 
+ + Args: + None + + Return: + pandas.DataFrame + """ df = self.raw_data - df = self.Drop_columns(df) - df = self.Fix_nan_values(df) - df = self.Rename_values(df) - df = self.Delete_values(df) - df = self.Set_data_types(df) - df['ATP Cohort'] = df.apply(lambda row: self.Find_cohort(row['Auto Id'], row['Projected Start Date'], row['ATP Cohort'], df), axis=1) + df = self.__Drop_columns(df) + df = self.__Fix_nan_values(df) + df = self.__Rename_values(df) + df = self.__Delete_values(df) + df = self.__Set_data_types(df) + df['ATP Cohort'] = df.apply(lambda row: self.__Find_cohort(row['Auto Id'], row['Projected Start Date'], row['ATP Cohort'], df), axis=1) return df \ No newline at end of file diff --git a/completion_rate_data.py b/completion_rate_data.py index 48597d9..aeca043 100644 --- a/completion_rate_data.py +++ b/completion_rate_data.py @@ -28,8 +28,15 @@ def __init__(self, data): # Not the best Pandas way to do it: def Get_completion_percentages(self, cohort: str = 'All cohorts') -> pd.DataFrame: - + """ + Creates a pandas.DataFrame that contains the % of completion of each pathway. + + Args: + cohort: str + Return: + pandas.DataFrame + """ if cohort == 'All cohorts': data = self.data else: @@ -45,8 +52,16 @@ def Get_completion_percentages(self, cohort: str = 'All cohorts') -> pd.DataFram result_df['Pathway'] = result_df['Module'].apply(lambda x: x[:x.rfind(' ')]) # intended to be able to sort by pathway return result_df - # TODO: Add test def Get_pathways_name(self, df: pd.DataFrame) -> list: + """ + List of all the pathways in a pandas.DataFrame generated by self.Get_completion_percentages(). 
+ + Args: + df: pandas.DataFrame + + Return: + list + """ return list(df['Pathway'].unique()) diff --git a/most_common_pathways_taken_data.py b/most_common_pathways_taken_data.py index 845c59f..f43bb70 100644 --- a/most_common_pathways_taken_data.py +++ b/most_common_pathways_taken_data.py @@ -14,7 +14,7 @@ def __init__(self, data): def Get_starting_pathways(self): """ - Returns a pandas.DataFrame were all the services are the biginning paths + Returns a pandas.DataFrame where all the services are the beginning pathways. Args: df: pandas.DataFrame @@ -26,12 +26,31 @@ def Get_starting_pathways(self): return self.data[mask_starter_pathways] def Get_cohorts_list(self): + """ + List of cohorts from starting pathways. + + Args: + None + + Return: + list + """ df = self.starter_only_df cohorts = list(pd.to_datetime(df['ATP Cohort'][df['ATP Cohort'] != 'NA']).sort_values(ascending=True).astype(str).unique()) cohorts.insert(0, 'All cohorts') return cohorts def Get_data_by_cohort(self, cohort: str = 'All cohorts') -> pd.DataFrame: + """ + Returns a pandas.DataFrame for a specific cohort or all cohorts. 
+ + Args: + df: pandas.DataFrame + cohort: str + + Return: + pandas.DataFrame + """ df = self.starter_only_df if cohort == 'All cohorts': result = df.value_counts('Service').reset_index() diff --git a/visualization_examples.ipynb b/visualization_examples.ipynb index 8fa5ef8..c40b6af 100644 --- a/visualization_examples.ipynb +++ b/visualization_examples.ipynb @@ -20,17 +20,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "47cd23cd", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import plotly.express as px\n", - "import plotly.graph_objects as go\n", "from dash import Dash, dcc, html, Input, Output\n", "from most_common_pathways_taken_data import Most_common_pathways_taken_data\n", - "from compleation_rate_data import Compleation_rate_data\n", + "from completion_rate_data import Completion_rate_data\n", "from cleaning_enrollments_data import EnrollmentsCleaning" ] }, @@ -46,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "ba57e157", "metadata": {}, "outputs": [], @@ -64,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 13, "id": "fa1b6e02", "metadata": {}, "outputs": [ @@ -83,7 +82,7 @@ " " ], "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -95,7 +94,6 @@ " app = Dash(__name__)\n", " # Const\n", " data_class = Most_common_pathways_taken_data(cleaner.Get_clean_data())\n", - " starter_only_enrollments = data_class.Get_starting_pathways() # This function should be able to comunicate with the data without argument\n", "\n", " dropdown_options = data_class.Get_cohorts_list()\n", " pathway_color = {\n", @@ -149,7 +147,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "c0b7d44e", "metadata": {}, "outputs": [ @@ -168,7 +166,7 @@ " " ], "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -179,18 +177,10 @@ "def Dash_completion_rates_by_path() -> Dash: # TODO: fix data structure so visualization 
doesn't use df\n", " app2 = Dash(__name__)\n", " # Const\n", - " data_class = Compleation_rate_data(cleaner.Get_clean_data())\n", + " data_class = Completion_rate_data(cleaner.Get_clean_data())\n", " completion_df = data_class.Get_completion_percentages()\n", " options = data_class.Get_pathways_name(completion_df)\n", "\n", - " pathway_color = {\n", - " 'Software Development': 'green', \n", - " 'Web Development': 'blue', \n", - " 'Data Analysis': 'red',\n", - " 'Quality Assurance': 'yellow', \n", - " 'User Experience': 'purple'\n", - " }\n", - "\n", " # Display\n", " app2.layout = html.Div([\n", " html.H2('Pathways Completion', style={'text-align': \"center\"}),\n",