Code-You-Contributors · dmorton714 · Aug 5, 2025 · Jul 28, 2025 · Jul 28, 2025 · Jul 28, 2025
diff --git a/.gitignore b/.gitignore
@@ -212,3 +212,4 @@ data/WORC Employment.xlsx
 data/ARC Enrollments.xlsx
 data/ARC Application.xlsx
 data/All demographics and programs.xlsx
+data/WORC_Employment.xlsx
diff --git a/src/Carmen_WORCEmployment.py b/src/Carmen_WORCEmployment.py
@@ -1,17 +1,29 @@
 import pandas as pd
 
-file_path = "data/WORC_Employment.xlsx"
-worc = pd.read_excel(file_path)
+def load_and_clean(file_path="../../data/WORC_Employment.xlsx"):
+    """
+    Loads and cleans the WORC Employment dataset.
+
+    Parameter:
+        file_path (str): Path to the Excel file; a relative path is resolved against the current working directory.
 
-# removed auto id as we may need it later
-cols_to_drop = ['Employment History Name']
+    Returns:
+        pd.DataFrame: Cleaned DataFrame.
+    """
+    # Load data
+    worc = pd.read_excel(file_path)
 
-worc_cols_dropped = worc.drop(columns=cols_to_drop, axis=1)
+    # Drop columns we don't need
+    cols_to_drop = ['Employment History Name']
+    worc_cols_dropped = worc.drop(columns=cols_to_drop, axis=1)
 
-# Why did we decide to drop all nulls?
-# This can be dangerous if we have a lot of nulls
-# Also it removed the entire row if any column had a null value
-# will this cause issues later?
-worc_cols_dropped_nulls = worc_cols_dropped.dropna()
+    # Clean up data types
+    worc_cols_dropped['Start Date'] = pd.to_datetime(worc_cols_dropped['Start Date'])
+    worc_cols_dropped['Salary'] = pd.to_numeric(worc_cols_dropped['Salary'], errors='coerce')
 
-worc_cleaned = worc_cols_dropped_nulls
+    # Salaries recorded as 60,000 are replaced with 28.84 for consistency with the other salary values
+    # Computed as 60,000 / 2080 hrs = 28.846..., recorded here as 28.84
+    worc_cols_dropped['Salary'] = worc_cols_dropped['Salary'].replace(60000, 28.84)
+
+    worc_clean = worc_cols_dropped
+    return worc_clean
diff --git a/src/notebooks/Carmen_WORCEmployment_Plots.py b/src/notebooks/Carmen_WORCEmployment_Plots.py
@@ -0,0 +1,59 @@
+import sys
+import os
+
+# Add the parent of the current working directory to sys.path (this is src when run from src/notebooks)
+sys.path.append(os.path.abspath(".."))
+
+from Carmen_WORCEmployment import load_and_clean
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+
+def plot_salary_by_gender(data):
+    plt.figure(figsize=(8, 5))
+    sns.boxplot(data=data, x='Gender', y='Salary')
+    plt.title("Salary Distribution by Gender")
+    plt.show()
+
+
+def plot_avg_salary_by_city(data):
+    region_salary = data.groupby('Mailing City')['Salary'].mean().sort_values()
+    region_salary.plot(kind='barh', figsize=(8, 5), title="Average Salary by KY Region")
+    plt.xlabel("Average Salary")
+    plt.show()
+
+
+def plot_placements_over_time(data):
+    data.set_index('Start Date').resample('M').size().plot(kind='line', marker='o', figsize=(10, 4))
+    plt.title("Number of Placements Over Time")
+    plt.ylabel("Placements")
+    plt.show()
+
+
+def plot_placement_type_by_program(data):
+    plt.figure(figsize=(10, 6))
+    sns.countplot(data=data, x='ATP Placement Type', hue='Program: Program Name')
+    plt.xticks(rotation=45)
+    plt.title("Placement Type by Program")
+    plt.show()
+
+
+def plot_top_cities(data):
+    city_counts = data['Mailing City'].value_counts().head(10)
+    city_counts.plot(kind='bar', title='Top Cities by Participant Count', figsize=(8, 4))
+    plt.ylabel("Count")
+    plt.show()
+
+
+def main():
+    worc_clean = load_and_clean()
+
+    plot_salary_by_gender(worc_clean)
+    plot_avg_salary_by_city(worc_clean)
+    plot_placements_over_time(worc_clean)
+    plot_placement_type_by_program(worc_clean)
+    plot_top_cities(worc_clean)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/notebooks/worc_cleaning.ipynb b/src/notebooks/worc_cleaning.ipynb
@@ -32,24 +32,33 @@
    "outputs": [],
    "source": [
     "import pandas as pd\n",
-    "import os\n"
+    "\n"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {},
    "outputs": [],
+   "source": [
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "# Read in file\n",
     "\n",
-    "file_path = \"WORC_Employment.xlsx\"\n",
+    "file_path = \"../../data/WORC_Employment.xlsx\"\n",
     "\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -59,7 +68,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -238,7 +247,7 @@
        "4   25.48  Female  White      SOAR  "
       ]
      },
-     "execution_count": 4,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -250,7 +259,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -259,7 +268,7 @@
        "(25, 16)"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -280,7 +289,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -303,7 +312,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -324,7 +333,7 @@
        "       'First ATP Placement - Already in Tech'], dtype=object)"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -337,7 +346,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -346,7 +355,7 @@
        "np.True_"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -358,7 +367,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -383,7 +392,7 @@
        "dtype: int64"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -404,7 +413,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -416,34 +425,34 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
-    "worc_cols_dropped_nulls = worc_cols_dropped.dropna()"
+    "# worc_cols_dropped_nulls = worc_cols_dropped.dropna()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
-    "worc_cleaned = worc_cols_dropped_nulls"
+    "worc_cleaned = worc_cols_dropped"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "(22, 14)"
+       "(25, 14)"
       ]
      },
-     "execution_count": 13,
+     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -454,7 +463,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
@@ -621,14 +630,33 @@
        "4   25.48  Female  White      SOAR  "
       ]
      },
-     "execution_count": 14,
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "worc_cleaned.head()"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "worc_cleaned.to_excel(\"worc_employment.xlsx\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "worc_cleaned['Start Date'] = pd.to_datetime(worc_cleaned['Start Date'])\n",
+    "worc_cleaned['Salary'] = pd.to_numeric(worc_cleaned['Salary'], errors='coerce')\n"
+   ]
   }
  ],
  "metadata": {

diff --git a/src/notebooks/worc_employment.xlsx b/src/notebooks/worc_employment.xlsx
diff --git a/src/notebooks/worc_employment_clean.xlsx b/src/notebooks/worc_employment_clean.xlsx