diff --git a/data/Carmen_WORCEmployment.py b/data/Carmen_WORCEmployment.py
new file mode 100644
index 0000000..92f9cb3
--- /dev/null
+++ b/data/Carmen_WORCEmployment.py
@@ -0,0 +1,13 @@
+import pandas as pd
+import os
+
+file_path = "data/WORC_Employment.xlsx"  # relative path, resolved against the working directory
+worc = pd.read_excel(file_path)
+# Columns excluded from the cleaned data.
+cols_to_drop = ['Auto Id','Employment History Name']
+# `columns=` already names the axis, so the redundant `axis=1` is not passed.
+worc_cols_dropped = worc.drop(columns=cols_to_drop)
+# dropna() with its defaults removes every row that has at least one missing value.
+worc_cols_dropped_nulls = worc_cols_dropped.dropna()
+# Final name for the cleaned frame; this is an alias, not a copy.
+worc_cleaned = worc_cols_dropped_nulls
\ No newline at end of file
diff --git a/data/WORC Employment.xlsx b/data/WORC Employment.xlsx
deleted file mode 100644
index 1e5f800..0000000
Binary files a/data/WORC Employment.xlsx and /dev/null differ
diff --git a/data/WORC_Employment.xlsx b/data/WORC_Employment.xlsx
new file mode 100644
index 0000000..6afd7d5
Binary files /dev/null and b/data/WORC_Employment.xlsx differ
diff --git a/data/worc_cleaning.ipynb b/data/worc_cleaning.ipynb
new file mode 100644
index 0000000..f71dd2e
--- /dev/null
+++ b/data/worc_cleaning.ipynb
@@ -0,0 +1,655 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Best Practice\n",
+ "\n",
+ "Be sure to set up a virtual environment, as this project has a lot of imports.\n",
+ "| Command | Linux/Mac | GitBash |\n",
+ "| ------- | --------- | ------- |\n",
+ "| Create | python3 -m venv venv | python -m venv venv |\n",
+ "| Activate | source venv/bin/activate | source venv/Scripts/activate |\n",
+ "| Install | pip install -r requirements.txt or pip install packages | pip install -r requirements.txt or pip install packages|\n",
+ "| Deactivate | deactivate | deactivate |"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## WORC Employment\n",
+ "\n",
+ "1. Import libraries\n",
+ "2. Read in the data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import os\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Path to the Excel file to read in\n",
+ "\n",
+ "file_path = \"WORC_Employment.xlsx\"\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# read in file\n",
+ "worc = pd.read_excel(file_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Auto Id | \n",
+ " Full Name | \n",
+ " Email | \n",
+ " EnrollmentId | \n",
+ " Employment History Name | \n",
+ " Company Name | \n",
+ " Job Title | \n",
+ " Start Date | \n",
+ " Program: Program Name | \n",
+ " Mailing City | \n",
+ " Mailing Zip/Postal Code | \n",
+ " ATP Placement Type | \n",
+ " Salary | \n",
+ " Gender | \n",
+ " Race | \n",
+ " KY Region | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 202203-7853 | \n",
+ " name name | \n",
+ " name@gmail.com | \n",
+ " Enrollment-6442 | \n",
+ " EH-001676 | \n",
+ " Appalachian Regional Healthcare | \n",
+ " Network Coordinator | \n",
+ " 2023-10-09 | \n",
+ " Code Kentucky 22-23 | \n",
+ " Lost Creek | \n",
+ " 41348 | \n",
+ " First ATP Placement - New to Tech | \n",
+ " 16.00 | \n",
+ " Male | \n",
+ " White | \n",
+ " SOAR | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 202207-8826 | \n",
+ " name name | \n",
+ " name@gmail.com | \n",
+ " Enrollment-6188 | \n",
+ " EH-001824 | \n",
+ " MCHC - Mountain Comprehensive Health Corporation | \n",
+ " Junior IT systems administrator | \n",
+ " 2024-02-12 | \n",
+ " Code Kentucky 22-23 | \n",
+ " Greys Knob | \n",
+ " 40808 | \n",
+ " First ATP Placement - New to Tech | \n",
+ " 18.00 | \n",
+ " Female | \n",
+ " White | \n",
+ " SOAR | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 202306-12150 | \n",
+ " name name | \n",
+ " name@gmail.com | \n",
+ " Enrollment-7740 | \n",
+ " EH-002555 | \n",
+ " University of Kentucky | \n",
+ " Technical Support Specialist II | \n",
+ " 2024-04-01 | \n",
+ " Code Kentucky 23-24 | \n",
+ " Richmond | \n",
+ " 40475 | \n",
+ " First ATP Placement - Promotion | \n",
+ " 25.00 | \n",
+ " Male | \n",
+ " White | \n",
+ " SOAR | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 202207-9034 | \n",
+ " name name | \n",
+ " name@gmail.com | \n",
+ " Enrollment-6146 | \n",
+ " EH-002207 | \n",
+ " Childers oil company | \n",
+ " Web developer | \n",
+ " 2024-04-23 | \n",
+ " Code Kentucky 22-23 | \n",
+ " Hazard | \n",
+ " 41701 | \n",
+ " First ATP Placement - New to Tech | \n",
+ " 26.92 | \n",
+ " Male | \n",
+ " White | \n",
+ " SOAR | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 202306-12149 | \n",
+ " name name | \n",
+ " name@gmail.com | \n",
+ " Enrollment-7701 | \n",
+ " EH-002294 | \n",
+ " Code:You | \n",
+ " Student Community Coordinator | \n",
+ " 2024-05-20 | \n",
+ " Code Kentucky 23-24 | \n",
+ " Eubank | \n",
+ " 42567 | \n",
+ " First ATP Placement - New to Tech | \n",
+ " 25.48 | \n",
+ " Female | \n",
+ " White | \n",
+ " SOAR | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Auto Id Full Name Email EnrollmentId \\\n",
+ "0 202203-7853 name name name@gmail.com Enrollment-6442 \n",
+ "1 202207-8826 name name name@gmail.com Enrollment-6188 \n",
+ "2 202306-12150 name name name@gmail.com Enrollment-7740 \n",
+ "3 202207-9034 name name name@gmail.com Enrollment-6146 \n",
+ "4 202306-12149 name name name@gmail.com Enrollment-7701 \n",
+ "\n",
+ " Employment History Name Company Name \\\n",
+ "0 EH-001676 Appalachian Regional Healthcare \n",
+ "1 EH-001824 MCHC - Mountain Comprehensive Health Corporation \n",
+ "2 EH-002555 University of Kentucky \n",
+ "3 EH-002207 Childers oil company \n",
+ "4 EH-002294 Code:You \n",
+ "\n",
+ " Job Title Start Date Program: Program Name \\\n",
+ "0 Network Coordinator 2023-10-09 Code Kentucky 22-23 \n",
+ "1 Junior IT systems administrator 2024-02-12 Code Kentucky 22-23 \n",
+ "2 Technical Support Specialist II 2024-04-01 Code Kentucky 23-24 \n",
+ "3 Web developer 2024-04-23 Code Kentucky 22-23 \n",
+ "4 Student Community Coordinator 2024-05-20 Code Kentucky 23-24 \n",
+ "\n",
+ " Mailing City Mailing Zip/Postal Code ATP Placement Type \\\n",
+ "0 Lost Creek 41348 First ATP Placement - New to Tech \n",
+ "1 Greys Knob 40808 First ATP Placement - New to Tech \n",
+ "2 Richmond 40475 First ATP Placement - Promotion \n",
+ "3 Hazard 41701 First ATP Placement - New to Tech \n",
+ "4 Eubank 42567 First ATP Placement - New to Tech \n",
+ "\n",
+ " Salary Gender Race KY Region \n",
+ "0 16.00 Male White SOAR \n",
+ "1 18.00 Female White SOAR \n",
+ "2 25.00 Male White SOAR \n",
+ "3 26.92 Male White SOAR \n",
+ "4 25.48 Female White SOAR "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# review top 5\n",
+ "worc.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(25, 16)"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "worc.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Review of data \n",
+ "\n",
+ "1. Check the unique values in the Full Name, Email, and ATP Placement Type columns to determine whether they are necessary to keep\n",
+ "2. Check for null values - remove"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def unique(df, column='column'):\n",
+ " \"\"\"\n",
+ " Print and return the unique values in a specified DataFrame column.\n",
+ "\n",
+ " Parameters:\n",
+ " df: The DataFrame that contains the data.\n",
+ " column: The name of the column to check for uniqueness.\n",
+ "\n",
+ " Returns:\n",
+ " numpy.ndarray: An array of unique values in the specified column.\n",
+ " \"\"\"\n",
+ " unique_values = df[column].unique()\n",
+ " print(f\"Unique values in '{column}': {unique_values}\")\n",
+ " \n",
+ " return unique_values\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Unique values in 'Full Name': ['name name']\n",
+ "Unique values in 'Email': ['name@gmail.com']\n",
+ "Unique values in 'ATP Placement Type': ['First ATP Placement - New to Tech' 'First ATP Placement - Promotion'\n",
+ " 'First ATP Placement - Already in Tech']\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "array(['First ATP Placement - New to Tech',\n",
+ " 'First ATP Placement - Promotion',\n",
+ " 'First ATP Placement - Already in Tech'], dtype=object)"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "unique(worc,'Full Name')\n",
+ "unique(worc, 'Email')\n",
+ "unique(worc, 'ATP Placement Type')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "np.True_"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Check for isnull\n",
+ "worc.isnull().any().any()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Auto Id 0\n",
+ "Full Name 0\n",
+ "Email 0\n",
+ "EnrollmentId 0\n",
+ "Employment History Name 0\n",
+ "Company Name 0\n",
+ "Job Title 0\n",
+ "Start Date 0\n",
+ "Program: Program Name 0\n",
+ "Mailing City 0\n",
+ "Mailing Zip/Postal Code 0\n",
+ "ATP Placement Type 0\n",
+ "Salary 3\n",
+ "Gender 0\n",
+ "Race 0\n",
+ "KY Region 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Checking where isnull values are located\n",
+ "worc.isnull().sum()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Which Columns to Drop\n",
+ "- Even though Full Name and Email each contain only a single repeated value, those columns will be kept as requested\n",
+ "- Rows with NA/null values will be dropped as well"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Dropping the columns that seem unnecessary (Full Name and Email are kept as requested)\n",
+ "cols_to_drop = ['Auto Id','Employment History Name']\n",
+ "\n",
+ "worc_cols_dropped = worc.drop(columns=cols_to_drop, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "worc_cols_dropped_nulls = worc_cols_dropped.dropna()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "worc_cleaned = worc_cols_dropped_nulls"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(22, 14)"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "worc_cleaned.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Full Name | \n",
+ " Email | \n",
+ " EnrollmentId | \n",
+ " Company Name | \n",
+ " Job Title | \n",
+ " Start Date | \n",
+ " Program: Program Name | \n",
+ " Mailing City | \n",
+ " Mailing Zip/Postal Code | \n",
+ " ATP Placement Type | \n",
+ " Salary | \n",
+ " Gender | \n",
+ " Race | \n",
+ " KY Region | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " name name | \n",
+ " name@gmail.com | \n",
+ " Enrollment-6442 | \n",
+ " Appalachian Regional Healthcare | \n",
+ " Network Coordinator | \n",
+ " 2023-10-09 | \n",
+ " Code Kentucky 22-23 | \n",
+ " Lost Creek | \n",
+ " 41348 | \n",
+ " First ATP Placement - New to Tech | \n",
+ " 16.00 | \n",
+ " Male | \n",
+ " White | \n",
+ " SOAR | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " name name | \n",
+ " name@gmail.com | \n",
+ " Enrollment-6188 | \n",
+ " MCHC - Mountain Comprehensive Health Corporation | \n",
+ " Junior IT systems administrator | \n",
+ " 2024-02-12 | \n",
+ " Code Kentucky 22-23 | \n",
+ " Greys Knob | \n",
+ " 40808 | \n",
+ " First ATP Placement - New to Tech | \n",
+ " 18.00 | \n",
+ " Female | \n",
+ " White | \n",
+ " SOAR | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " name name | \n",
+ " name@gmail.com | \n",
+ " Enrollment-7740 | \n",
+ " University of Kentucky | \n",
+ " Technical Support Specialist II | \n",
+ " 2024-04-01 | \n",
+ " Code Kentucky 23-24 | \n",
+ " Richmond | \n",
+ " 40475 | \n",
+ " First ATP Placement - Promotion | \n",
+ " 25.00 | \n",
+ " Male | \n",
+ " White | \n",
+ " SOAR | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " name name | \n",
+ " name@gmail.com | \n",
+ " Enrollment-6146 | \n",
+ " Childers oil company | \n",
+ " Web developer | \n",
+ " 2024-04-23 | \n",
+ " Code Kentucky 22-23 | \n",
+ " Hazard | \n",
+ " 41701 | \n",
+ " First ATP Placement - New to Tech | \n",
+ " 26.92 | \n",
+ " Male | \n",
+ " White | \n",
+ " SOAR | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " name name | \n",
+ " name@gmail.com | \n",
+ " Enrollment-7701 | \n",
+ " Code:You | \n",
+ " Student Community Coordinator | \n",
+ " 2024-05-20 | \n",
+ " Code Kentucky 23-24 | \n",
+ " Eubank | \n",
+ " 42567 | \n",
+ " First ATP Placement - New to Tech | \n",
+ " 25.48 | \n",
+ " Female | \n",
+ " White | \n",
+ " SOAR | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Full Name Email EnrollmentId \\\n",
+ "0 name name name@gmail.com Enrollment-6442 \n",
+ "1 name name name@gmail.com Enrollment-6188 \n",
+ "2 name name name@gmail.com Enrollment-7740 \n",
+ "3 name name name@gmail.com Enrollment-6146 \n",
+ "4 name name name@gmail.com Enrollment-7701 \n",
+ "\n",
+ " Company Name \\\n",
+ "0 Appalachian Regional Healthcare \n",
+ "1 MCHC - Mountain Comprehensive Health Corporation \n",
+ "2 University of Kentucky \n",
+ "3 Childers oil company \n",
+ "4 Code:You \n",
+ "\n",
+ " Job Title Start Date Program: Program Name \\\n",
+ "0 Network Coordinator 2023-10-09 Code Kentucky 22-23 \n",
+ "1 Junior IT systems administrator 2024-02-12 Code Kentucky 22-23 \n",
+ "2 Technical Support Specialist II 2024-04-01 Code Kentucky 23-24 \n",
+ "3 Web developer 2024-04-23 Code Kentucky 22-23 \n",
+ "4 Student Community Coordinator 2024-05-20 Code Kentucky 23-24 \n",
+ "\n",
+ " Mailing City Mailing Zip/Postal Code ATP Placement Type \\\n",
+ "0 Lost Creek 41348 First ATP Placement - New to Tech \n",
+ "1 Greys Knob 40808 First ATP Placement - New to Tech \n",
+ "2 Richmond 40475 First ATP Placement - Promotion \n",
+ "3 Hazard 41701 First ATP Placement - New to Tech \n",
+ "4 Eubank 42567 First ATP Placement - New to Tech \n",
+ "\n",
+ " Salary Gender Race KY Region \n",
+ "0 16.00 Male White SOAR \n",
+ "1 18.00 Female White SOAR \n",
+ "2 25.00 Male White SOAR \n",
+ "3 26.92 Male White SOAR \n",
+ "4 25.48 Female White SOAR "
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "worc_cleaned.head()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}