diff --git a/data/Carmen_WORCEmployment.py b/data/Carmen_WORCEmployment.py new file mode 100644 index 0000000..92f9cb3 --- /dev/null +++ b/data/Carmen_WORCEmployment.py @@ -0,0 +1,13 @@ +import pandas as pd +import os + +file_path = "data/WORC_Employment.xlsx" +worc = pd.read_excel(file_path) + +cols_to_drop = ['Auto Id','Employment History Name'] + +worc_cols_dropped = worc.drop(columns=cols_to_drop, axis=1) + +worc_cols_dropped_nulls = worc_cols_dropped.dropna() + +worc_cleaned = worc_cols_dropped_nulls \ No newline at end of file diff --git a/data/WORC Employment.xlsx b/data/WORC Employment.xlsx deleted file mode 100644 index 1e5f800..0000000 Binary files a/data/WORC Employment.xlsx and /dev/null differ diff --git a/data/WORC_Employment.xlsx b/data/WORC_Employment.xlsx new file mode 100644 index 0000000..6afd7d5 Binary files /dev/null and b/data/WORC_Employment.xlsx differ diff --git a/data/worc_cleaning.ipynb b/data/worc_cleaning.ipynb new file mode 100644 index 0000000..f71dd2e --- /dev/null +++ b/data/worc_cleaning.ipynb @@ -0,0 +1,655 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Best Practice\n", + "\n", + "Be sure to set up a virtual environment as there were lots of imports in this project\n", + "| Command | Linux/Mac | GitBash |\n", + "| ------- | --------- | ------- |\n", + "| Create | python3 -m venv venv | python -m venv venv |\n", + "| Activate | source venv/bin/activate | source venv/Scripts/activate |\n", + "| Install | pip install -r requirements.txt or pip install packages | pip install -r requirements.txt or pip install packages|\n", + "| Deactivate | deactivate | deactivate |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## WORC Employment\n", + "\n", + "1. Import libraries\n", + "2. Read in the data" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import os\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Read in file\n", + "\n", + "file_path = \"WORC_Employment.xlsx\"\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# read in file\n", + "worc = pd.read_excel(file_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Auto IdFull NameEmailEnrollmentIdEmployment History NameCompany NameJob TitleStart DateProgram: Program NameMailing CityMailing Zip/Postal CodeATP Placement TypeSalaryGenderRaceKY Region
0202203-7853name namename@gmail.comEnrollment-6442EH-001676Appalachian Regional HealthcareNetwork Coordinator2023-10-09Code Kentucky 22-23Lost Creek41348First ATP Placement - New to Tech16.00MaleWhiteSOAR
1202207-8826name namename@gmail.comEnrollment-6188EH-001824MCHC - Mountain Comprehensive Health CorporationJunior IT systems administrator2024-02-12Code Kentucky 22-23Greys Knob40808First ATP Placement - New to Tech18.00FemaleWhiteSOAR
2202306-12150name namename@gmail.comEnrollment-7740EH-002555University of KentuckyTechnical Support Specialist II2024-04-01Code Kentucky 23-24Richmond40475First ATP Placement - Promotion25.00MaleWhiteSOAR
3202207-9034name namename@gmail.comEnrollment-6146EH-002207Childers oil companyWeb developer2024-04-23Code Kentucky 22-23Hazard41701First ATP Placement - New to Tech26.92MaleWhiteSOAR
4202306-12149name namename@gmail.comEnrollment-7701EH-002294Code:YouStudent Community Coordinator2024-05-20Code Kentucky 23-24Eubank42567First ATP Placement - New to Tech25.48FemaleWhiteSOAR
\n", + "
" + ], + "text/plain": [ + " Auto Id Full Name Email EnrollmentId \\\n", + "0 202203-7853 name name name@gmail.com Enrollment-6442 \n", + "1 202207-8826 name name name@gmail.com Enrollment-6188 \n", + "2 202306-12150 name name name@gmail.com Enrollment-7740 \n", + "3 202207-9034 name name name@gmail.com Enrollment-6146 \n", + "4 202306-12149 name name name@gmail.com Enrollment-7701 \n", + "\n", + " Employment History Name Company Name \\\n", + "0 EH-001676 Appalachian Regional Healthcare \n", + "1 EH-001824 MCHC - Mountain Comprehensive Health Corporation \n", + "2 EH-002555 University of Kentucky \n", + "3 EH-002207 Childers oil company \n", + "4 EH-002294 Code:You \n", + "\n", + " Job Title Start Date Program: Program Name \\\n", + "0 Network Coordinator 2023-10-09 Code Kentucky 22-23 \n", + "1 Junior IT systems administrator 2024-02-12 Code Kentucky 22-23 \n", + "2 Technical Support Specialist II 2024-04-01 Code Kentucky 23-24 \n", + "3 Web developer 2024-04-23 Code Kentucky 22-23 \n", + "4 Student Community Coordinator 2024-05-20 Code Kentucky 23-24 \n", + "\n", + " Mailing City Mailing Zip/Postal Code ATP Placement Type \\\n", + "0 Lost Creek 41348 First ATP Placement - New to Tech \n", + "1 Greys Knob 40808 First ATP Placement - New to Tech \n", + "2 Richmond 40475 First ATP Placement - Promotion \n", + "3 Hazard 41701 First ATP Placement - New to Tech \n", + "4 Eubank 42567 First ATP Placement - New to Tech \n", + "\n", + " Salary Gender Race KY Region \n", + "0 16.00 Male White SOAR \n", + "1 18.00 Female White SOAR \n", + "2 25.00 Male White SOAR \n", + "3 26.92 Male White SOAR \n", + "4 25.48 Female White SOAR " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# review top 5\n", + "worc.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(25, 16)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "worc.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Review of data \n", + "\n", + "1. Check for unique values in name, email, ATP Placement Type column - to determine if necessary to keep\n", + "2. Check for null values - remove" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def unique(df, column='column'):\n", + " \"\"\"\n", + " Check for unique values in a specified DataFrame column.\n", + "\n", + " Parameters:\n", + " df: The DataFrame that contains the data.\n", + " column: The name of the column to check for uniqueness.\n", + "\n", + " Returns:\n", + " numpy.ndarray: An array of unique values in the specified column.\n", + " \"\"\"\n", + " unique_values = df[column].unique()\n", + " print(f\"Unique values in '{column}': {unique_values}\")\n", + " \n", + " return unique_values\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Unique values in 'Full Name': ['name name']\n", + "Unique values in 'Email': ['name@gmail.com']\n", + "Unique values in 'ATP Placement Type': ['First ATP Placement - New to Tech' 'First ATP Placement - Promotion'\n", + " 'First ATP Placement - Already in Tech']\n" + ] + }, + { + "data": { + "text/plain": [ + "array(['First ATP Placement - New to Tech',\n", + " 'First ATP Placement - Promotion',\n", + " 'First ATP Placement - Already in Tech'], dtype=object)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unique(worc,'Full Name')\n", + "unique(worc, 'Email')\n", + "unique(worc, 'ATP Placement Type')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "np.True_" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check for isnull\n", + "worc.isnull().any().any()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Auto Id 0\n", + "Full Name 0\n", + "Email 0\n", + "EnrollmentId 0\n", + "Employment History Name 0\n", + "Company Name 0\n", + "Job Title 0\n", + "Start Date 0\n", + "Program: Program Name 0\n", + "Mailing City 0\n", + "Mailing Zip/Postal Code 0\n", + "ATP Placement Type 0\n", + "Salary 3\n", + "Gender 0\n", + "Race 0\n", + "KY Region 0\n", + "dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Checking where isnull values are located\n", + "worc.isnull().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Which Columns to Drop\n", + "- Even though there are no unique values for name and email will keep those items as requested\n", + "- Will drop na/null as well" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Dropping multiple columns based including those with no unique values as well as those that seem unnecessary\n", + "cols_to_drop = ['Auto Id','Employment History Name']\n", + "\n", + "worc_cols_dropped = worc.drop(columns=cols_to_drop, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "worc_cols_dropped_nulls = worc_cols_dropped.dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "worc_cleaned = worc_cols_dropped_nulls" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(22, 14)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "worc_cleaned.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Full NameEmailEnrollmentIdCompany NameJob TitleStart DateProgram: Program NameMailing CityMailing Zip/Postal CodeATP Placement TypeSalaryGenderRaceKY Region
0name namename@gmail.comEnrollment-6442Appalachian Regional HealthcareNetwork Coordinator2023-10-09Code Kentucky 22-23Lost Creek41348First ATP Placement - New to Tech16.00MaleWhiteSOAR
1name namename@gmail.comEnrollment-6188MCHC - Mountain Comprehensive Health CorporationJunior IT systems administrator2024-02-12Code Kentucky 22-23Greys Knob40808First ATP Placement - New to Tech18.00FemaleWhiteSOAR
2name namename@gmail.comEnrollment-7740University of KentuckyTechnical Support Specialist II2024-04-01Code Kentucky 23-24Richmond40475First ATP Placement - Promotion25.00MaleWhiteSOAR
3name namename@gmail.comEnrollment-6146Childers oil companyWeb developer2024-04-23Code Kentucky 22-23Hazard41701First ATP Placement - New to Tech26.92MaleWhiteSOAR
4name namename@gmail.comEnrollment-7701Code:YouStudent Community Coordinator2024-05-20Code Kentucky 23-24Eubank42567First ATP Placement - New to Tech25.48FemaleWhiteSOAR
\n", + "
" + ], + "text/plain": [ + " Full Name Email EnrollmentId \\\n", + "0 name name name@gmail.com Enrollment-6442 \n", + "1 name name name@gmail.com Enrollment-6188 \n", + "2 name name name@gmail.com Enrollment-7740 \n", + "3 name name name@gmail.com Enrollment-6146 \n", + "4 name name name@gmail.com Enrollment-7701 \n", + "\n", + " Company Name \\\n", + "0 Appalachian Regional Healthcare \n", + "1 MCHC - Mountain Comprehensive Health Corporation \n", + "2 University of Kentucky \n", + "3 Childers oil company \n", + "4 Code:You \n", + "\n", + " Job Title Start Date Program: Program Name \\\n", + "0 Network Coordinator 2023-10-09 Code Kentucky 22-23 \n", + "1 Junior IT systems administrator 2024-02-12 Code Kentucky 22-23 \n", + "2 Technical Support Specialist II 2024-04-01 Code Kentucky 23-24 \n", + "3 Web developer 2024-04-23 Code Kentucky 22-23 \n", + "4 Student Community Coordinator 2024-05-20 Code Kentucky 23-24 \n", + "\n", + " Mailing City Mailing Zip/Postal Code ATP Placement Type \\\n", + "0 Lost Creek 41348 First ATP Placement - New to Tech \n", + "1 Greys Knob 40808 First ATP Placement - New to Tech \n", + "2 Richmond 40475 First ATP Placement - Promotion \n", + "3 Hazard 41701 First ATP Placement - New to Tech \n", + "4 Eubank 42567 First ATP Placement - New to Tech \n", + "\n", + " Salary Gender Race KY Region \n", + "0 16.00 Male White SOAR \n", + "1 18.00 Female White SOAR \n", + "2 25.00 Male White SOAR \n", + "3 26.92 Male White SOAR \n", + "4 25.48 Female White SOAR " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "worc_cleaned.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}