diff --git a/Assignment/Assignment2 pd.ipynb b/Assignment/Assignment2 pd.ipynb new file mode 100644 index 0000000..2bb9e5b --- /dev/null +++ b/Assignment/Assignment2 pd.ipynb @@ -0,0 +1,1206 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "#%matplotlib notebook\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "import the dataset into a dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdEmployeeNameJobTitleBasePayOvertimePayOtherPayBenefitsTotalPayTotalPayBenefitsYearNotesAgencyStatus
01NATHANIEL FORDGENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY167411.180.00400184.25NaN567595.43567595.432011NaNSan FranciscoNaN
12GARY JIMENEZCAPTAIN III (POLICE DEPARTMENT)155966.02245131.88137811.38NaN538909.28538909.282011NaNSan FranciscoNaN
23ALBERT PARDINICAPTAIN III (POLICE DEPARTMENT)212739.13106088.1816452.60NaN335279.91335279.912011NaNSan FranciscoNaN
34CHRISTOPHER CHONGWIRE ROPE CABLE MAINTENANCE MECHANIC77916.0056120.71198306.90NaN332343.61332343.612011NaNSan FranciscoNaN
45PATRICK GARDNERDEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)134401.609737.00182234.59NaN326373.19326373.192011NaNSan FranciscoNaN
..........................................
148649148650Roy I TilleryCustodian0.000.000.000.00.000.002014NaNSan FranciscoNaN
148650148651Not providedNot providedNaNNaNNaNNaN0.000.002014NaNSan FranciscoNaN
148651148652Not providedNot providedNaNNaNNaNNaN0.000.002014NaNSan FranciscoNaN
148652148653Not providedNot providedNaNNaNNaNNaN0.000.002014NaNSan FranciscoNaN
148653148654Joe LopezCounselor, Log Cabin Ranch0.000.00-618.130.0-618.13-618.132014NaNSan FranciscoNaN
\n", + "

148654 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " Id EmployeeName \\\n", + "0 1 NATHANIEL FORD \n", + "1 2 GARY JIMENEZ \n", + "2 3 ALBERT PARDINI \n", + "3 4 CHRISTOPHER CHONG \n", + "4 5 PATRICK GARDNER \n", + "... ... ... \n", + "148649 148650 Roy I Tillery \n", + "148650 148651 Not provided \n", + "148651 148652 Not provided \n", + "148652 148653 Not provided \n", + "148653 148654 Joe Lopez \n", + "\n", + " JobTitle BasePay \\\n", + "0 GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY 167411.18 \n", + "1 CAPTAIN III (POLICE DEPARTMENT) 155966.02 \n", + "2 CAPTAIN III (POLICE DEPARTMENT) 212739.13 \n", + "3 WIRE ROPE CABLE MAINTENANCE MECHANIC 77916.00 \n", + "4 DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT) 134401.60 \n", + "... ... ... \n", + "148649 Custodian 0.00 \n", + "148650 Not provided NaN \n", + "148651 Not provided NaN \n", + "148652 Not provided NaN \n", + "148653 Counselor, Log Cabin Ranch 0.00 \n", + "\n", + " OvertimePay OtherPay Benefits TotalPay TotalPayBenefits Year \\\n", + "0 0.00 400184.25 NaN 567595.43 567595.43 2011 \n", + "1 245131.88 137811.38 NaN 538909.28 538909.28 2011 \n", + "2 106088.18 16452.60 NaN 335279.91 335279.91 2011 \n", + "3 56120.71 198306.90 NaN 332343.61 332343.61 2011 \n", + "4 9737.00 182234.59 NaN 326373.19 326373.19 2011 \n", + "... ... ... ... ... ... ... \n", + "148649 0.00 0.00 0.0 0.00 0.00 2014 \n", + "148650 NaN NaN NaN 0.00 0.00 2014 \n", + "148651 NaN NaN NaN 0.00 0.00 2014 \n", + "148652 NaN NaN NaN 0.00 0.00 2014 \n", + "148653 0.00 -618.13 0.0 -618.13 -618.13 2014 \n", + "\n", + " Notes Agency Status \n", + "0 NaN San Francisco NaN \n", + "1 NaN San Francisco NaN \n", + "2 NaN San Francisco NaN \n", + "3 NaN San Francisco NaN \n", + "4 NaN San Francisco NaN \n", + "... ... ... ... 
\n", + "148649 NaN San Francisco NaN \n", + "148650 NaN San Francisco NaN \n", + "148651 NaN San Francisco NaN \n", + "148652 NaN San Francisco NaN \n", + "148653 NaN San Francisco NaN \n", + "\n", + "[148654 rows x 13 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"Salary.csv\")\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "display the column names" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Empty DataFrame\n", + "Columns: [Id, EmployeeName, JobTitle, BasePay, OvertimePay, OtherPay, Benefits, TotalPay, TotalPayBenefits, Year, Notes, Agency, Status]\n", + "Index: []\n" + ] + }, + { + "data": { + "text/plain": [ + "Index(['Id', 'EmployeeName', 'JobTitle', 'BasePay', 'OvertimePay', 'OtherPay',\n", + " 'Benefits', 'TotalPay', 'TotalPayBenefits', 'Year', 'Notes', 'Agency',\n", + " 'Status'],\n", + " dtype='object')" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "a = df.columns\n", + "print(df[:0]) # checking the column names in the row 1\n", + "a" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "display the number of rows and cols" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(148654, 13)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "display the dataframe info (types of data in columns and not null values etc.)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 148654 entries, 0 
to 148653\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Id 148654 non-null int64 \n", + " 1 EmployeeName 148654 non-null object \n", + " 2 JobTitle 148654 non-null object \n", + " 3 BasePay 148045 non-null float64\n", + " 4 OvertimePay 148650 non-null float64\n", + " 5 OtherPay 148650 non-null float64\n", + " 6 Benefits 112491 non-null float64\n", + " 7 TotalPay 148654 non-null float64\n", + " 8 TotalPayBenefits 148654 non-null float64\n", + " 9 Year 148654 non-null int64 \n", + " 10 Notes 0 non-null float64\n", + " 11 Agency 148654 non-null object \n", + " 12 Status 0 non-null float64\n", + "dtypes: float64(8), int64(2), object(3)\n", + "memory usage: 14.7+ MB\n" + ] + } + ], + "source": [ + "df.info() # (verbose=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "display stats of the dataframe like count, mean, std, max, 25% etc....." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdBasePayOvertimePayOtherPayBenefitsTotalPayTotalPayBenefitsYearNotesStatus
count148654.000000148045.000000148650.000000148650.000000112491.000000148654.000000148654.000000148654.0000000.00.0
mean74327.50000066325.4488415066.0598863648.76729725007.89315174768.32197293692.5548112012.522643NaNNaN
std42912.85779542764.63549511454.3805598056.60186615402.21585850517.00527462793.5334831.117538NaNNaN
min1.000000-166.010000-0.010000-7058.590000-33.890000-618.130000-618.1300002011.000000NaNNaN
25%37164.25000033588.2000000.0000000.00000011535.39500036168.99500044065.6500002012.000000NaNNaN
50%74327.50000065007.4500000.000000811.27000028628.62000071426.61000092404.0900002013.000000NaNNaN
75%111490.75000094691.0500004658.1750004236.06500035566.855000105839.135000132876.4500002014.000000NaNNaN
max148654.000000319275.010000245131.880000400184.25000096570.660000567595.430000567595.4300002014.000000NaNNaN
\n", + "
" + ], + "text/plain": [ + " Id BasePay OvertimePay OtherPay \\\n", + "count 148654.000000 148045.000000 148650.000000 148650.000000 \n", + "mean 74327.500000 66325.448841 5066.059886 3648.767297 \n", + "std 42912.857795 42764.635495 11454.380559 8056.601866 \n", + "min 1.000000 -166.010000 -0.010000 -7058.590000 \n", + "25% 37164.250000 33588.200000 0.000000 0.000000 \n", + "50% 74327.500000 65007.450000 0.000000 811.270000 \n", + "75% 111490.750000 94691.050000 4658.175000 4236.065000 \n", + "max 148654.000000 319275.010000 245131.880000 400184.250000 \n", + "\n", + " Benefits TotalPay TotalPayBenefits Year Notes \\\n", + "count 112491.000000 148654.000000 148654.000000 148654.000000 0.0 \n", + "mean 25007.893151 74768.321972 93692.554811 2012.522643 NaN \n", + "std 15402.215858 50517.005274 62793.533483 1.117538 NaN \n", + "min -33.890000 -618.130000 -618.130000 2011.000000 NaN \n", + "25% 11535.395000 36168.995000 44065.650000 2012.000000 NaN \n", + "50% 28628.620000 71426.610000 92404.090000 2013.000000 NaN \n", + "75% 35566.855000 105839.135000 132876.450000 2014.000000 NaN \n", + "max 96570.660000 567595.430000 567595.430000 2014.000000 NaN \n", + "\n", + " Status \n", + "count 0.0 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe() # statistics - Summary" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "display null values per column" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "BasePay 609\n", + "OvertimePay 4\n", + "OtherPay 4\n", + "Benefits 36163\n", + "Notes 148654\n", + "Status 148654\n", + "dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"null_values=df.columns[df.isnull().any()]\n", + "df[null_values].isnull().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "remove columns with all values as NaN" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdEmployeeNameJobTitleTotalPayTotalPayBenefitsYearAgency
01NATHANIEL FORDGENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY567595.43567595.432011San Francisco
12GARY JIMENEZCAPTAIN III (POLICE DEPARTMENT)538909.28538909.282011San Francisco
23ALBERT PARDINICAPTAIN III (POLICE DEPARTMENT)335279.91335279.912011San Francisco
34CHRISTOPHER CHONGWIRE ROPE CABLE MAINTENANCE MECHANIC332343.61332343.612011San Francisco
45PATRICK GARDNERDEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)326373.19326373.192011San Francisco
........................
148649148650Roy I TilleryCustodian0.000.002014San Francisco
148650148651Not providedNot provided0.000.002014San Francisco
148651148652Not providedNot provided0.000.002014San Francisco
148652148653Not providedNot provided0.000.002014San Francisco
148653148654Joe LopezCounselor, Log Cabin Ranch-618.13-618.132014San Francisco
\n", + "

148654 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " Id EmployeeName \\\n", + "0 1 NATHANIEL FORD \n", + "1 2 GARY JIMENEZ \n", + "2 3 ALBERT PARDINI \n", + "3 4 CHRISTOPHER CHONG \n", + "4 5 PATRICK GARDNER \n", + "... ... ... \n", + "148649 148650 Roy I Tillery \n", + "148650 148651 Not provided \n", + "148651 148652 Not provided \n", + "148652 148653 Not provided \n", + "148653 148654 Joe Lopez \n", + "\n", + " JobTitle TotalPay \\\n", + "0 GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY 567595.43 \n", + "1 CAPTAIN III (POLICE DEPARTMENT) 538909.28 \n", + "2 CAPTAIN III (POLICE DEPARTMENT) 335279.91 \n", + "3 WIRE ROPE CABLE MAINTENANCE MECHANIC 332343.61 \n", + "4 DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT) 326373.19 \n", + "... ... ... \n", + "148649 Custodian 0.00 \n", + "148650 Not provided 0.00 \n", + "148651 Not provided 0.00 \n", + "148652 Not provided 0.00 \n", + "148653 Counselor, Log Cabin Ranch -618.13 \n", + "\n", + " TotalPayBenefits Year Agency \n", + "0 567595.43 2011 San Francisco \n", + "1 538909.28 2011 San Francisco \n", + "2 335279.91 2011 San Francisco \n", + "3 332343.61 2011 San Francisco \n", + "4 326373.19 2011 San Francisco \n", + "... ... ... ... 
\n", + "148649 0.00 2014 San Francisco \n", + "148650 0.00 2014 San Francisco \n", + "148651 0.00 2014 San Francisco \n", + "148652 0.00 2014 San Francisco \n", + "148653 -618.13 2014 San Francisco \n", + "\n", + "[148654 rows x 7 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1 = df.dropna(axis=1) # inplace = True\n", + "df1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "display number of unique values in each column" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Id 148654\n", + "EmployeeName 110811\n", + "JobTitle 2159\n", + "BasePay 109489\n", + "OvertimePay 65998\n", + "OtherPay 83225\n", + "Benefits 98465\n", + "TotalPay 138486\n", + "TotalPayBenefits 142098\n", + "Year 4\n", + "Notes 0\n", + "Agency 1\n", + "Status 0\n", + "dtype: int64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2 = pd.read_csv(\"Salary.csv\")\n", + "df2.nunique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "mean of total pay of all people based on year" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Year\n", + "2011 71744.103871\n", + "2012 74113.262265\n", + "2013 77611.443142\n", + "2014 75463.918140\n", + "Name: TotalPay, dtype: float64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dftp = df2.groupby('Year').mean()['TotalPay'] #(total pay vs year))\n", + "dftp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "how many people have 0 overtime pay" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "77321" + ] + }, + "execution_count": 14, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3 = sum(df2[df2['OvertimePay']==0]['Id'].value_counts()==1)\n", + "df3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "max, min, mean, median and other stats of TotalPay of people having 0 OvertimePay" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 77321.000000\n", + "mean 60229.348901\n", + "std 49307.912350\n", + "min -618.130000\n", + "25% 13290.450000\n", + "50% 58158.590000\n", + "75% 91115.090000\n", + "max 567595.430000\n", + "Name: TotalPay, dtype: float64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Stats = df2.loc[df2['OvertimePay'] == 0]\n", + "Stats['TotalPay'].describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "find Id of that person with max TotalPay you got in previous question" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Id_person = df2.loc[df2['TotalPay'].idxmax()]\n", + "Id_person['Id']\n", + "#df2.groupby(['TotalPay']).max()['Id']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "name of employee with total pay benefits = 87619.78" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "12345 REBECCA CHIU\n", + "Name: EmployeeName, dtype: object" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Name_Emp = df1.loc[df1['TotalPayBenefits'] == 87619.78]\n", + "Name_Emp['EmployeeName']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "how many people 
have BasePay > 150000 and OvertimePay > 100000" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "156" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Total_people = df2[(df2['BasePay'] > 150000) & (df2['OvertimePay'] > 100000)]\n", + "Total_people.size" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "which job title generally has highest average TotalPayBenefits" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "JobTitle ZOO CURATOR\n", + "TotalPayBenefits 436224\n", + "dtype: object" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Job_Title = df2.groupby('JobTitle', as_index = False)['TotalPayBenefits'].mean().max()\n", + "Job_Title" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "How many employees are POLICE" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2512" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2[df2['JobTitle'].str.contains('POLICE')]['JobTitle'].size" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "# How many employees are POLICE\n", + "def police_string(title):\n", + " if 'police' in title.lower().split():\n", + " return True\n", + " else:\n", + " return False" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total employees who are Police: 7489\n" + ] + } + ], + "source": [ + "P = sum(df2['JobTitle'].apply(lambda x:police_string(x)))\n", + "print(\"Total employees who are Police: \",P)" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Assignment/Naga Assignment1 Numpy.ipynb b/Assignment/Naga Assignment1 Numpy.ipynb new file mode 100644 index 0000000..e243790 --- /dev/null +++ b/Assignment/Naga Assignment1 Numpy.ipynb @@ -0,0 +1,363 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Assignment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Make a python list => \\[1,2,3,4,5\\]\n", + "\n", + "Convert it into numpy array and print it" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 2, 3, 4, 5])" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list1 = [1,2,3,4,5]\n", + "np.array(list1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Make a python matrix (3 x 3) => \\[[1,2,3],[4,5,6],[7,8,9]\\]\n", + "\n", + "Convert it into numpy array and print it" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[1, 2, 3],\n", + " [4, 5, 6],\n", + " [7, 8, 9]])" + ] + }, + "execution_count": 4, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "matrix1 = [[1,2,3],[4,5,6],[7,8,9]]\n", + "np.array(matrix1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Make a matrix (3 x 3) using built-in methods (like arange(), reshape() etc.):\n", + "\n", + "\\[ [1,3,5],\n", + "\n", + " [7,9,11],\n", + " \n", + " [13,15,17] \\]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 1, 3, 5],\n", + " [ 7, 9, 11],\n", + " [13, 15, 17]])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "arr2 = np.arange(1,19,2) # does not consider the last element or value\n", + "arr2.reshape(3,3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a numpy array with 10 random numbers from 0 to 10 (there should be few numbers greater than 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([-0.21018049, 2.44756364, -1.15507536, -0.52912029, 0.36448852,\n", + " 0.36441865, -1.4346832 , -1.05009222, 0.77984683, 1.07404423])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "arr3 = np.random.randn(10)\n", + "arr3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create numpy array => \\[1,2,3,4,5\\] and convert it to 2D array with 5 rows" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[1],\n", + " [2],\n", + " [3],\n", + " [4],\n", + " [5]])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list4 = [1,2,3,4,5]\n", + "arr4 = np.array(list4)\n", + "arr4.reshape(5,1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print the shape of the 
above created array" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(5,)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "arr4.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a numpy array with 10 elements in it. Access and print its 3rd, 4th and 9th element." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n", + "3 4 9\n" + ] + } + ], + "source": [ + "list6 = [1,2,3,4,5,6,7,8,9,10]\n", + "print(list6)\n", + "arr5 = np.array(list6)\n", + "print(arr5[2],arr5[3],arr5[8]) # 3rd, 4th and 9th elements live at 0-based indices 2, 3 and 8" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print alternate elements of that array" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 3, 5, 7, 9])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "arr6 = arr5[::2]\n", + "arr6" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Change last 3 elements into 100 using broadcasting and print" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 1, 3, 100, 100, 100])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# broadcasting\n", + "arr6[-3:]=100\n", + "arr6" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a 5 x 5 matrix (fill it with any element you like), print it.\n", + "\n", + "Then print the middle (3 x 3) matrix."
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 0 1 2 3 4]\n", + " [ 5 6 7 8 9]\n", + " [10 11 12 13 14]\n", + " [15 16 17 18 19]\n", + " [20 21 22 23 24]]\n", + "Mid 3X3 matrix:\n", + "[[ 6 7 8]\n", + " [11 12 13]\n", + " [16 17 18]]\n" + ] + } + ], + "source": [ + "matrix2 = np.arange(25).reshape(5, 5)\n", + "print(matrix2)\n", + "print('Mid 3X3 matrix:',matrix2[1:4,1:4],sep='\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Assignment2 pd.ipynb b/Assignment2 pd.ipynb new file mode 100644 index 0000000..2bb9e5b --- /dev/null +++ b/Assignment2 pd.ipynb @@ -0,0 +1,1206 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "#%matplotlib notebook\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "import the dataset into a dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdEmployeeNameJobTitleBasePayOvertimePayOtherPayBenefitsTotalPayTotalPayBenefitsYearNotesAgencyStatus
01NATHANIEL FORDGENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY167411.180.00400184.25NaN567595.43567595.432011NaNSan FranciscoNaN
12GARY JIMENEZCAPTAIN III (POLICE DEPARTMENT)155966.02245131.88137811.38NaN538909.28538909.282011NaNSan FranciscoNaN
23ALBERT PARDINICAPTAIN III (POLICE DEPARTMENT)212739.13106088.1816452.60NaN335279.91335279.912011NaNSan FranciscoNaN
34CHRISTOPHER CHONGWIRE ROPE CABLE MAINTENANCE MECHANIC77916.0056120.71198306.90NaN332343.61332343.612011NaNSan FranciscoNaN
45PATRICK GARDNERDEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)134401.609737.00182234.59NaN326373.19326373.192011NaNSan FranciscoNaN
..........................................
148649148650Roy I TilleryCustodian0.000.000.000.00.000.002014NaNSan FranciscoNaN
148650148651Not providedNot providedNaNNaNNaNNaN0.000.002014NaNSan FranciscoNaN
148651148652Not providedNot providedNaNNaNNaNNaN0.000.002014NaNSan FranciscoNaN
148652148653Not providedNot providedNaNNaNNaNNaN0.000.002014NaNSan FranciscoNaN
148653148654Joe LopezCounselor, Log Cabin Ranch0.000.00-618.130.0-618.13-618.132014NaNSan FranciscoNaN
\n", + "

148654 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " Id EmployeeName \\\n", + "0 1 NATHANIEL FORD \n", + "1 2 GARY JIMENEZ \n", + "2 3 ALBERT PARDINI \n", + "3 4 CHRISTOPHER CHONG \n", + "4 5 PATRICK GARDNER \n", + "... ... ... \n", + "148649 148650 Roy I Tillery \n", + "148650 148651 Not provided \n", + "148651 148652 Not provided \n", + "148652 148653 Not provided \n", + "148653 148654 Joe Lopez \n", + "\n", + " JobTitle BasePay \\\n", + "0 GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY 167411.18 \n", + "1 CAPTAIN III (POLICE DEPARTMENT) 155966.02 \n", + "2 CAPTAIN III (POLICE DEPARTMENT) 212739.13 \n", + "3 WIRE ROPE CABLE MAINTENANCE MECHANIC 77916.00 \n", + "4 DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT) 134401.60 \n", + "... ... ... \n", + "148649 Custodian 0.00 \n", + "148650 Not provided NaN \n", + "148651 Not provided NaN \n", + "148652 Not provided NaN \n", + "148653 Counselor, Log Cabin Ranch 0.00 \n", + "\n", + " OvertimePay OtherPay Benefits TotalPay TotalPayBenefits Year \\\n", + "0 0.00 400184.25 NaN 567595.43 567595.43 2011 \n", + "1 245131.88 137811.38 NaN 538909.28 538909.28 2011 \n", + "2 106088.18 16452.60 NaN 335279.91 335279.91 2011 \n", + "3 56120.71 198306.90 NaN 332343.61 332343.61 2011 \n", + "4 9737.00 182234.59 NaN 326373.19 326373.19 2011 \n", + "... ... ... ... ... ... ... \n", + "148649 0.00 0.00 0.0 0.00 0.00 2014 \n", + "148650 NaN NaN NaN 0.00 0.00 2014 \n", + "148651 NaN NaN NaN 0.00 0.00 2014 \n", + "148652 NaN NaN NaN 0.00 0.00 2014 \n", + "148653 0.00 -618.13 0.0 -618.13 -618.13 2014 \n", + "\n", + " Notes Agency Status \n", + "0 NaN San Francisco NaN \n", + "1 NaN San Francisco NaN \n", + "2 NaN San Francisco NaN \n", + "3 NaN San Francisco NaN \n", + "4 NaN San Francisco NaN \n", + "... ... ... ... 
\n", + "148649 NaN San Francisco NaN \n", + "148650 NaN San Francisco NaN \n", + "148651 NaN San Francisco NaN \n", + "148652 NaN San Francisco NaN \n", + "148653 NaN San Francisco NaN \n", + "\n", + "[148654 rows x 13 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"Salary.csv\")\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "display the column names" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Empty DataFrame\n", + "Columns: [Id, EmployeeName, JobTitle, BasePay, OvertimePay, OtherPay, Benefits, TotalPay, TotalPayBenefits, Year, Notes, Agency, Status]\n", + "Index: []\n" + ] + }, + { + "data": { + "text/plain": [ + "Index(['Id', 'EmployeeName', 'JobTitle', 'BasePay', 'OvertimePay', 'OtherPay',\n", + " 'Benefits', 'TotalPay', 'TotalPayBenefits', 'Year', 'Notes', 'Agency',\n", + " 'Status'],\n", + " dtype='object')" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "a = df.columns\n", + "print(df[:0]) # checking the column names in the row 1\n", + "a" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "display the number of rows and cols" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(148654, 13)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "display the dataframe info (types of data in columns and not null values etc.)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 148654 entries, 0 
to 148653\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Id 148654 non-null int64 \n", + " 1 EmployeeName 148654 non-null object \n", + " 2 JobTitle 148654 non-null object \n", + " 3 BasePay 148045 non-null float64\n", + " 4 OvertimePay 148650 non-null float64\n", + " 5 OtherPay 148650 non-null float64\n", + " 6 Benefits 112491 non-null float64\n", + " 7 TotalPay 148654 non-null float64\n", + " 8 TotalPayBenefits 148654 non-null float64\n", + " 9 Year 148654 non-null int64 \n", + " 10 Notes 0 non-null float64\n", + " 11 Agency 148654 non-null object \n", + " 12 Status 0 non-null float64\n", + "dtypes: float64(8), int64(2), object(3)\n", + "memory usage: 14.7+ MB\n" + ] + } + ], + "source": [ + "df.info() # (verbose=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "display stats of the dataframe like count, mean, std, max, 25%, etc." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdBasePayOvertimePayOtherPayBenefitsTotalPayTotalPayBenefitsYearNotesStatus
count148654.000000148045.000000148650.000000148650.000000112491.000000148654.000000148654.000000148654.0000000.00.0
mean74327.50000066325.4488415066.0598863648.76729725007.89315174768.32197293692.5548112012.522643NaNNaN
std42912.85779542764.63549511454.3805598056.60186615402.21585850517.00527462793.5334831.117538NaNNaN
min1.000000-166.010000-0.010000-7058.590000-33.890000-618.130000-618.1300002011.000000NaNNaN
25%37164.25000033588.2000000.0000000.00000011535.39500036168.99500044065.6500002012.000000NaNNaN
50%74327.50000065007.4500000.000000811.27000028628.62000071426.61000092404.0900002013.000000NaNNaN
75%111490.75000094691.0500004658.1750004236.06500035566.855000105839.135000132876.4500002014.000000NaNNaN
max148654.000000319275.010000245131.880000400184.25000096570.660000567595.430000567595.4300002014.000000NaNNaN
\n", + "
" + ], + "text/plain": [ + " Id BasePay OvertimePay OtherPay \\\n", + "count 148654.000000 148045.000000 148650.000000 148650.000000 \n", + "mean 74327.500000 66325.448841 5066.059886 3648.767297 \n", + "std 42912.857795 42764.635495 11454.380559 8056.601866 \n", + "min 1.000000 -166.010000 -0.010000 -7058.590000 \n", + "25% 37164.250000 33588.200000 0.000000 0.000000 \n", + "50% 74327.500000 65007.450000 0.000000 811.270000 \n", + "75% 111490.750000 94691.050000 4658.175000 4236.065000 \n", + "max 148654.000000 319275.010000 245131.880000 400184.250000 \n", + "\n", + " Benefits TotalPay TotalPayBenefits Year Notes \\\n", + "count 112491.000000 148654.000000 148654.000000 148654.000000 0.0 \n", + "mean 25007.893151 74768.321972 93692.554811 2012.522643 NaN \n", + "std 15402.215858 50517.005274 62793.533483 1.117538 NaN \n", + "min -33.890000 -618.130000 -618.130000 2011.000000 NaN \n", + "25% 11535.395000 36168.995000 44065.650000 2012.000000 NaN \n", + "50% 28628.620000 71426.610000 92404.090000 2013.000000 NaN \n", + "75% 35566.855000 105839.135000 132876.450000 2014.000000 NaN \n", + "max 96570.660000 567595.430000 567595.430000 2014.000000 NaN \n", + "\n", + " Status \n", + "count 0.0 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe() # statistics - Summary" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "display null values per column" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "BasePay 609\n", + "OvertimePay 4\n", + "OtherPay 4\n", + "Benefits 36163\n", + "Notes 148654\n", + "Status 148654\n", + "dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"null_values=df.columns[df.isnull().any()]\n", + "df[null_values].isnull().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "remove columns will all values as NaN" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdEmployeeNameJobTitleTotalPayTotalPayBenefitsYearAgency
01NATHANIEL FORDGENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY567595.43567595.432011San Francisco
12GARY JIMENEZCAPTAIN III (POLICE DEPARTMENT)538909.28538909.282011San Francisco
23ALBERT PARDINICAPTAIN III (POLICE DEPARTMENT)335279.91335279.912011San Francisco
34CHRISTOPHER CHONGWIRE ROPE CABLE MAINTENANCE MECHANIC332343.61332343.612011San Francisco
45PATRICK GARDNERDEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)326373.19326373.192011San Francisco
........................
148649148650Roy I TilleryCustodian0.000.002014San Francisco
148650148651Not providedNot provided0.000.002014San Francisco
148651148652Not providedNot provided0.000.002014San Francisco
148652148653Not providedNot provided0.000.002014San Francisco
148653148654Joe LopezCounselor, Log Cabin Ranch-618.13-618.132014San Francisco
\n", + "

148654 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " Id EmployeeName \\\n", + "0 1 NATHANIEL FORD \n", + "1 2 GARY JIMENEZ \n", + "2 3 ALBERT PARDINI \n", + "3 4 CHRISTOPHER CHONG \n", + "4 5 PATRICK GARDNER \n", + "... ... ... \n", + "148649 148650 Roy I Tillery \n", + "148650 148651 Not provided \n", + "148651 148652 Not provided \n", + "148652 148653 Not provided \n", + "148653 148654 Joe Lopez \n", + "\n", + " JobTitle TotalPay \\\n", + "0 GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY 567595.43 \n", + "1 CAPTAIN III (POLICE DEPARTMENT) 538909.28 \n", + "2 CAPTAIN III (POLICE DEPARTMENT) 335279.91 \n", + "3 WIRE ROPE CABLE MAINTENANCE MECHANIC 332343.61 \n", + "4 DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT) 326373.19 \n", + "... ... ... \n", + "148649 Custodian 0.00 \n", + "148650 Not provided 0.00 \n", + "148651 Not provided 0.00 \n", + "148652 Not provided 0.00 \n", + "148653 Counselor, Log Cabin Ranch -618.13 \n", + "\n", + " TotalPayBenefits Year Agency \n", + "0 567595.43 2011 San Francisco \n", + "1 538909.28 2011 San Francisco \n", + "2 335279.91 2011 San Francisco \n", + "3 332343.61 2011 San Francisco \n", + "4 326373.19 2011 San Francisco \n", + "... ... ... ... 
\n", + "148649 0.00 2014 San Francisco \n", + "148650 0.00 2014 San Francisco \n", + "148651 0.00 2014 San Francisco \n", + "148652 0.00 2014 San Francisco \n", + "148653 -618.13 2014 San Francisco \n", + "\n", + "[148654 rows x 7 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1 = df.dropna(axis=1) # inplace = True\n", + "df1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "display number of unique values in each column" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Id 148654\n", + "EmployeeName 110811\n", + "JobTitle 2159\n", + "BasePay 109489\n", + "OvertimePay 65998\n", + "OtherPay 83225\n", + "Benefits 98465\n", + "TotalPay 138486\n", + "TotalPayBenefits 142098\n", + "Year 4\n", + "Notes 0\n", + "Agency 1\n", + "Status 0\n", + "dtype: int64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2 = pd.read_csv(\"Salary.csv\")\n", + "df2.nunique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "mean of total pay of all people based on year" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Year\n", + "2011 71744.103871\n", + "2012 74113.262265\n", + "2013 77611.443142\n", + "2014 75463.918140\n", + "Name: TotalPay, dtype: float64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dftp = df2.groupby('Year').mean()['TotalPay'] #(total pay vs year))\n", + "dftp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "how many people have 0 overtime pay" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "77321" + ] + }, + "execution_count": 14, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3 = sum(df2[df2['OvertimePay']==0]['Id'].value_counts()==1)\n", + "df3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "max, min, mean, median and other stats of TotalPay of people having 0 OvertimePay" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 77321.000000\n", + "mean 60229.348901\n", + "std 49307.912350\n", + "min -618.130000\n", + "25% 13290.450000\n", + "50% 58158.590000\n", + "75% 91115.090000\n", + "max 567595.430000\n", + "Name: TotalPay, dtype: float64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Stats = df2.loc[df2['OvertimePay'] == 0]\n", + "Stats['TotalPay'].describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "find Id of that person with max TotalPay you got in previous question" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Id_person = df2.loc[df2['TotalPay'].idxmax()]\n", + "Id_person['Id']\n", + "#df2.groupby(['TotalPay']).max()['Id']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "name of employee with total pay benefits = 87619.78" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "12345 REBECCA CHIU\n", + "Name: EmployeeName, dtype: object" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Name_Emp = df1.loc[df1['TotalPayBenefits'] == 87619.78]\n", + "Name_Emp['EmployeeName']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "how many people 
have BasePay > 150000 and OvertimePay > 100000" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "156" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Total_people = df2[(df2['BasePay'] > 150000) & (df2['OvertimePay'] > 100000)]\n", + "Total_people.size" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "which job title generally has highest average TotalPayBenefits" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "JobTitle ZOO CURATOR\n", + "TotalPayBenefits 436224\n", + "dtype: object" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Job_Title = df2.groupby('JobTitle', as_index = False)['TotalPayBenefits'].mean().max()\n", + "Job_Title" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "How many employees are POLICE" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2512" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2[df2['JobTitle'].str.contains('POLICE')]['JobTitle'].size" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "# How many employees are POLICE\n", + "def police_string(title):\n", + " if 'police' in title.lower().split():\n", + " return True\n", + " else:\n", + " return False" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total employees who are Police: 7489\n" + ] + } + ], + "source": [ + "P = sum(df2['JobTitle'].apply(lambda x:police_string(x)))\n", + "print(\"Total employees who are Police: \",P)" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}