diff --git a/.gitignore b/.gitignore
index 2cfe5168a6f..cd05d636d57 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,4 @@ my_feature_list.pkl
 .DS_Store
 __pycache__
 venv/
+tools/enron_mail_20150507.tar.gz
diff --git a/datasets_questions/explore_enron_data.py b/datasets_questions/explore_enron_data.py
index 853767a5e09..128cb2a6924 100755
--- a/datasets_questions/explore_enron_data.py
+++ b/datasets_questions/explore_enron_data.py
@@ -15,7 +15,74 @@
 
 """
 
+import sys
 import joblib
 
-enron_data = joblib.load(open("../final_project/final_project_dataset.pkl", "rb"))
+enron_data = joblib.load("./final_project/final_project_dataset.pkl")
+# Print the first 5 items in the enron_data dictionary
+for i, (key, value) in enumerate(enron_data.items()):
+    if i >= 5:
+        break
+    print(f"{key}: {value}\n")
+
+# Print the number of data points (people) in the dataset
+print(f"Number of data points: {len(enron_data)}")
+# Print the number of features for each person in the dataset
+print(f"Number of features: {len(list(enron_data.values())[0])}")
+# Count the number of POIs in the dataset
+print(f"Number of POIs: {sum([1 for person in enron_data.values() if person['poi']== 1])}") # == 1 or == True or only if person['poi'] without == 1
+# Build the list of POI names from ./final_project/poi_names.txt (skipping blank lines and the source URL) and print how many there are
+poi_names = [name for name in open("./final_project/poi_names.txt").read().split("\n") if name not in ["", "http://usatoday30.usatoday.com/money/industries/energy/2005-12-28-enron-participants_x.htm"]]
+print(f"Number of POIs in poi_names.txt: {len(poi_names)}")
+print(poi_names)
+
+# Feature dict of the first person (its keys are the feature names shared by every record)
+first_person_features = list(enron_data.values())[0]
+# List of names in the dataset
+names_enron_data = list(enron_data.keys())
+
+
+# Find the total-stock feature name and the dataset key for James Prentice
+stock_features = [feature for feature in first_person_features.keys() if 'stock' in feature.lower() and 'total' in feature.lower()]
+james_prentice_name = [name for name in names_enron_data if 'james' in name.lower() and 'prentice' in name.lower()]
+# print(f"Stock features: {stock_features}")
+# print(f"Name of the person: {james_prentice_name}")
+
+# Print the total value of the stock belonging to James Prentice
+print(f"Total stock value of James Prentice: {enron_data[james_prentice_name[0]][stock_features[0]]}")
+
+
+# Find the dataset key for Wesley Colwell
+wesley_colwell_name = [name for name in names_enron_data if 'wesley' in name.lower() and 'colwell' in name.lower()]
+# print(f"Name of the person: {wesley_colwell_name}")
+# Print the number of emails sent from Wesley Colwell to POIs
+print(f"Number of emails from Wesley Colwell to POIs: {enron_data[wesley_colwell_name[0]]['from_this_person_to_poi']}")
+
+
+# Find the dataset key for Jeffrey K Skilling ('skillin' is a substring of 'skilling', so the match still works)
+jeffrey_skillin_name = [name for name in names_enron_data if 'jeffrey' in name.lower() and 'skillin' in name.lower()]
+# print(f"Name of the person: {jeffrey_skillin_name}")
+
+# Print the value of stock options belonging to Jeffrey K Skilling
+print(f"The value of stock options belonging to Jeffrey K Skilling: {enron_data[jeffrey_skillin_name[0]]['exercised_stock_options']}")
+
+# Compare the total payments of Lay, Skilling and Fastow
+# Lay_Skilling_Fastow_names = [name for name in names_enron_data if 'jeffrey' in name.lower() or 'lay' in name.lower() or 'fastow' in name.lower()]
+Lay_Skilling_Fastow_names = ['LAY KENNETH L', 'FASTOW ANDREW S', 'SKILLING JEFFREY K']
+print(f"Name of the person: {Lay_Skilling_Fastow_names}")
+total_payments = {name: enron_data[name]['total_payments'] for name in Lay_Skilling_Fastow_names}
+# Print the name of the person with the highest total payments
+max_total_payments = max(total_payments, key=total_payments.get)
+print(f"Name of the person with the highest total payments: {max_total_payments}")
+
+# Count the people with a quantified salary / a known email address (missing values are stored as the string 'NaN')
+people_with_quantified_salary = [person for person in enron_data.values() if person['salary'] != 'NaN']
+people_with_known_emails = [person for person in enron_data.values() if person['email_address'] != 'NaN']
+
+print('number of persons with known salary:', len(people_with_quantified_salary))
+print('number of persons with known emails:', len(people_with_known_emails))
+
+sys.path.append("./tools/")
+from feature_format import featureFormat
+enron_data_array = featureFormat(enron_data, first_person_features)  # NOTE(review): passes the feature dict itself (iterates as feature names, including the presumably non-numeric 'email_address') - confirm featureFormat accepts this
 
diff --git a/decision_tree/dt_author_id.py b/decision_tree/dt_author_id.py
index 006afb8357b..5098c9da1af 100644
--- a/decision_tree/dt_author_id.py
+++ b/decision_tree/dt_author_id.py
@@ -10,7 +10,7 @@
 
 import sys
 from time import time
-sys.path.append("../tools/")
+sys.path.append("./tools/")
 from email_preprocess import preprocess
 
 
@@ -24,8 +24,63 @@
 
 #########################################################
 ### your code goes here ###
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.metrics import accuracy_score
+# Train a Decision Tree Classifier (DT) and time the fit
+t0 = time()
+clf = DecisionTreeClassifier(random_state=0, min_samples_split=40)
+clf.fit(features_train, labels_train)
+print("Training Time:", round(time()-t0, 3), "s")
 
 
 #########################################################
+pred = clf.predict(features_test)
+accuracy = clf.score(features_test, labels_test)
+acc = accuracy_score(labels_test, pred)
+
+print("Accuracy:", round(accuracy,3))
+print("Metrics Accuracy:", round(acc, 3))
+
+#########################################################
+
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.metrics import accuracy_score
+
+# Train an AdaBoost Classifier (AB) and time the fit
+t0 = time()
+clf = AdaBoostClassifier(n_estimators=100, random_state=0)
+clf.fit(features_train, labels_train)
+print("Training Time:", round(time()-t0, 3), "s")
+
+acc = accuracy_score(labels_test, clf.predict(features_test))
+
+print("Metrics Accuracy:", round(acc, 3))
+
+
+#########################################################
+from sklearn.neighbors import KNeighborsClassifier
+
+# Train a KNeighbors Classifier (KNN) and time the fit
+t0 = time()
+clf = KNeighborsClassifier(n_neighbors=3)
+clf.fit(features_train, labels_train)
+print("Training Time:", round(time()-t0, 3), "s")
+
+acc = accuracy_score(labels_test, clf.predict(features_test))
+print("Metrics Accuracy:", round(acc, 3))
+
+
+#########################################################
+
+from sklearn.ensemble import RandomForestClassifier
+
+# Train a RandomForest Classifier (RF) and time the fit
+t0 = time()
+clf = RandomForestClassifier(n_estimators=100, random_state=0)
+clf.fit(features_train, labels_train)
+print("Training Time:", round(time()-t0, 3), "s")
+
+acc = accuracy_score(labels_test, clf.predict(features_test))
+print("Metrics Accuracy:", round(acc, 3))
 
 
diff --git a/naive_bayes/nb_author_id.py b/naive_bayes/nb_author_id.py
index 102da2ec13f..4ea18e3272f 100644
--- a/naive_bayes/nb_author_id.py
+++ b/naive_bayes/nb_author_id.py
@@ -9,10 +9,10 @@
     Sara has label 0
     Chris has label 1
 """
-    
+
 import sys
 from time import time
-sys.path.append("../tools/")
+sys.path.append("./tools/")
 from email_preprocess import preprocess
 
 
@@ -24,10 +24,27 @@
 
 ##############################################################
 # Enter Your Code Here
+from sklearn.naive_bayes import GaussianNB
+t0 = time()
+clf = GaussianNB()
+clf.fit(features_train, labels_train)
+print("Training Time:", round(time()-t0, 3), "s")
+t0 = time()
+pred = clf.predict(features_test)
+print("Predicting Time:", round(time()-t0, 3), "s")
+
+accuracy = clf.score(features_test, labels_test)
+
+from sklearn.metrics import accuracy_score
+acc = accuracy_score(labels_test, pred)
+
+print("Accuracy:", round(accuracy, 3))
+
+print("Metrics Accuracy:", round(acc, 3))
+#########################################
 
 
 
-##############################################################
 
 ##############################################################
 '''
diff --git a/svm/svm_author_id.py b/svm/svm_author_id.py
index 35390d60ad6..5662f443e42 100644
--- a/svm/svm_author_id.py
+++ b/svm/svm_author_id.py
@@ -10,7 +10,7 @@
 
 import sys
 from time import time
-sys.path.append("../tools/")
+sys.path.append("./tools/")
 from email_preprocess import preprocess
 
 
@@ -19,13 +19,35 @@
 ### labels_train and labels_test are the corresponding item labels
 features_train, features_test, labels_train, labels_test = preprocess()
 
-
 #########################################################
 ### your code goes here ###
+from sklearn.svm import SVC
+from sklearn.metrics import accuracy_score
+# Create a Support Vector Classifier (SVC) object with a linear kernel
+clf = SVC(kernel='linear')
 
 
 
-#########################################################
+# Record the start time for training
+t0 = time()
+clf.fit(features_train, labels_train)
+print("Training Time:", round(time()-t0, 3), "s")
+
+# Record the start time for predicting
+t0 = time()
+pred = clf.predict(features_test)
+print("Predicting Time:", round(time()-t0, 3), "s")
+
+# Calculate and print the accuracy of the model using the test data
+accuracy = clf.score(features_test, labels_test)
+print("Accuracy:", round(accuracy, 3))
+# Cross-check the accuracy with sklearn.metrics.accuracy_score on the actual labels and the predicted values
+acc = accuracy_score(labels_test, pred)
+print("Metrics Accuracy:", round(acc, 3))
+
+
+#########################################################
+# Training on smaller datasets
 #########################################################
 '''
 You'll be Provided similar code in the Quiz
@@ -33,7 +55,81 @@
 The Code Below solves that issue, So use this one
 '''
 
-# features_train = features_train[:int(len(features_train)/100)]
-# labels_train = labels_train[:int(len(labels_train)/100)]
+# Reduce the size of the features_train list to 1% of its original size
+features_train = features_train[:int(len(features_train)/100)]
+
+# Reduce the size of the labels_train list to 1% of its original size
+labels_train = labels_train[:int(len(labels_train)/100)]
+
+
+# Record the start time for training (refits the linear-kernel SVC created above on the 1% subset)
+t0 = time()
+clf.fit(features_train, labels_train)
+print("Training Time:", round(time()-t0, 3), "s")
+
+# Record the start time for predicting
+t0 = time()
+pred = clf.predict(features_test)
+print("Predicting Time:", round(time()-t0, 3), "s")
+
+# Calculate and print the accuracy of the model using the test data
+accuracy = clf.score(features_test, labels_test)
+print("Accuracy:", round(accuracy, 3))
+
+# Cross-check the accuracy with sklearn.metrics.accuracy_score on the actual labels and the predicted values
+acc = accuracy_score(labels_test, pred)
+print("Metrics Accuracy:", round(acc, 3))
 
 #########################################################
+# Running the model with an RBF kernel on the small dataset
+########################################################
+# Remember: this model is trained on the smaller data set (1% of the original data set)
+# Create a Support Vector Classifier (SVC) object with an RBF kernel
+clf = SVC(kernel='rbf')
+
+# Record the start time for training
+t0 = time()
+clf.fit(features_train, labels_train)
+print("Training Time:", round(time()-t0, 3), "s")
+
+# Record the start time for predicting
+t0 = time()
+pred = clf.predict(features_test)
+print("Predicting Time:", round(time()-t0, 3), "s")
+
+# Calculate and print the accuracy of the model using the test data
+accuracy = clf.score(features_test, labels_test)
+print("Accuracy:", round(accuracy, 3))
+
+# Cross-check the accuracy with sklearn.metrics.accuracy_score on the actual labels and the predicted values
+acc = accuracy_score(labels_test, pred)
+print("Metrics Accuracy:", round(acc, 3))
+
+#########################################################
+# Running the RBF-kernel model with different C values (10.0, 100.0, 1000.0 and 10000) on the small dataset
+########################################################
+# Remember: this model is trained on the smaller data set (1% of the original data set)
+# C values to try; one RBF-kernel SVC is created per value inside the loop
+c_values = [10.0, 100.0, 1000.0, 10000]
+
+for c in c_values:
+    clf = SVC(kernel='rbf', C=c)
+
+    # Record the start time for training
+    t0 = time()
+    clf.fit(features_train, labels_train)
+    print("Training Time:", round(time()-t0, 3), "s")
+
+    # Record the start time for predicting
+    t0 = time()
+    pred = clf.predict(features_test)
+    print("Predicting Time:", round(time()-t0, 3), "s")
+
+    # Calculate and print the accuracy of the model using the test data
+    accuracy = clf.score(features_test, labels_test)
+    print("Accuracy:", round(accuracy, 3))
+
+    # Cross-check the accuracy with sklearn.metrics.accuracy_score on the actual labels and the predicted values
+    acc = accuracy_score(labels_test, pred)
+    print("Metrics Accuracy:", round(acc, 3))
+
diff --git a/tools/email_preprocess.py b/tools/email_preprocess.py
index fd83cea3e9f..f59d72e7bed 100644
--- a/tools/email_preprocess.py
+++ b/tools/email_preprocess.py
@@ -8,7 +8,7 @@
 from sklearn.feature_selection import SelectPercentile, f_classif
 
 
-def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"):
+def preprocess(words_file = "./tools/word_data.pkl", authors_file="./tools/email_authors.pkl"):
     """
         this function takes a pre-made list of email texts (by default word_data.pkl)
         and the corresponding authors (by default email_authors.pkl) and performs
@@ -48,7 +48,7 @@ def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/ema
 
     ### feature selection, because text is super high dimensional and
     ### can be really computationally chewy as a result
-    selector = SelectPercentile(f_classif, percentile=10)
+    selector = SelectPercentile(f_classif, percentile=1)
     selector.fit(features_train_transformed, labels_train)
     features_train_transformed = selector.transform(features_train_transformed).toarray()
     features_test_transformed = selector.transform(features_test_transformed).toarray()