From 7da97d88dabb3b06c885f592a5b88fab3126e849 Mon Sep 17 00:00:00 2001 From: Ahmed Kadry <37486894+Dramkadry@users.noreply.github.com> Date: Fri, 22 Mar 2024 22:15:55 +0200 Subject: [PATCH 01/10] Update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 2cfe5168a6..cd05d636d5 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ my_feature_list.pkl .DS_Store __pycache__ venv/ +tools/enron_mail_20150507.tar.gz From a4b340b81202cdd75d2add0a812ae3384d78e67f Mon Sep 17 00:00:00 2001 From: Ahmed Kadry <37486894+Dramkadry@users.noreply.github.com> Date: Tue, 26 Mar 2024 20:18:31 +0200 Subject: [PATCH 02/10] naive bayes and svm solution --- naive_bayes/nb_author_id.py | 23 +++++++- svm/svm_author_id.py | 106 ++++++++++++++++++++++++++++++++++-- tools/email_preprocess.py | 2 +- 3 files changed, 122 insertions(+), 9 deletions(-) diff --git a/naive_bayes/nb_author_id.py b/naive_bayes/nb_author_id.py index 102da2ec13..4ea18e3272 100644 --- a/naive_bayes/nb_author_id.py +++ b/naive_bayes/nb_author_id.py @@ -9,10 +9,10 @@ Sara has label 0 Chris has label 1 """ - + import sys from time import time -sys.path.append("../tools/") +sys.path.append("./tools/") from email_preprocess import preprocess @@ -24,10 +24,27 @@ ############################################################## # Enter Your Code Here +from sklearn.naive_bayes import GaussianNB +t0 = time() +clf = GaussianNB() +clf.fit(features_train, labels_train) +print("Training Time:", round(time()-t0, 3), "s") +t0 = time() +pred = clf.predict(features_test) +print("Predicting Time:", round(time()-t0, 3), "s") + +accuracy = clf.score(features_test, labels_test) + +from sklearn.metrics import accuracy_score +acc = accuracy_score(pred, labels_test) + +print("Accuracy:", round(accuracy, 3)) + +print("Metrics Accuracy:", round(acc, 3)) +######################################### -############################################################## ############################################################## ''' diff --git a/svm/svm_author_id.py b/svm/svm_author_id.py index 35390d60ad..5662f443e4 100644 --- a/svm/svm_author_id.py +++ b/svm/svm_author_id.py @@ -10,7 +10,7 @@ import sys from time import time -sys.path.append("../tools/") +sys.path.append("./tools/") from email_preprocess import preprocess @@ -19,13 +19,35 @@ ### labels_train and labels_test are the corresponding item labels features_train, features_test, labels_train, labels_test = preprocess() - ######################################################### ### your code goes here ### +from sklearn.svm import SVC +from sklearn.metrics import accuracy_score +# Create a Support Vector Classifier (SVC) object with a linear kernel +clf = SVC(kernel='linear') -######################################################### +# Record the start time for training +t0 = time() +clf.fit(features_train, labels_train) +print("Training Time:", round(time()-t0, 3), "s") + +# Record the start time for predicting +t0 = time() +pred = clf.predict(features_test) +print("Predicting Time:", round(time()-t0, 3), "s") + +# Calculate and print the accuracy of the model using the test data +accuracy = clf.score(features_test, labels_test) +print("Accuracy:", round(accuracy, 3)) + +# Calculate the accuracy of the model again using scikit-learn's accuracy_score with the predicted values and the actual labels +acc = accuracy_score(pred, labels_test) +print("Metrics Accuracy:", round(acc, 3)) + + 
+######################################################### +# Training on smaller datasets ######################################################### ''' You'll be Provided similar code in the Quiz But The provided code Has some Issues with Python 3 The Code Below solves that issue, So use this one ''' -# features_train = features_train[:int(len(features_train)/100)] -# labels_train = labels_train[:int(len(labels_train)/100)] +# Reduce the size of the features_train list to 1% of its original size +features_train = features_train[:int(len(features_train)/100)] + +# Reduce the size of the labels_train list to 1% of its original size +labels_train = labels_train[:int(len(labels_train)/100)] + + +# Record the start time for training +t0 = time() +clf.fit(features_train, labels_train) +print("Training Time:", round(time()-t0, 3), "s") + +# Record the start time for predicting +t0 = time() +pred = clf.predict(features_test) +print("Predicting Time:", round(time()-t0, 3), "s") + +# Calculate and print the accuracy of the model using the test data +accuracy = clf.score(features_test, labels_test) +print("Accuracy:", round(accuracy, 3)) + +# Calculate the accuracy of the model again using scikit-learn's accuracy_score with the predicted values and the actual labels +acc = accuracy_score(pred, labels_test) +print("Metrics Accuracy:", round(acc, 3)) ######################################################### +# running the model with an RBF kernel on the small dataset ######################################################## +# remember this model runs on the smaller data set of 1% of the original data set +# Create a Support Vector Classifier (SVC) object with an RBF kernel +clf = SVC(kernel='rbf') + +# Record the start time for training +t0 = time() +clf.fit(features_train, labels_train) +print("Training Time:", round(time()-t0, 3), "s") + +# Record the start time for predicting +t0 = time() +pred = clf.predict(features_test) +print("Predicting Time:", round(time()-t0, 3), "s") + +# Calculate and print the accuracy of the model using the test data +accuracy = clf.score(features_test, labels_test) +print("Accuracy:", round(accuracy, 3)) + +# Calculate the accuracy of the model again using scikit-learn's accuracy_score with the predicted values and the actual labels +acc = accuracy_score(pred, labels_test) +print("Metrics Accuracy:", round(acc, 3)) + +######################################################### +# running the model with an RBF kernel and different C values (10.0, 100.0, 1000.0, and 10000) on the small dataset +######################################################## +# remember this model runs on the smaller data set of 1% of the original data set +# Create a Support Vector Classifier (SVC) object with an RBF kernel for each C value +c_values = [10.0, 100.0, 1000.0, 10000] + +for c in c_values: + clf = SVC(kernel='rbf', C=c) + + # Record the start time for training + t0 = time() + clf.fit(features_train, labels_train) + print("Training Time:", round(time()-t0, 3), "s") + + # Record the start time for predicting + t0 = time() + pred = clf.predict(features_test) + print("Predicting Time:", round(time()-t0, 3), "s") + + # Calculate and print the accuracy of the model using the test data + accuracy = clf.score(features_test, labels_test) + print("Accuracy:", round(accuracy, 3)) + + # Calculate the accuracy of the model again using scikit-learn's accuracy_score with the predicted values and the actual labels + acc = accuracy_score(pred, labels_test) + print("Metrics 
Accuracy:", round(acc, 3)) + diff --git a/tools/email_preprocess.py b/tools/email_preprocess.py index fd83cea3e9..4bdf3043b3 100644 --- a/tools/email_preprocess.py +++ b/tools/email_preprocess.py @@ -8,7 +8,7 @@ from sklearn.feature_selection import SelectPercentile, f_classif -def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"): +def preprocess(words_file = "./tools/word_data.pkl", authors_file="./tools/email_authors.pkl"): """ this function takes a pre-made list of email texts (by default word_data.pkl) and the corresponding authors (by default email_authors.pkl) and performs From 30c7cd0d22dfa0d885080b83726da23283690d15 Mon Sep 17 00:00:00 2001 From: Ahmed Kadry <37486894+Dramkadry@users.noreply.github.com> Date: Sun, 31 Mar 2024 03:25:25 +0200 Subject: [PATCH 03/10] Update dt_author_id.py --- decision_tree/dt_author_id.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/decision_tree/dt_author_id.py b/decision_tree/dt_author_id.py index 006afb8357..fd843efc3b 100644 --- a/decision_tree/dt_author_id.py +++ b/decision_tree/dt_author_id.py @@ -24,8 +24,21 @@ ######################################################### ### your code goes here ### +from sklearn.tree import DecisionTreeClassifier +from sklearn.metrics import accuracy_score +# Create a Decision Tree Classifier (DT) object +t0 = time() +clf = DecisionTreeClassifier(random_state=0, min_samples_split=40) +clf.fit(features_train, labels_train) +print("Training Time:", round(time()-t0, 3), "s") ######################################################### +pred = clf.predict(features_test) +accuracy = clf.score(features_test, labels_test) +acc = accuracy_score(pred, labels_test) + +print("Accuracy:", round(accuracy,3)) +print("Metrics Accuracy:", round(acc, 3)) \ No newline at end of file From 693e558c10ea8f89754052cf551aeed29e5b4c2b Mon Sep 17 00:00:00 2001 From: Ahmed Kadry <37486894+Dramkadry@users.noreply.github.com> Date: Sat, 6 Apr 2024 01:01:22 +0200 Subject: [PATCH 04/10] Create settings.json --- .vscode/settings.json | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000000..62f621ee4f --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,4 @@ +{ + "python.analysis.autoImportCompletions": true, + "python.analysis.typeCheckingMode": "basic" +} \ No newline at end of file From 0679237bfeca6688e411ea133b29571c5f8e4210 Mon Sep 17 00:00:00 2001 From: Ahmed Kadry <37486894+Dramkadry@users.noreply.github.com> Date: Sat, 6 Apr 2024 11:48:03 +0200 Subject: [PATCH 05/10] Update email_preprocess.py --- tools/email_preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/email_preprocess.py b/tools/email_preprocess.py index 4bdf3043b3..f59d72e7be 100644 --- a/tools/email_preprocess.py +++ b/tools/email_preprocess.py @@ -48,7 +48,7 @@ def preprocess(words_file = "./tools/word_data.pkl", authors_file="./tools/email ### feature selection, because text is super high dimensional and ### can be really computationally chewy as a result - selector = SelectPercentile(f_classif, percentile=10) + selector = SelectPercentile(f_classif, percentile=1) selector.fit(features_train_transformed, labels_train) features_train_transformed = selector.transform(features_train_transformed).toarray() features_test_transformed = selector.transform(features_test_transformed).toarray() From 9807e9521a4981a1a738057733dac123ea2ea496 Mon Sep 17 00:00:00 2001 From: 
Ahmed Kadry <37486894+Dramkadry@users.noreply.github.com> Date: Sat, 6 Apr 2024 13:08:07 +0200 Subject: [PATCH 06/10] Commit --- decision_tree/dt_author_id.py | 46 +++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/decision_tree/dt_author_id.py b/decision_tree/dt_author_id.py index fd843efc3b..5098c9da1a 100644 --- a/decision_tree/dt_author_id.py +++ b/decision_tree/dt_author_id.py @@ -10,7 +10,7 @@ import sys from time import time -sys.path.append("../tools/") +sys.path.append("./tools/") from email_preprocess import preprocess @@ -41,4 +41,46 @@ acc = accuracy_score(pred, labels_test) print("Accuracy:", round(accuracy,3)) -print("Metrics Accuracy:", round(acc, 3)) \ No newline at end of file +print("Metrics Accuracy:", round(acc, 3)) + +######################################################### + +from sklearn.ensemble import AdaBoostClassifier +from sklearn.metrics import accuracy_score + +# Create an AdaBoost Classifier (AB) object +t0 = time() +clf = AdaBoostClassifier(n_estimators=100, random_state=0) +clf.fit(features_train, labels_train) +print("Training Time:", round(time()-t0, 3), "s") + +acc = accuracy_score(clf.predict(features_test), labels_test) + +print("Metrics Accuracy:", round(acc, 3)) + + +######################################################### +from sklearn.neighbors import KNeighborsClassifier + +# Create a KNeighbors Classifier (KNN) object +t0 = time() +clf = KNeighborsClassifier(n_neighbors=3) +clf.fit(features_train, labels_train) +print("Training Time:", round(time()-t0, 3), "s") + +acc = accuracy_score(clf.predict(features_test), labels_test) +print("Metrics Accuracy:", round(acc, 3)) + + +######################################################### + +from sklearn.ensemble import RandomForestClassifier + +# Create a RandomForest Classifier (RF) object +t0 = time() +clf = RandomForestClassifier(n_estimators=100, random_state=0) +clf.fit(features_train, labels_train) +print("Training Time:", round(time()-t0, 3), "s") + +acc = accuracy_score(clf.predict(features_test), labels_test) +print("Metrics Accuracy:", round(acc, 3)) From ee90879c3a9042ed66b80dc4b5b77059f0489ead Mon Sep 17 00:00:00 2001 From: Ahmed Kadry <37486894+Dramkadry@users.noreply.github.com> Date: Sat, 13 Apr 2024 04:42:39 +0200 Subject: [PATCH 07/10] data exploration --- datasets_questions/explore_enron_data.py | 56 +++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/datasets_questions/explore_enron_data.py b/datasets_questions/explore_enron_data.py index 853767a5e0..6acfffdc40 100755 --- a/datasets_questions/explore_enron_data.py +++ b/datasets_questions/explore_enron_data.py @@ -15,7 +15,61 @@ """ +from tkinter.font import names import joblib -enron_data = joblib.load(open("../final_project/final_project_dataset.pkl", "rb")) +enron_data = joblib.load(open("./final_project/final_project_dataset.pkl", "rb")) +# Print the first 5 items in the enron_data dictionary +for i, (key, value) in enumerate(enron_data.items()): + if i >= 5: + break + print(f"{key}: {value}\n") + +# Print the number of data points (people) in the dataset +print(f"Number of data points: {len(enron_data)}") +# Print the number of features for each person in the dataset +print(f"Number of features: {len(list(enron_data.values())[0])}") +# Count the number of POIs in the dataset +print(f"Number of POIs: {sum([1 for person in enron_data.values() if person['poi']== 1])}") # == 1, == True, or simply `if person['poi']` all work here +# form the list of POI names 
in /final_project/poi_names.txt and print the number of POIs +poi_names = [name for name in open("./final_project/poi_names.txt").read().split("\n") if name not in ["", "http://usatoday30.usatoday.com/money/industries/energy/2005-12-28-enron-participants_x.htm"]] +print(f"Number of POIs: {len(poi_names)}") +print(poi_names) + +# List of features for each person in the dataset +first_person_features = list(enron_data.values())[0] +# List of names in the dataset +names_enron_data = list(enron_data.keys()) + + +# Print the total value of the stock belonging to James Prentice +stock_features = [feature for feature in first_person_features.keys() if 'stock' in feature.lower() and 'total' in feature.lower()] +james_prentice_name = [name for name in names_enron_data if 'james' in name.lower() and 'prentice' in name.lower()] +# print(f"Stock features: {stock_features}") +# print(f"Name of the person: {james_prentice_name}") + +# Print the total value of the stock belonging to James Prentice +print(f"Total stock value of James Prentice: {enron_data[james_prentice_name[0]][stock_features[0]]}") + + +# Find Wesley Colwell's entry in the dataset +wesley_colwell_name = [name for name in names_enron_data if 'wesley' in name.lower() and 'colwell' in name.lower()] +# print(f"Name of the person: {wesley_colwell_name}") +# Print Number of emails sent from Wesley Colwell to POIs +print(f"Number of emails from Wesley Colwell to POIs: {enron_data[wesley_colwell_name[0]]['from_this_person_to_poi']}") + + +# Print the value of stock options belonging to Jeffrey K Skillin +jeffrey_skillin_name = [name for name in names_enron_data if 'jeffrey' in name.lower() and 'skillin' in name.lower()] +# print(f"Name of the person: {jeffrey_skillin_name}") + +# Print the value of stock options belonging to Jeffrey K Skillin +print(f"The value of stock options belonging to Jeffrey K Skillin: {enron_data[jeffrey_skillin_name[0]]['exercised_stock_options']}") + +# Print the value of total payments to Lay, Skilling and Fastow +Lay_Skilling_Fastow_names = [name for name in names_enron_data if 'jeffrey' in name.lower() and 'lay' in name.lower() and 'fastow' in name.lower()] +print(f"Name of the person: {Lay_Skilling_Fastow_names}") + +# Print the value of stock options belonging to Jeffrey K Skillin +print(f"The value of stock options belonging to Jeffrey K Skillin: {enron_data[jeffrey_skillin_name[0]]['exercised_stock_options']}") From 2a521c53b09dffef89c54e42e17a21bb93f6ab9d Mon Sep 17 00:00:00 2001 From: Ahmed Kadry <37486894+Dramkadry@users.noreply.github.com> Date: Sat, 13 Apr 2024 04:51:44 +0200 Subject: [PATCH 08/10] Add unit test configuration to settings.json --- .vscode/settings.json | 11 ++++++++++- datasets_questions/explore_enron_data.py | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 62f621ee4f..2c746de878 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,4 +1,13 @@ { "python.analysis.autoImportCompletions": true, - "python.analysis.typeCheckingMode": "basic" + "python.analysis.typeCheckingMode": "basic", + "python.testing.unittestArgs": [ + "-v", + "-s", + ".", + "-p", + "*test.py" + ], + "python.testing.pytestEnabled": false, + "python.testing.unittestEnabled": true } \ No newline at end of file diff --git a/datasets_questions/explore_enron_data.py b/datasets_questions/explore_enron_data.py index 6acfffdc40..daada5d57f 100755 --- a/datasets_questions/explore_enron_data.py +++ 
b/datasets_questions/explore_enron_data.py @@ -68,7 +68,7 @@ print(f"The value of stock options belonging to Jeffrey K Skillin: {enron_data[jeffrey_skillin_name[0]]['exercised_stock_options']}") # Print the value of total payments to Lay, Skilling and Fastow -Lay_Skilling_Fastow_names = [name for name in names_enron_data if 'jeffrey' in name.lower() and 'lay' in name.lower() and 'fastow' in name.lower()] +Lay_Skilling_Fastow_names = [name for name in names_enron_data if 'jeffrey' in name.lower() or 'lay' in name.lower() or 'fastow' in name.lower()] print(f"Name of the person: {Lay_Skilling_Fastow_names}") # Print the value of stock options belonging to Jeffrey K Skillin From 6acf2e7141f708cc6e6bee4d0f89053854c8763e Mon Sep 17 00:00:00 2001 From: Ahmed Kadry <37486894+Dramkadry@users.noreply.github.com> Date: Sat, 13 Apr 2024 06:08:37 +0200 Subject: [PATCH 09/10] no settings.json --- .vscode/settings.json | 13 ------------- datasets_questions/explore_enron_data.py | 10 ++++++---- 2 files changed, 6 insertions(+), 17 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 2c746de878..0000000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "python.analysis.autoImportCompletions": true, - "python.analysis.typeCheckingMode": "basic", - "python.testing.unittestArgs": [ - "-v", - "-s", - ".", - "-p", - "*test.py" - ], - "python.testing.pytestEnabled": false, - "python.testing.unittestEnabled": true -} \ No newline at end of file diff --git a/datasets_questions/explore_enron_data.py b/datasets_questions/explore_enron_data.py index daada5d57f..0b70090cbf 100755 --- a/datasets_questions/explore_enron_data.py +++ b/datasets_questions/explore_enron_data.py @@ -68,8 +68,10 @@ print(f"The value of stock options belonging to Jeffrey K Skillin: {enron_data[jeffrey_skillin_name[0]]['exercised_stock_options']}") # Print the value of total payments to Lay, Skilling and Fastow -Lay_Skilling_Fastow_names = [name for name in names_enron_data if 'jeffrey' in name.lower() or 'lay' in name.lower() or 'fastow' in name.lower()] +# Lay_Skilling_Fastow_names = [name for name in names_enron_data if 'jeffrey' in name.lower() or 'lay' in name.lower() or 'fastow' in name.lower()] +Lay_Skilling_Fastow_names = ['LAY KENNETH L', 'FASTOW ANDREW S', 'SKILLING JEFFREY K'] print(f"Name of the person: {Lay_Skilling_Fastow_names}") - -# Print the value of stock options belonging to Jeffrey K Skillin -print(f"The value of stock options belonging to Jeffrey K Skillin: {enron_data[jeffrey_skillin_name[0]]['exercised_stock_options']}") +total_payments = {name: enron_data[name]['total_payments'] for name in Lay_Skilling_Fastow_names} +# Print the name of the person with the highest total payments +max_total_payments = max(total_payments, key=total_payments.get) +print(f"Name of the person with the highest total payments: {max_total_payments}") From 480c928301cade5b1d15d415c9d00e15041b59d4 Mon Sep 17 00:00:00 2001 From: Ahmed Kadry <37486894+Dramkadry@users.noreply.github.com> Date: Tue, 23 Apr 2024 17:40:44 +0200 Subject: [PATCH 10/10] Update explore_enron_data.py --- datasets_questions/explore_enron_data.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/datasets_questions/explore_enron_data.py b/datasets_questions/explore_enron_data.py index 0b70090cbf..128cb2a692 100755 --- a/datasets_questions/explore_enron_data.py +++ b/datasets_questions/explore_enron_data.py @@ -75,3 +75,14 @@ # Print the name of the person 
with the highest total payments max_total_payments = max(total_payments, key=total_payments.get) print(f"Name of the person with the highest total payments: {max_total_payments}") + +# Count the people with a quantified salary and the people with a known email address +people_with_quantified_salary = [person for person in enron_data.values() if person['salary'] != 'NaN'] +people_with_known_emails = [person for person in enron_data.values() if person['email_address'] != 'NaN'] + +print('Number of persons with a quantified salary:', len(people_with_quantified_salary)) +print('Number of persons with a known email address:', len(people_with_known_emails)) + +import sys; sys.path.append("./tools/") +from feature_format import featureFormat +enron_data_array = featureFormat(enron_data, first_person_features)