From 7da97d88dabb3b06c885f592a5b88fab3126e849 Mon Sep 17 00:00:00 2001 From: Ahmed Kadry <37486894+Dramkadry@users.noreply.github.com> Date: Fri, 22 Mar 2024 22:15:55 +0200 Subject: [PATCH 01/10] Update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 2cfe5168a6..cd05d636d5 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ my_feature_list.pkl .DS_Store __pycache__ venv/ +tools/enron_mail_20150507.tar.gz From a4b340b81202cdd75d2add0a812ae3384d78e67f Mon Sep 17 00:00:00 2001 From: Ahmed Kadry <37486894+Dramkadry@users.noreply.github.com> Date: Tue, 26 Mar 2024 20:18:31 +0200 Subject: [PATCH 02/10] naive bayes and svm solution --- naive_bayes/nb_author_id.py | 23 +++++++- svm/svm_author_id.py | 106 ++++++++++++++++++++++++++++++++++-- tools/email_preprocess.py | 2 +- 3 files changed, 122 insertions(+), 9 deletions(-) diff --git a/naive_bayes/nb_author_id.py b/naive_bayes/nb_author_id.py index 102da2ec13..4ea18e3272 100644 --- a/naive_bayes/nb_author_id.py +++ b/naive_bayes/nb_author_id.py @@ -9,10 +9,10 @@ Sara has label 0 Chris has label 1 """ - + import sys from time import time -sys.path.append("../tools/") +sys.path.append("./tools/") from email_preprocess import preprocess @@ -24,10 +24,27 @@ ############################################################## # Enter Your Code Here +from sklearn.naive_bayes import GaussianNB +t0 = time() +clf = GaussianNB() +clf.fit(features_train, labels_train) +print("Training Time:", round(time()-t0, 3), "s") +t0 = time() +pred = clf.predict(features_test) +print("Predicting Time:", round(time()-t0, 3), "s") + +accuracy = clf.score(features_test, labels_test) + +from sklearn.metrics import accuracy_score +acc = accuracy_score(pred, labels_test) + +print("Accuracy:", round(accuracy, 3)) + +print("Metrics Accuracy:", round(acc, 3)) +######################################### -############################################################## ############################################################## ''' diff --git a/svm/svm_author_id.py b/svm/svm_author_id.py index 35390d60ad..5662f443e4 100644 --- a/svm/svm_author_id.py +++ b/svm/svm_author_id.py @@ -10,7 +10,7 @@ import sys from time import time -sys.path.append("../tools/") +sys.path.append("./tools/") from email_preprocess import preprocess @@ -19,13 +19,35 @@ ### labels_train and labels_test are the corresponding item labels features_train, features_test, labels_train, labels_test = preprocess() - ######################################################### ### your code goes here ### +from sklearn.svm import SVC +from sklearn.metrics import accuracy_score +# Create a Support Vector Classifier (SVC) object with a linear kernel +clf = SVC(kernel='linear') -######################################################### +# Record the start time for training +t0 = time() +clf.fit(features_train, labels_train) +print("Training Time:", round(time()-t0, 3), "s") + +# Record the start time for predicting +t0 = time() +pred = clf.predict(features_test) +print("Predicting Time:", round(time()-t0, 3), "s") + +# Calculate and print the accuracy of the model using the test data +accuracy = clf.score(features_test, labels_test) +print("Accuracy:", round(accuracy, 3)) + +# Calculate the accuracy of the model again using scikit-learn's accuracy_score with the predicted values and the actual labels +acc = accuracy_score(pred, labels_test) +print("Metrics Accuracy:", round(acc, 3)) + + 
+######################################################### +# Training on smaller datasets ######################################################### ''' You'll be Provided similar code in the Quiz But The provided code Has some Issues with Python 3 The Code Below solves that issue, So use this one ''' -# features_train = features_train[:int(len(features_train)/100)] -# labels_train = labels_train[:int(len(labels_train)/100)] +# Reduce the size of the features_train list to 1% of its original size +features_train = features_train[:int(len(features_train)/100)] + +# Reduce the size of the labels_train list to 1% of its original size +labels_train = labels_train[:int(len(labels_train)/100)] + + +# Record the start time for training +t0 = time() +clf.fit(features_train, labels_train) +print("Training Time:", round(time()-t0, 3), "s") + +# Record the start time for predicting +t0 = time() +pred = clf.predict(features_test) +print("Predicting Time:", round(time()-t0, 3), "s") + +# Calculate and print the accuracy of the model using the test data +accuracy = clf.score(features_test, labels_test) +print("Accuracy:", round(accuracy, 3)) + +# Calculate the accuracy of the model again using scikit-learn's accuracy_score with the predicted values and the actual labels +acc = accuracy_score(pred, labels_test) +print("Metrics Accuracy:", round(acc, 3)) ######################################################### +# running the model with an RBF kernel on the small dataset ######################################################## +# remember this model runs on the smaller data set of 1% of the original data set +# Create a Support Vector Classifier (SVC) object with an RBF kernel +clf = SVC(kernel='rbf') + +# Record the start time for training +t0 = time() +clf.fit(features_train, labels_train) +print("Training Time:", round(time()-t0, 3), "s") + +# Record the start time for predicting +t0 = time() +pred = clf.predict(features_test) +print("Predicting Time:", round(time()-t0, 3), "s") + +# Calculate and print the accuracy of the model using the test data +accuracy = clf.score(features_test, labels_test) +print("Accuracy:", round(accuracy, 3)) + +# Calculate the accuracy of the model again using scikit-learn's accuracy_score with the predicted values and the actual labels +acc = accuracy_score(pred, labels_test) +print("Metrics Accuracy:", round(acc, 3)) + +######################################################### +# running the model with an RBF kernel and different C values (10.0, 100.0, 1000.0, and 10000) on the small dataset +######################################################## +# remember this model runs on the smaller data set of 1% of the original data set +# Create a Support Vector Classifier (SVC) object with an RBF kernel for each C value +c_values = [10.0, 100.0, 1000.0, 10000] + +for c in c_values: + clf = SVC(kernel='rbf', C=c) + + # Record the start time for training + t0 = time() + clf.fit(features_train, labels_train) + print("Training Time:", round(time()-t0, 3), "s") + + # Record the start time for predicting + t0 = time() + pred = clf.predict(features_test) + print("Predicting Time:", round(time()-t0, 3), "s") + + # Calculate and print the accuracy of the model using the test data + accuracy = clf.score(features_test, labels_test) + print("Accuracy:", round(accuracy, 3)) + + # Calculate the accuracy of the model again using scikit-learn's accuracy_score with the predicted values and the actual labels + acc = accuracy_score(pred, labels_test) + print("Metrics 
Accuracy:", round(acc, 3)) + diff --git a/tools/email_preprocess.py b/tools/email_preprocess.py index fd83cea3e9..4bdf3043b3 100644 --- a/tools/email_preprocess.py +++ b/tools/email_preprocess.py @@ -8,7 +8,7 @@ from sklearn.feature_selection import SelectPercentile, f_classif -def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"): +def preprocess(words_file = "./tools/word_data.pkl", authors_file="./tools/email_authors.pkl"): """ this function takes a pre-made list of email texts (by default word_data.pkl) and the corresponding authors (by default email_authors.pkl) and performs From 30c7cd0d22dfa0d885080b83726da23283690d15 Mon Sep 17 00:00:00 2001 From: Ahmed Kadry <37486894+Dramkadry@users.noreply.github.com> Date: Sun, 31 Mar 2024 03:25:25 +0200 Subject: [PATCH 03/10] Update dt_author_id.py --- decision_tree/dt_author_id.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/decision_tree/dt_author_id.py b/decision_tree/dt_author_id.py index 006afb8357..fd843efc3b 100644 --- a/decision_tree/dt_author_id.py +++ b/decision_tree/dt_author_id.py @@ -24,8 +24,21 @@ ######################################################### ### your code goes here ### +from sklearn.tree import DecisionTreeClassifier +from sklearn.metrics import accuracy_score +# Create a Decision Tree Classifier (DT) object +t0 = time() +clf = DecisionTreeClassifier(random_state=0, min_samples_split=40) +clf.fit(features_train, labels_train) +print("Training Time:", round(time()-t0, 3), "s") ######################################################### +pred = clf.predict(features_test) +accuracy = clf.score(features_test, labels_test) +acc = accuracy_score(pred, labels_test) + +print("Accuracy:", round(accuracy,3)) +print("Metrics Accuracy:", round(acc, 3)) \ No newline at end of file From 693e558c10ea8f89754052cf551aeed29e5b4c2b Mon Sep 17 00:00:00 2001 From: Ahmed Kadry <37486894+Dramkadry@users.noreply.github.com> Date: Sat, 6 Apr 2024 01:01:22 +0200 Subject: [PATCH 04/10] Create settings.json --- .vscode/settings.json | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000000..62f621ee4f --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,4 @@ +{ + "python.analysis.autoImportCompletions": true, + "python.analysis.typeCheckingMode": "basic" +} \ No newline at end of file From 0679237bfeca6688e411ea133b29571c5f8e4210 Mon Sep 17 00:00:00 2001 From: Ahmed Kadry <37486894+Dramkadry@users.noreply.github.com> Date: Sat, 6 Apr 2024 11:48:03 +0200 Subject: [PATCH 05/10] Update email_preprocess.py --- tools/email_preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/email_preprocess.py b/tools/email_preprocess.py index 4bdf3043b3..f59d72e7be 100644 --- a/tools/email_preprocess.py +++ b/tools/email_preprocess.py @@ -48,7 +48,7 @@ def preprocess(words_file = "./tools/word_data.pkl", authors_file="./tools/email ### feature selection, because text is super high dimensional and ### can be really computationally chewy as a result - selector = SelectPercentile(f_classif, percentile=10) + selector = SelectPercentile(f_classif, percentile=1) selector.fit(features_train_transformed, labels_train) features_train_transformed = selector.transform(features_train_transformed).toarray() features_test_transformed = selector.transform(features_test_transformed).toarray() From 9807e9521a4981a1a738057733dac123ea2ea496 Mon Sep 17 00:00:00 2001 From: 
Ahmed Kadry <37486894+Dramkadry@users.noreply.github.com> Date: Sat, 6 Apr 2024 13:08:07 +0200 Subject: [PATCH 06/10] Commit --- decision_tree/dt_author_id.py | 46 +++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/decision_tree/dt_author_id.py b/decision_tree/dt_author_id.py index fd843efc3b..5098c9da1a 100644 --- a/decision_tree/dt_author_id.py +++ b/decision_tree/dt_author_id.py @@ -10,7 +10,7 @@ import sys from time import time -sys.path.append("../tools/") +sys.path.append("./tools/") from email_preprocess import preprocess @@ -41,4 +41,46 @@ acc = accuracy_score(pred, labels_test) print("Accuracy:", round(accuracy,3)) -print("Metrics Accuracy:", round(acc, 3)) \ No newline at end of file +print("Metrics Accuracy:", round(acc, 3)) + +######################################################### + +from sklearn.ensemble import AdaBoostClassifier +from sklearn.metrics import accuracy_score + +# Create an AdaBoost Classifier (AB) object +t0 = time() +clf = AdaBoostClassifier(n_estimators=100, random_state=0) +clf.fit(features_train, labels_train) +print("Training Time:", round(time()-t0, 3), "s") + +acc = accuracy_score(clf.predict(features_test), labels_test) + +print("Metrics Accuracy:", round(acc, 3)) + + +######################################################### +from sklearn.neighbors import KNeighborsClassifier + +# Create a KNeighbors Classifier (KNN) object +t0 = time() +clf = KNeighborsClassifier(n_neighbors=3) +clf.fit(features_train, labels_train) +print("Training Time:", round(time()-t0, 3), "s") + +acc = accuracy_score(clf.predict(features_test), labels_test) +print("Metrics Accuracy:", round(acc, 3)) + + +######################################################### + +from sklearn.ensemble import RandomForestClassifier + +# Create a RandomForest Classifier (RF) object +t0 = time() +clf = RandomForestClassifier(n_estimators=100, random_state=0) +clf.fit(features_train, labels_train) +print("Training Time:", round(time()-t0, 3), "s") + +acc = accuracy_score(clf.predict(features_test), labels_test) +print("Metrics Accuracy:", round(acc, 3)) From ee90879c3a9042ed66b80dc4b5b77059f0489ead Mon Sep 17 00:00:00 2001 From: Ahmed Kadry <37486894+Dramkadry@users.noreply.github.com> Date: Sat, 13 Apr 2024 04:42:39 +0200 Subject: [PATCH 07/10] data exploration --- datasets_questions/explore_enron_data.py | 56 +++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/datasets_questions/explore_enron_data.py b/datasets_questions/explore_enron_data.py index 853767a5e0..6acfffdc40 100755 --- a/datasets_questions/explore_enron_data.py +++ b/datasets_questions/explore_enron_data.py @@ -15,7 +15,61 @@ """ +from tkinter.font import names import joblib -enron_data = joblib.load(open("../final_project/final_project_dataset.pkl", "rb")) +enron_data = joblib.load(open("./final_project/final_project_dataset.pkl", "rb")) +# Print the first 5 items in the enron_data dictionary +for i, (key, value) in enumerate(enron_data.items()): + if i >= 5: + break + print(f"{key}: {value}\n") + +# Print the number of data points (people) in the dataset +print(f"Number of data points: {len(enron_data)}") +# Print the number of features for each person in the dataset +print(f"Number of features: {len(list(enron_data.values())[0])}") +# Count the number of POIs in the dataset +print(f"Number of POIs: {sum([1 for person in enron_data.values() if person['poi']== 1])}") # == 1, == True, or simply `if person['poi']` all work here +# form the list of POI names 
in /final_project/poi_names.txt and print the number of POIs +poi_names = [name for name in open("./final_project/poi_names.txt").read().split("\n") if name not in ["", "http://usatoday30.usatoday.com/money/industries/energy/2005-12-28-enron-participants_x.htm"]] +print(f"Number of POIs: {len(poi_names)}") +print(poi_names) + +# List of features for each person in the dataset +first_person_features = list(enron_data.values())[0] +# List of names in the dataset +names_enron_data = list(enron_data.keys()) + + +# Print the total value of the stock belonging to James Prentice +stock_features = [feature for feature in first_person_features.keys() if 'stock' in feature.lower() and 'total' in feature.lower()] +james_prentice_name = [name for name in names_enron_data if 'james' in name.lower() and 'prentice' in name.lower()] +# print(f"Stock features: {stock_features}") +# print(f"Name of the person: {james_prentice_name}") + +# Print the total value of the stock belonging to James Prentice +print(f"Total stock value of James Prentice: {enron_data[james_prentice_name[0]][stock_features[0]]}") + + +# Find Wesley Colwell's entry in the dataset +wesley_colwell_name = [name for name in names_enron_data if 'wesley' in name.lower() and 'colwell' in name.lower()] +# print(f"Name of the person: {wesley_colwell_name}") +# Print Number of emails sent from Wesley Colwell to POIs +print(f"Number of emails from Wesley Colwell to POIs: {enron_data[wesley_colwell_name[0]]['from_this_person_to_poi']}") + + +# Print the value of stock options belonging to Jeffrey K Skillin +jeffrey_skillin_name = [name for name in names_enron_data if 'jeffrey' in name.lower() and 'skillin' in name.lower()] +# print(f"Name of the person: {jeffrey_skillin_name}") + +# Print the value of stock options belonging to Jeffrey K Skillin +print(f"The value of stock options belonging to Jeffrey K Skillin: {enron_data[jeffrey_skillin_name[0]]['exercised_stock_options']}") + +# Print the value of total payments to Lay, Skilling and Fastow +Lay_Skilling_Fastow_names = [name for name in names_enron_data if 'jeffrey' in name.lower() and 'lay' in name.lower() and 'fastow' in name.lower()] +print(f"Name of the person: {Lay_Skilling_Fastow_names}") + +# Print the value of stock options belonging to Jeffrey K Skillin +print(f"The value of stock options belonging to Jeffrey K Skillin: {enron_data[jeffrey_skillin_name[0]]['exercised_stock_options']}") From 2a521c53b09dffef89c54e42e17a21bb93f6ab9d Mon Sep 17 00:00:00 2001 From: Ahmed Kadry <37486894+Dramkadry@users.noreply.github.com> Date: Sat, 13 Apr 2024 04:51:44 +0200 Subject: [PATCH 08/10] Add unit test configuration to settings.json --- .vscode/settings.json | 11 ++++++++++- datasets_questions/explore_enron_data.py | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 62f621ee4f..2c746de878 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,4 +1,13 @@ { "python.analysis.autoImportCompletions": true, - "python.analysis.typeCheckingMode": "basic" + "python.analysis.typeCheckingMode": "basic", + "python.testing.unittestArgs": [ + "-v", + "-s", + ".", + "-p", + "*test.py" + ], + "python.testing.pytestEnabled": false, + "python.testing.unittestEnabled": true } \ No newline at end of file diff --git a/datasets_questions/explore_enron_data.py b/datasets_questions/explore_enron_data.py index 6acfffdc40..daada5d57f 100755 --- a/datasets_questions/explore_enron_data.py +++ 
b/datasets_questions/explore_enron_data.py @@ -68,7 +68,7 @@ print(f"The value of stock options belonging to Jeffrey K Skillin: {enron_data[jeffrey_skillin_name[0]]['exercised_stock_options']}") # Print the value of total payments to Lay, Skilling and Fastow -Lay_Skilling_Fastow_names = [name for name in names_enron_data if 'jeffrey' in name.lower() and 'lay' in name.lower() and 'fastow' in name.lower()] +Lay_Skilling_Fastow_names = [name for name in names_enron_data if 'jeffrey' in name.lower() or 'lay' in name.lower() or 'fastow' in name.lower()] print(f"Name of the person: {Lay_Skilling_Fastow_names}") # Print the value of stock options belonging to Jeffrey K Skillin From 6acf2e7141f708cc6e6bee4d0f89053854c8763e Mon Sep 17 00:00:00 2001 From: Ahmed Kadry <37486894+Dramkadry@users.noreply.github.com> Date: Sat, 13 Apr 2024 06:08:37 +0200 Subject: [PATCH 09/10] no settings.json --- .vscode/settings.json | 13 ------------- datasets_questions/explore_enron_data.py | 10 ++++++---- 2 files changed, 6 insertions(+), 17 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 2c746de878..0000000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "python.analysis.autoImportCompletions": true, - "python.analysis.typeCheckingMode": "basic", - "python.testing.unittestArgs": [ - "-v", - "-s", - ".", - "-p", - "*test.py" - ], - "python.testing.pytestEnabled": false, - "python.testing.unittestEnabled": true -} \ No newline at end of file diff --git a/datasets_questions/explore_enron_data.py b/datasets_questions/explore_enron_data.py index daada5d57f..0b70090cbf 100755 --- a/datasets_questions/explore_enron_data.py +++ b/datasets_questions/explore_enron_data.py @@ -68,8 +68,10 @@ print(f"The value of stock options belonging to Jeffrey K Skillin: {enron_data[jeffrey_skillin_name[0]]['exercised_stock_options']}") # Print the value of total payments to Lay, Skilling and Fastow -Lay_Skilling_Fastow_names = [name for name in names_enron_data if 'jeffrey' in name.lower() or 'lay' in name.lower() or 'fastow' in name.lower()] +# Lay_Skilling_Fastow_names = [name for name in names_enron_data if 'jeffrey' in name.lower() or 'lay' in name.lower() or 'fastow' in name.lower()] +Lay_Skilling_Fastow_names = ['LAY KENNETH L', 'FASTOW ANDREW S', 'SKILLING JEFFREY K'] print(f"Name of the person: {Lay_Skilling_Fastow_names}") - -# Print the value of stock options belonging to Jeffrey K Skillin -print(f"The value of stock options belonging to Jeffrey K Skillin: {enron_data[jeffrey_skillin_name[0]]['exercised_stock_options']}") +total_payments = {name: enron_data[name]['total_payments'] for name in Lay_Skilling_Fastow_names} +# Print the name of the person with the highest total payments +max_total_payments = max(total_payments, key=total_payments.get) +print(f"Name of the person with the highest total payments: {max_total_payments}") From 480c928301cade5b1d15d415c9d00e15041b59d4 Mon Sep 17 00:00:00 2001 From: Ahmed Kadry <37486894+Dramkadry@users.noreply.github.com> Date: Tue, 23 Apr 2024 17:40:44 +0200 Subject: [PATCH 10/10] Update explore_enron_data.py --- datasets_questions/explore_enron_data.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/datasets_questions/explore_enron_data.py b/datasets_questions/explore_enron_data.py index 0b70090cbf..128cb2a692 100755 --- a/datasets_questions/explore_enron_data.py +++ b/datasets_questions/explore_enron_data.py @@ -75,3 +75,14 @@ # Print the name of the person 
with the highest total payments max_total_payments = max(total_payments, key=total_payments.get) print(f"Name of the person with the highest total payments: {max_total_payments}") + +# Count the people with a quantified salary and the people with a known email address +people_with_quantified_salary = [person for person in enron_data.values() if person['salary'] != 'NaN'] +people_with_known_emails = [person for person in enron_data.values() if person['email_address'] != 'NaN'] + +print('Number of persons with a quantified salary:', len(people_with_quantified_salary)) +print('Number of persons with a known email address:', len(people_with_known_emails)) + +import sys; sys.path.append("./tools/") +from feature_format import featureFormat +enron_data_array = featureFormat(enron_data, first_person_features)