From b89af7e0880d5da4443c15928c112f7fe99c9670 Mon Sep 17 00:00:00 2001
From: Manish
Date: Wed, 6 May 2020 03:04:09 +1000
Subject: [PATCH 1/4] Workaround for SSLError while downloading dataset

Work around the weak DH key used by www.cs.cmu.edu. The selected cipher
is supported by the server and has no known security flaw. The
workaround only runs when the user actually encounters the error, since
older systems may not raise it yet may also lack support for the
workaround cipher. Re-raise the error if it is not exactly the one we
are looking for, to avoid hiding any other possible bug.
---
 tools/startup.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/tools/startup.py b/tools/startup.py
index 4638e0d115e..c3d2bc521ab 100644
--- a/tools/startup.py
+++ b/tools/startup.py
@@ -32,7 +32,22 @@
 print "download will complete at about 423 MB"
 import urllib
 
 url = "https://www.cs.cmu.edu/~./enron/enron_mail_20150507.tar.gz"
-urllib.urlretrieve(url, filename="../enron_mail_20150507.tar.gz")
+filename = "../enron_mail_20150507.tar.gz"
+try:
+    urllib.urlretrieve(url, filename=filename)
+except IOError as socket_error:
+    expected_error = (
+        "IOError('socket error', SSLError(1, u'[SSL: DH_KEY_TOO_SMALL]"+
+        " dh key too small (_ssl.c:727)'))"
+    )
+    if repr(socket_error) == expected_error:
+        import ssl
+        cipher = "ECDHE-RSA-AES128-GCM-SHA256"
+        context = ssl.create_default_context()
+        context.set_ciphers(cipher)
+        urllib.urlretrieve(url, filename=filename, context=context)
+    else:
+        raise socket_error
 
 print "download complete!"
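
Note on PATCH 1/4: below is a minimal sketch (not part of the patch) for
probing up front whether the workaround cipher is available locally,
which is the concern the commit message raises about older systems. It
assumes Python 2.7.9 or later, where ssl.create_default_context()
exists; SSLContext.set_ciphers() raises ssl.SSLError when no matching
cipher can be selected.

    import ssl

    # Probe whether the workaround cipher from the patch is available
    # locally; set_ciphers raises SSLError when nothing matches.
    try:
        probe = ssl.create_default_context()
        probe.set_ciphers("ECDHE-RSA-AES128-GCM-SHA256")
        print "workaround cipher available"
    except ssl.SSLError:
        print "workaround cipher not supported on this system"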

From 738d07f9a65381937210f0e9dba254ef4ce98629 Mon Sep 17 00:00:00 2001
From: Manish
Date: Sun, 14 Jun 2020 23:25:14 +1000
Subject: [PATCH 2/4] Replace deprecated module

---
 outliers/outlier_removal_regression.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/outliers/outlier_removal_regression.py b/outliers/outlier_removal_regression.py
index d509cd9f22f..789eb7c139a 100644
--- a/outliers/outlier_removal_regression.py
+++ b/outliers/outlier_removal_regression.py
@@ -20,7 +20,7 @@
 ### and n_columns is the number of features
 ages = numpy.reshape( numpy.array(ages), (len(ages), 1))
 net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1))
-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(ages, net_worths, test_size=0.1, random_state=42)
 
 ### fill in a regression here!  Name the regression object reg so that

From f5c5e6c7096d8346f5bfb6729abd1c153da05539 Mon Sep 17 00:00:00 2001
From: Manish
Date: Tue, 16 Jun 2020 19:45:19 +1000
Subject: [PATCH 3/4] Replace deprecated module

---
 feature_selection/find_signature.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/feature_selection/find_signature.py b/feature_selection/find_signature.py
index c01a1f2111a..4e63f4d2967 100644
--- a/feature_selection/find_signature.py
+++ b/feature_selection/find_signature.py
@@ -19,8 +19,8 @@
 ### remainder go into training)
 ### feature matrices changed to dense representations for compatibility with
 ### classifier functions in versions 0.15.2 and earlier
-from sklearn import cross_validation
-features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)
+from sklearn import model_selection
+features_train, features_test, labels_train, labels_test = model_selection.train_test_split(word_data, authors, test_size=0.1, random_state=42)
 
 from sklearn.feature_extraction.text import TfidfVectorizer
 vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,

From fe4f3145330fe244e644ed5e86fdb698a08bbefd Mon Sep 17 00:00:00 2001
From: Manish
Date: Tue, 16 Jun 2020 22:37:54 +1000
Subject: [PATCH 4/4] Fix deprecated modules

---
 pca/eigenfaces.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/pca/eigenfaces.py b/pca/eigenfaces.py
index 074b860a253..b9ad1ccb9f2 100644
--- a/pca/eigenfaces.py
+++ b/pca/eigenfaces.py
@@ -23,12 +23,12 @@
 import pylab as pl
 import numpy as np
 
-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 from sklearn.datasets import fetch_lfw_people
-from sklearn.grid_search import GridSearchCV
+from sklearn.model_selection import GridSearchCV
 from sklearn.metrics import classification_report
 from sklearn.metrics import confusion_matrix
-from sklearn.decomposition import RandomizedPCA
+from sklearn.decomposition import PCA
 from sklearn.svm import SVC
 
 # Display progress logs on stdout
@@ -70,7 +70,10 @@
 
 print "Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])
 t0 = time()
-pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
+pca = PCA(
+    n_components=n_components,
+    svd_solver='randomized',
+    whiten=True).fit(X_train)
 print "done in %0.3fs" % (time() - t0)
 
 eigenfaces = pca.components_.reshape((n_components, h, w))
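
Note on PATCHES 2-4: all three follow the scikit-learn 0.18+ migration,
in which cross_validation and grid_search were merged into
model_selection and RandomizedPCA was folded into PCA via
svd_solver='randomized'. Below is a minimal smoke test of the
replacement imports, assuming scikit-learn 0.18 or later; the data is
an arbitrary stand-in, not the course datasets.

    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.decomposition import PCA

    # Arbitrary data, just to exercise the replacement APIs end to end.
    X = np.random.rand(20, 5)
    X_train, X_test = train_test_split(X, test_size=0.1, random_state=42)

    # PCA with svd_solver='randomized' is the drop-in for RandomizedPCA.
    pca = PCA(n_components=3, svd_solver='randomized', whiten=True).fit(X_train)
    print pca.components_.shape  # prints (3, 5): one row per component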