From 80c568b2c0113c137f58afe69e66430741d4beb3 Mon Sep 17 00:00:00 2001 From: Felix Eberhardt Date: Wed, 25 Oct 2017 12:59:57 -0400 Subject: [PATCH 1/2] completed --- learning_curve.py | 34 +++++++++++++++++----------------- questions.txt | 11 +++++++++++ test.py | 10 ++++++++++ 3 files changed, 38 insertions(+), 17 deletions(-) create mode 100644 questions.txt create mode 100644 test.py diff --git a/learning_curve.py b/learning_curve.py index 2baa81b..3539cb4 100644 --- a/learning_curve.py +++ b/learning_curve.py @@ -1,12 +1,7 @@ -"""Explore learning curves for classification of handwritten digits""" - -import matplotlib.pyplot as plt -import numpy from sklearn.datasets import * -from sklearn.model_selection import train_test_split +from sklearn.cross_validation import train_test_split from sklearn.linear_model import LogisticRegression - def display_digits(): """Read in the 8x8 pictures of numbers and display 10 of them""" digits = load_digits() @@ -21,7 +16,7 @@ def display_digits(): def train_model(): """Train a model on pictures of digits. - + Read in 8x8 pictures of numbers and evaluate the accuracy of the model when different percentages of the data are used as training data. This function plots the average accuracy of the model as a function of the percent of data @@ -32,14 +27,19 @@ def train_model(): train_percentages = range(5, 95, 5) test_accuracies = numpy.zeros(len(train_percentages)) - # train models with training percentages between 5 and 90 (see - # train_percentages) and evaluate the resultant accuracy for each. - # You should repeat each training percentage num_trials times to smooth out - # variability. - # For consistency with the previous example use - # model = LogisticRegression(C=10**-10) for your learner - - # TODO: your code here + for i in train_percentages: + # repeat each value of train_size 10 times to smooth out variability + score_test = 0 + for k in range(num_trials): + X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=i/100) + model = LogisticRegression(C=10**-10) + model.fit(X_train, y_train) + score_test += model.score(X_test, y_test) +# print(i, k, model.score(X_test, y_test)) + accuracy_test = score_test/10 +# print(accuracy_test) + h = int(i/5-1) + test_accuracies[h]=accuracy_test fig = plt.figure() plt.plot(train_percentages, test_accuracies) @@ -50,5 +50,5 @@ def train_model(): if __name__ == "__main__": # Feel free to comment/uncomment as needed - display_digits() - # train_model() +# display_digits() + train_model() diff --git a/questions.txt b/questions.txt new file mode 100644 index 0000000..ad0a9e6 --- /dev/null +++ b/questions.txt @@ -0,0 +1,11 @@ +What is the general trend in the curve? + + +Are there parts of the curve that appear to be noisier than others? Why? + + +How many trials do you need to get a smooth curve? + + +Try different values for C (by changing LogisticRegression(C=10** -10)). What happens? If you want to know why this happens, see this Wikipedia page as well as the documentation for LogisticRegression in scikit-learn. + diff --git a/test.py b/test.py new file mode 100644 index 0000000..7c45a5a --- /dev/null +++ b/test.py @@ -0,0 +1,10 @@ +from sklearn.datasets import * +from sklearn.cross_validation import train_test_split +from sklearn.linear_model import LogisticRegression + +data = load_digits() +X_train, X_test, y_train, y_test = train_test_split(data.data, data.target,train_size=0.5) +model = LogisticRegression(C=10**-10) +model.fit(X_train, y_train) +print("Train accuracy %f" %model.score(X_train, y_train)) +print("Test accuracy %f"%model.score(X_test, y_test)) From 60d21ed32234b4246c711c41d0069bfaac722e4d Mon Sep 17 00:00:00 2001 From: flxbrhrdt <31519444+flxbrhrdt@users.noreply.github.com> Date: Thu, 2 Nov 2017 00:43:56 -0400 Subject: [PATCH 2/2] Completed --- learning_curve.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/learning_curve.py b/learning_curve.py index 3539cb4..a5e8ede 100644 --- a/learning_curve.py +++ b/learning_curve.py @@ -1,6 +1,8 @@ +import numpy from sklearn.datasets import * from sklearn.cross_validation import train_test_split from sklearn.linear_model import LogisticRegression +import matplotlib.pyplot as plt def display_digits(): """Read in the 8x8 pictures of numbers and display 10 of them"""