From 80c568b2c0113c137f58afe69e66430741d4beb3 Mon Sep 17 00:00:00 2001
From: Felix Eberhardt <feberhardt@olin.edu>
Date: Wed, 25 Oct 2017 12:59:57 -0400
Subject: [PATCH 1/2] completed

---
 learning_curve.py | 34 +++++++++++++++++-----------------
 questions.txt     | 11 +++++++++++
 test.py           | 10 ++++++++++
 3 files changed, 38 insertions(+), 17 deletions(-)
 create mode 100644 questions.txt
 create mode 100644 test.py

diff --git a/learning_curve.py b/learning_curve.py
index 2baa81b..3539cb4 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -1,12 +1,7 @@
-"""Explore learning curves for classification of handwritten digits"""
-
-import matplotlib.pyplot as plt
-import numpy
 from sklearn.datasets import *
-from sklearn.model_selection import train_test_split
+from sklearn.cross_validation import train_test_split
 from sklearn.linear_model import LogisticRegression
 
-
 def display_digits():
     """Read in the 8x8 pictures of numbers and display 10 of them"""
     digits = load_digits()
@@ -21,7 +16,7 @@ def display_digits():
 
 def train_model():
     """Train a model on pictures of digits.
-    
+
     Read in 8x8 pictures of numbers and evaluate the accuracy of the model
     when different percentages of the data are used as training data. This function
     plots the average accuracy of the model as a function of the percent of data
@@ -32,14 +27,19 @@ def train_model():
     train_percentages = range(5, 95, 5)
     test_accuracies = numpy.zeros(len(train_percentages))
 
-    # train models with training percentages between 5 and 90 (see
-    # train_percentages) and evaluate the resultant accuracy for each.
-    # You should repeat each training percentage num_trials times to smooth out
-    # variability.
-    # For consistency with the previous example use
-    # model = LogisticRegression(C=10**-10) for your learner
-
-    # TODO: your code here
+    # Repeat every training percentage num_trials times and average the test
+    # accuracy so that the plotted curve is less sensitive to any single split.
+    for idx, percent in enumerate(train_percentages):
+        total_score = 0.0
+        for _ in range(num_trials):
+            X_train, X_test, y_train, y_test = train_test_split(
+                data.data, data.target, train_size=percent / 100.0)
+            model = LogisticRegression(C=10**-10)
+            model.fit(X_train, y_train)
+            total_score += model.score(X_test, y_test)
+        # Average over the real trial count (not a hard-coded 10) and store it
+        # by position instead of deriving the index from the percentage value.
+        test_accuracies[idx] = total_score / num_trials
 
     fig = plt.figure()
     plt.plot(train_percentages, test_accuracies)
@@ -50,5 +50,5 @@ def train_model():
 
 if __name__ == "__main__":
     # Feel free to comment/uncomment as needed
-    display_digits()
-    # train_model()
+    # display_digits()
+    train_model()
diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..ad0a9e6
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,11 @@
+What is the general trend in the curve?
+
+
+Are there parts of the curve that appear to be noisier than others? Why?
+
+
+How many trials do you need to get a smooth curve?
+
+
+Try different values for C (by changing LogisticRegression(C=10**-10)). What happens? If you want to know why this happens, see the Wikipedia page on regularization as well as the documentation for LogisticRegression in scikit-learn.
+
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..7c45a5a
--- /dev/null
+++ b/test.py
@@ -0,0 +1,10 @@
+"""Fit logistic regression on a 50/50 split of the digits data and print both accuracies."""
+from sklearn.datasets import load_digits
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in 0.20
+
+data = load_digits()
+X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=0.5)
+model = LogisticRegression(C=10**-10).fit(X_train, y_train)  # fit() returns the estimator itself
+print("Train accuracy %f" % model.score(X_train, y_train))
+print("Test accuracy %f" % model.score(X_test, y_test))

From 60d21ed32234b4246c711c41d0069bfaac722e4d Mon Sep 17 00:00:00 2001
From: flxbrhrdt <31519444+flxbrhrdt@users.noreply.github.com>
Date: Thu, 2 Nov 2017 00:43:56 -0400
Subject: [PATCH 2/2] Completed

---
 learning_curve.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/learning_curve.py b/learning_curve.py
index 3539cb4..a5e8ede 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -1,6 +1,8 @@
+import numpy
 from sklearn.datasets import *
 from sklearn.cross_validation import train_test_split
 from sklearn.linear_model import LogisticRegression
+import matplotlib.pyplot as plt
 
 def display_digits():
     """Read in the 8x8 pictures of numbers and display 10 of them"""