Binary file added graphs/100trials.PNG
Binary file added graphs/10trials.PNG
Binary file added graphs/25trials.PNG
Binary file added graphs/50trials.PNG
25 changes: 13 additions & 12 deletions learning_curve.py
@@ -21,25 +21,26 @@ def display_digits():

def train_model():
    """Train a model on pictures of digits.

    Read in 8x8 pictures of numbers and evaluate the accuracy of the model
    when different percentages of the data are used as training data. This
    function plots the average accuracy of the model as a function of the
    percent of data used to train it.
    """
    data = load_digits()
-    num_trials = 10
+    num_trials = 25
    train_percentages = range(5, 95, 5)
    test_accuracies = numpy.zeros(len(train_percentages))

    # train models with training percentages between 5 and 90 (see
    # train_percentages) and evaluate the resultant accuracy for each.
    # You should repeat each training percentage num_trials times to smooth out
    # variability.
    # For consistency with the previous example use
    # model = LogisticRegression(C=10**-10) for your learner

-    # TODO: your code here
+    for index, percentage in enumerate(train_percentages):
+        total_accuracy = 0
+        for trial in range(num_trials):
+            # Resplit the data on every trial so that averaging smooths
+            # out the variability of any single random split.
+            X_train, X_test, y_train, y_test = train_test_split(
+                data.data, data.target, train_size=percentage * .01)
+            model = LogisticRegression(C=10**-10)
+            model.fit(X_train, y_train)
+            total_accuracy += model.score(X_test, y_test)
+        # Average over trials and report the accuracy as a percentage.
+        test_accuracies[index] = (total_accuracy / num_trials) * 100

    fig = plt.figure()
    plt.plot(train_percentages, test_accuracies)
@@ -50,5 +51,5 @@ def train_model():

if __name__ == "__main__":
    # Feel free to comment/uncomment as needed
-    display_digits()
-    # train_model()
+    # display_digits()
+    train_model()
30 changes: 30 additions & 0 deletions questions.txt
@@ -0,0 +1,30 @@
Write-up:

1. What is the general trend in the curve?

The graphs of accuracy versus training percentage rise approximately
linearly, or with a gradually decaying rate of improvement, as the
percentage of data used for training increases.
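
As a cross-check on this trend (not part of the submitted code), scikit-learn
also ships a learning_curve helper that builds the same kind of curve by
averaging over cross-validation folds instead of repeated random splits; the
cv=5 and the train-size grid below are illustrative choices, not anything
this PR prescribes:

import numpy
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve

digits = load_digits()
# Fractions of the data to train on, mirroring range(5, 95, 5).
sizes = numpy.linspace(0.05, 0.90, 18)
train_sizes, train_scores, test_scores = learning_curve(
    LogisticRegression(C=10**-10), digits.data, digits.target,
    train_sizes=sizes, cv=5)
plt.plot(train_sizes, test_scores.mean(axis=1) * 100)
plt.xlabel('Number of Training Samples')
plt.ylabel('Accuracy on Test Set (%)')
plt.show()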


2. Are there parts of the curve that appear to be noisier than others? Why?

When 10 trials are run, the graph spikes at around 40 and 60 percent
training data and drops off sharply immediately after. With more trials
the middle section still fluctuates, though less. This may be because the
model adapts to specific features of a particular training split and then
has to unlearn them, since those features are not representative of how
the digits actually look.
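
One way to quantify this noisiness (a sketch extending the loop in
learning_curve.py, not part of the PR) is to keep every trial's score and
plot the standard deviation across trials as error bars; the noisy middle
region shows up as larger bars:

import numpy
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

data = load_digits()
num_trials = 10
train_percentages = range(5, 95, 5)
scores = numpy.zeros((len(train_percentages), num_trials))

for index, percentage in enumerate(train_percentages):
    for trial in range(num_trials):
        X_train, X_test, y_train, y_test = train_test_split(
            data.data, data.target, train_size=percentage * .01)
        model = LogisticRegression(C=10**-10)
        model.fit(X_train, y_train)
        scores[index, trial] = model.score(X_test, y_test)

# Mean accuracy with one standard deviation as error bars, in percent.
plt.errorbar(train_percentages, scores.mean(axis=1) * 100,
             yerr=scores.std(axis=1) * 100)
plt.xlabel('Percentage of Data Used for Training')
plt.ylabel('Accuracy on Test Set (%)')
plt.show()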


3. How many trials do you need to get a smooth curve?

50 trials gives a fairly smooth curve, and 100 trials makes it very smooth.
More than 100 trials would smooth it further, but not by much.
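
To make this comparison directly (again a sketch, not part of the PR), the
averaging loop from learning_curve.py can be wrapped in a function and the
resulting curves overlaid for several trial counts:

import numpy
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

data = load_digits()
train_percentages = range(5, 95, 5)

def averaged_curve(num_trials):
    # Same averaging loop as learning_curve.py, parameterized by trials.
    accuracies = numpy.zeros(len(train_percentages))
    for index, percentage in enumerate(train_percentages):
        total_accuracy = 0
        for trial in range(num_trials):
            X_train, X_test, y_train, y_test = train_test_split(
                data.data, data.target, train_size=percentage * .01)
            model = LogisticRegression(C=10**-10)
            model.fit(X_train, y_train)
            total_accuracy += model.score(X_test, y_test)
        accuracies[index] = (total_accuracy / num_trials) * 100
    return accuracies

for num_trials in [10, 25, 50, 100]:
    plt.plot(train_percentages, averaged_curve(num_trials),
             label='%d trials' % num_trials)
plt.xlabel('Percentage of Data Used for Training')
plt.ylabel('Accuracy on Test Set (%)')
plt.legend()
plt.show()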


4. Try different values for C. What happens?

Decreasing the C value makes the curve very erratic and the accuracy drops
off quickly. Increasing the C value makes the curve look more and more like
the graph of a square root while mostly maintaining the accuracy. This
follows from C being the inverse of the regularization strength: a very
small C regularizes so heavily that the model underfits.
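
A hedged sketch of that C sweep (the particular C values are illustrative):
rerun the averaged curve for several regularization strengths and overlay
the results:

import numpy
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

data = load_digits()
num_trials = 25
train_percentages = range(5, 95, 5)

# Note: larger C values may need LogisticRegression's max_iter raised
# to converge without warnings.
for C in [10**-12, 10**-10, 10**-6, 10**-2]:
    accuracies = numpy.zeros(len(train_percentages))
    for index, percentage in enumerate(train_percentages):
        total_accuracy = 0
        for trial in range(num_trials):
            X_train, X_test, y_train, y_test = train_test_split(
                data.data, data.target, train_size=percentage * .01)
            model = LogisticRegression(C=C)
            model.fit(X_train, y_train)
            total_accuracy += model.score(X_test, y_test)
        accuracies[index] = (total_accuracy / num_trials) * 100
    plt.plot(train_percentages, accuracies, label='C = %g' % C)

plt.xlabel('Percentage of Data Used for Training')
plt.ylabel('Accuracy on Test Set (%)')
plt.legend()
plt.show()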