diff --git a/graphs/100trials.PNG b/graphs/100trials.PNG new file mode 100644 index 0000000..0d54b4d Binary files /dev/null and b/graphs/100trials.PNG differ diff --git a/graphs/10trials.PNG b/graphs/10trials.PNG new file mode 100644 index 0000000..e91f6ed Binary files /dev/null and b/graphs/10trials.PNG differ diff --git a/graphs/25trials.PNG b/graphs/25trials.PNG new file mode 100644 index 0000000..9034b8b Binary files /dev/null and b/graphs/25trials.PNG differ diff --git a/graphs/50trials.PNG b/graphs/50trials.PNG new file mode 100644 index 0000000..61f115a Binary files /dev/null and b/graphs/50trials.PNG differ diff --git a/learning_curve.py b/learning_curve.py index 2baa81b..e7a72ad 100644 --- a/learning_curve.py +++ b/learning_curve.py @@ -21,25 +21,26 @@ def display_digits(): def train_model(): """Train a model on pictures of digits. - + Read in 8x8 pictures of numbers and evaluate the accuracy of the model when different percentages of the data are used as training data. This function plots the average accuracy of the model as a function of the percent of data used to train it. """ data = load_digits() - num_trials = 10 + num_trials = 25 train_percentages = range(5, 95, 5) test_accuracies = numpy.zeros(len(train_percentages)) - # train models with training percentages between 5 and 90 (see - # train_percentages) and evaluate the resultant accuracy for each. - # You should repeat each training percentage num_trials times to smooth out - # variability. 
- # For consistency with the previous example use - # model = LogisticRegression(C=10**-10) for your learner - - # TODO: your code here + for x,i in enumerate(train_percentages): + variable = 0 + for a in range(num_trials): + X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, + train_size = i*.01) + model = LogisticRegression(C=10**-10) + model.fit(X_train, y_train) + variable+=model.score(X_test, y_test) + test_accuracies[x] = (variable / num_trials) * 100 fig = plt.figure() plt.plot(train_percentages, test_accuracies) @@ -50,5 +51,5 @@ def train_model(): if __name__ == "__main__": # Feel free to comment/uncomment as needed - display_digits() - # train_model() + #display_digits() + train_model() diff --git a/questions.txt b/questions.txt new file mode 100644 index 0000000..7ace05a --- /dev/null +++ b/questions.txt @@ -0,0 +1,30 @@ +Write-up: + +1. What is the general trend in the curve? + + The curves generated by plotting accuracy against the percentage of data used + for training are approximately linear, or show a gradually diminishing gain + in accuracy as the training percentage increases. + + +2. Are there parts of the curve that appear to be noisier than others? Why? + + When 10 trials are run, the graph seems to spike at around 40 and 60 percent + and then drop off sharply immediately afterwards. When the number of trials + is higher, the graph still fluctuates around the middle section. + This might be a result of the algorithm adapting to specific features of the + training data and then having to unlearn them afterwards because they are not + representative of any actual number. + + +3. How many trials do you need to get a smooth curve? + + 50 trials give a pretty smooth curve, but 100 trials make it very smooth. + More than 100 would make it smoother, but not by much. + + +4. Try different values for C. What happens? + + Decreasing the C value makes the line very unpredictable and the accuracy + drops quickly. 
Increasing the C value makes the curve look more and more + like the graph of a square-root function, while the accuracy is mostly maintained.