diff --git a/learning_curve.py b/learning_curve.py index 2baa81b..50471a7 100644 --- a/learning_curve.py +++ b/learning_curve.py @@ -21,15 +21,15 @@ def display_digits(): def train_model(): """Train a model on pictures of digits. - + Read in 8x8 pictures of numbers and evaluate the accuracy of the model when different percentages of the data are used as training data. This function plots the average accuracy of the model as a function of the percent of data used to train it. """ data = load_digits() - num_trials = 10 - train_percentages = range(5, 95, 5) + num_trials = 30 + train_percentages = list(range(5, 95, 5)) test_accuracies = numpy.zeros(len(train_percentages)) # train models with training percentages between 5 and 90 (see @@ -39,7 +39,16 @@ def train_model(): # For consistency with the previous example use # model = LogisticRegression(C=10**-10) for your learner - # TODO: your code here + for i in range(0, len(train_percentages)): + score_sum = 0 + for n in range(0, num_trials): + x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, + train_size = train_percentages[i]/100) + model = LogisticRegression(C=10**-1) + model.fit(x_train, y_train) + score_sum += model.score(x_test, y_test) + score = score_sum / num_trials + test_accuracies[i] = score fig = plt.figure() plt.plot(train_percentages, test_accuracies) @@ -47,8 +56,8 @@ plt.ylabel('Accuracy on Test Set') plt.show() - if __name__ == "__main__": # Feel free to comment/uncomment as needed - display_digits() - # train_model() + #display_digits() + train_model() + #be_model() diff --git a/questions.txt b/questions.txt new file mode 100644 index 0000000..054252c --- /dev/null +++ b/questions.txt @@ -0,0 +1,33 @@ +1. What is the general trend in the curve? + +Looks to be a generally linear positive trend + + +2. Are there parts of the curve that appear to be noisier than others? Why? + +The bottom parts of the curve (with small train sets).
This is because the model +has very little information to pick from when trying to classify the larger test +sets. Because of this, there is a certain amount of variable guessing going on. +This guessing will inherently vary from model to model, which is why we need to +run a large number of trials to determine how the model will perform with these +small training sets. + +3. How many trials do you need to get a smooth curve? + +Statistically, 30 is seen as a good number. When I run learning_curve.py with +num_trials = 30 though, there's still a good amount of noise going on. By +incrementally increasing num_trials, I've found that a very smooth curve arises +when looking at num_trials = 100. Obviously, the more trials, the smoother the +curve. + + +4. Try different values for C (by changing LogisticRegression(C=10** -10)). What +happens? If you want to know why this happens, see this Wikipedia page as well as +the documentation for LogisticRegression in scikit-learn. + +Increasing C makes the test results much higher, but takes a longer time to run. +From scikit-learn documentation, C seems to be the inverse of regularization strength. +The smaller C is, the stronger the regularization. A strong regularization prevents +the model from overfitting from small training sets. Too strong of a regularization +means underfitting the model, which is characterized by a low score on both the +training and testing sets.