diff --git a/learning_curve.py b/learning_curve.py
index 2baa81b..26aba48 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -21,14 +21,14 @@ def display_digits():
 
 def train_model():
     """Train a model on pictures of digits.
-    
+
     Read in 8x8 pictures of numbers and evaluate the accuracy of the model
     when different percentages of the data are used as training data. This function
     plots the average accuracy of the model as a function of the percent of data
     used to train it.
     """
     data = load_digits()
-    num_trials = 10
+    num_trials = 50
     train_percentages = range(5, 95, 5)
     test_accuracies = numpy.zeros(len(train_percentages))
 
@@ -39,7 +39,17 @@ def train_model():
 
     # For consistency with the previous example use
    # model = LogisticRegression(C=10**-10) for your learner
-    # TODO: your code here
+    # Average the test accuracy over num_trials random train/test splits
+    # at each training percentage.
+    for i, p in enumerate(train_percentages):
+        accuracies_for_p = []
+        for trial in range(num_trials):
+            X_train, X_test, y_train, y_test = train_test_split(
+                data.data, data.target, train_size=p / 100)
+            model = LogisticRegression(C=10**-10)
+            model.fit(X_train, y_train)
+            accuracies_for_p.append(model.score(X_test, y_test))
+        test_accuracies[i] = numpy.mean(accuracies_for_p)
 
     fig = plt.figure()
     plt.plot(train_percentages, test_accuracies)
@@ -50,5 +60,5 @@
 
 if __name__ == "__main__":
     # Feel free to comment/uncomment as needed
-    display_digits()
-    # train_model()
+    # display_digits()
+    train_model()
diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..44ba6cd
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,84 @@
+1. What is the general trend in the curve?
+
+The curve trends upward roughly linearly: the more data used to train the
+model, the more accurately it classifies the remaining data.
+
+2. Are there parts of the curve that appear to be noisier than others? Why?
+
+Lower training percentages tend to be noisier. Two small subsets of the data
+are far less likely to resemble each other than two subsets that each contain
+a majority of the data. The larger subsets are far more likely to capture the
+same overall patterns, and thus give a more consistent accuracy.
+
+3. How many trials do you need to get a smooth curve?
+
+Somewhere between 50 and 100 trials is the minimum needed to produce a smooth
+line. Because the training percentages are sampled coarsely (every 5%), slight
+variations in accuracy will nearly always give the curve a choppy appearance.
+
+4. Try different values for C (by changing LogisticRegression(C=10**-10)). What
+happens? If you want to know why this happens, see this Wikipedia page as well
+as the documentation for LogisticRegression in scikit-learn.
+
+Increasing C increases the accuracy of the model at the cost of additional
+computation time. C = 10^-2 achieves 88% accuracy with just 5% of the data and
+gives a peak accuracy over 96%! Note that a C value above 10^-10 also makes
+the learning curve begin to look logistic, leveling off rather than climbing
+steadily.
+
+C is the inverse of the regularization strength. Smaller values give stronger
+regularization, a process of introducing additional information to prevent
+overfitting. It makes sense, then, that a larger C requires less data to reach
+high accuracy, since the weaker regularization lets the model fit the training
+data much more closely.
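+
+To see the effect described in question 4 directly, the sketch below sweeps a
+few values of C and plots one learning curve per value. This is illustrative
+rather than part of the submitted code; it assumes a recent scikit-learn,
+where train_test_split lives in sklearn.model_selection (older releases kept
+it in sklearn.cross_validation).
+
+    from sklearn.datasets import load_digits
+    from sklearn.linear_model import LogisticRegression
+    from sklearn.model_selection import train_test_split
+    import matplotlib.pyplot as plt
+    import numpy
+
+    data = load_digits()
+    train_percentages = range(5, 95, 5)
+    for C in [10**-10, 10**-4, 10**-2, 1]:
+        accuracies = []
+        for p in train_percentages:
+            trial_scores = []
+            # Fewer trials than train_model uses, to keep the sweep fast;
+            # larger C values may emit convergence warnings, harmless here.
+            for trial in range(10):
+                X_train, X_test, y_train, y_test = train_test_split(
+                    data.data, data.target, train_size=p / 100)
+                model = LogisticRegression(C=C)
+                model.fit(X_train, y_train)
+                trial_scores.append(model.score(X_test, y_test))
+            accuracies.append(numpy.mean(trial_scores))
+        plt.plot(train_percentages, accuracies, label="C = %g" % C)
+    plt.xlabel("Percent of data used for training")
+    plt.ylabel("Accuracy on test set")
+    plt.legend()
+    plt.show()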
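+
+The noise described in question 2 can also be quantified rather than
+eyeballed. This sketch reuses the setup from train_model (data = load_digits()
+plus the same scikit-learn and numpy imports) and prints the standard
+deviation of the per-trial accuracies at a low and a high training
+percentage; the low percentage should show a noticeably larger spread:
+
+    for p in [5, 85]:
+        scores = []
+        for trial in range(50):
+            X_train, X_test, y_train, y_test = train_test_split(
+                data.data, data.target, train_size=p / 100)
+            model = LogisticRegression(C=10**-10)
+            model.fit(X_train, y_train)
+            scores.append(model.score(X_test, y_test))
+        # The standard deviation across trials measures the curve's noisiness.
+        print("train %d%%: mean %.3f, std %.3f"
+              % (p, numpy.mean(scores), numpy.std(scores)))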