From 8abec00875cf864b32ec60d7fae0835947504714 Mon Sep 17 00:00:00 2001 From: "Alexandria A. Johnson" Date: Wed, 3 Sep 2014 11:55:09 -0500 Subject: [PATCH 1/5] Sets up the train method for complex predictor --- lib/complex_predictor.rb | 32 ++++++++++++++++++++++++++++++++ lib/simple_predictor.rb | 1 + 2 files changed, 33 insertions(+) diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index b8921f3..b85aa72 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -5,8 +5,40 @@ class ComplexPredictor < Predictor # before the predict() method is called. # # Returns nothing. + def train! @data = {} + + + @all_books.each do |category, books| + @data[category] = { + words: 0, + books: 0, + top_words: [] #top 10 words + } + } + books.each do |filename, tokens| + @data[category][:words] += tokens.count + @data[category][:books] += 1 + cutoff_val = good_token_count(tokens).values.sort(-10) + @data[category][top_words] = @good_token_count.select {|k,v| v >= cutoff_val}) + end + end + end + end + end + end + + def good_token_count(tokens) + @good_token_count = { }#word1: 70 + tokens.each do |t| + if good_token?(t) && @good_token_count[t] == nil + @good_token_count[t] = 1 + elsif good_token?(t) && @good_token_count[t] + @good_token_count[t] += 1 + end + end + end # Public: Predicts category. diff --git a/lib/simple_predictor.rb b/lib/simple_predictor.rb index 6b93003..72ed759 100644 --- a/lib/simple_predictor.rb +++ b/lib/simple_predictor.rb @@ -18,6 +18,7 @@ def train! # philosophy: { # words: 1000, # books: 10, + ###add good tokens # }, # archeology: { # words: 2000, From c38e27a0d4736e559043cb0b023ab6f3d3ee6b63 Mon Sep 17 00:00:00 2001 From: "Alexandria A. Johnson" Date: Thu, 4 Sep 2014 09:22:11 -0500 Subject: [PATCH 2/5] Sets up predictor method to guess book with 65 percent accuracy --- Gemfile | 5 +++++ gutenberg.rb | 3 ++- lib/complex_predictor.rb | 44 +++++++++++++++++++++++++++++----------- spec/Gemfile | 5 +++++ spec/spec_helper.rb | 4 ++++ 5 files changed, 48 insertions(+), 13 deletions(-) create mode 100644 Gemfile create mode 100644 spec/Gemfile create mode 100644 spec/spec_helper.rb diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..1f47892 --- /dev/null +++ b/Gemfile @@ -0,0 +1,5 @@ +source 'https://rubygems.org' +ruby '2.0.0' + +gem 'rspec', '~> 2.14.1' +gem 'pry-byebug' diff --git a/gutenberg.rb b/gutenberg.rb index 84d20f6..358e770 100644 --- a/gutenberg.rb +++ b/gutenberg.rb @@ -1,5 +1,6 @@ require_relative 'lib/simple_predictor' require_relative 'lib/complex_predictor' +require_relative 'spec/spec_helper.rb' def run!(predictor_klass, opts={}) puts "+----------------------------------------------------+" @@ -15,7 +16,7 @@ def run!(predictor_klass, opts={}) start_time = Time.now predictor.train! puts "Training took #{Time.now - start_time} seconds." - + puts "Predicting..." start_time = Time.now accuracy = predictor.predict_test_set(opts) diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index b85aa72..6b385c4 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -5,7 +5,7 @@ class ComplexPredictor < Predictor # before the predict() method is called. # # Returns nothing. - +attr_accessor :data def train! @data = {} @@ -15,30 +15,29 @@ def train! words: 0, books: 0, top_words: [] #top 10 words - } } books.each do |filename, tokens| @data[category][:words] += tokens.count @data[category][:books] += 1 - cutoff_val = good_token_count(tokens).values.sort(-10) - @data[category][top_words] = @good_token_count.select {|k,v| v >= cutoff_val}) - end - end + cutoff_val = good_token_count(tokens).values.sort[-50] + cutoff_words = @good_token_count.select {|k,v| v >= cutoff_val} + cutoff_words.each do |x| + @data[category][:top_words].push(x) + end end end end - end def good_token_count(tokens) - @good_token_count = { }#word1: 70 + @good_token_count = {} tokens.each do |t| if good_token?(t) && @good_token_count[t] == nil @good_token_count[t] = 1 elsif good_token?(t) && @good_token_count[t] @good_token_count[t] += 1 end - end - + end + @good_token_count end # Public: Predicts category. @@ -48,7 +47,28 @@ def good_token_count(tokens) # Returns a category. def predict(tokens) # Always predict astronomy, for now. - :astronomy - end + # :astronomy + predicted_category = nil + counter = 0 + + predictee_top_words = [] + cutoff_val = good_token_count(tokens).values.sort[-50] + top_words = @good_token_count.select {|k,v| v >= cutoff_val} + top_words.each {|x| predictee_top_words.push(x)} + + @data.each do |category, cat_data| + matching_words = (predictee_top_words & cat_data[:top_words]) + max_matches = matching_words.length + if max_matches >= counter + counter = max_matches + predicted_category = category + end + end + + predicted_category + end + end + + diff --git a/spec/Gemfile b/spec/Gemfile new file mode 100644 index 0000000..1f47892 --- /dev/null +++ b/spec/Gemfile @@ -0,0 +1,5 @@ +source 'https://rubygems.org' +ruby '2.0.0' + +gem 'rspec', '~> 2.14.1' +gem 'pry-byebug' diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb new file mode 100644 index 0000000..a950747 --- /dev/null +++ b/spec/spec_helper.rb @@ -0,0 +1,4 @@ +#require this file in your spec files to help DRY up your tests +require 'rspec' +require 'pry-byebug' +require_relative '../gutenberg.rb' From 9e47698450837dd9fe3ab92de90f0f2aa9cfe572 Mon Sep 17 00:00:00 2001 From: "Alexandria A. Johnson" Date: Thu, 4 Sep 2014 09:49:26 -0500 Subject: [PATCH 3/5] Refactored good_token method --- lib/complex_predictor.rb | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index 6b385c4..9e830c3 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -14,31 +14,33 @@ def train! @data[category] = { words: 0, books: 0, - top_words: [] #top 10 words + top_words: [] } books.each do |filename, tokens| @data[category][:words] += tokens.count @data[category][:books] += 1 - cutoff_val = good_token_count(tokens).values.sort[-50] - cutoff_words = @good_token_count.select {|k,v| v >= cutoff_val} - cutoff_words.each do |x| - @data[category][:top_words].push(x) - end + good_token_count(tokens).each { |x| @data[category][:top_words].push(x) } end end end def good_token_count(tokens) @good_token_count = {} + @top_words = [] tokens.each do |t| if good_token?(t) && @good_token_count[t] == nil @good_token_count[t] = 1 elsif good_token?(t) && @good_token_count[t] @good_token_count[t] += 1 end + end + cutoff_val = @good_token_count.values.sort[-50] + top_words = @good_token_count.select {|k,v| v >= cutoff_val} + top_words.each {|x| @top_words.push(x)} + @top_words end - @good_token_count - end + + # Public: Predicts category. # @@ -51,10 +53,7 @@ def predict(tokens) predicted_category = nil counter = 0 - predictee_top_words = [] - cutoff_val = good_token_count(tokens).values.sort[-50] - top_words = @good_token_count.select {|k,v| v >= cutoff_val} - top_words.each {|x| predictee_top_words.push(x)} + predictee_top_words = good_token_count(tokens) @data.each do |category, cat_data| matching_words = (predictee_top_words & cat_data[:top_words]) @@ -64,7 +63,6 @@ def predict(tokens) predicted_category = category end end - predicted_category end From 42c365887aed4d61699c86ede2c1ece402fd5fc8 Mon Sep 17 00:00:00 2001 From: "Alexandria A. Johnson" Date: Thu, 4 Sep 2014 10:35:03 -0500 Subject: [PATCH 4/5] Sets up a method to pull title and see if the category is in the title --- lib/complex_predictor.rb | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index 9e830c3..3f5e72b 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -20,6 +20,7 @@ def train! @data[category][:words] += tokens.count @data[category][:books] += 1 good_token_count(tokens).each { |x| @data[category][:top_words].push(x) } + tokens.each end end end @@ -39,7 +40,16 @@ def good_token_count(tokens) top_words.each {|x| @top_words.push(x)} @top_words end - + + def find_title(tokens) + title_start = tokens.index("title") + title_end = tokens.index("author") + title = tokens[title_start .. title_end] + title.pop + title.delete("title") + title + end + # Public: Predicts category. @@ -54,6 +64,7 @@ def predict(tokens) counter = 0 predictee_top_words = good_token_count(tokens) + title_test = find_title(tokens) @data.each do |category, cat_data| matching_words = (predictee_top_words & cat_data[:top_words]) @@ -62,7 +73,10 @@ def predict(tokens) counter = max_matches predicted_category = category end - end + if title_test.include?(category.to_s) + predicted_category = category + end + end predicted_category end From bde7fcd979f39daffbb4c8b7ae91f67965f9942b Mon Sep 17 00:00:00 2001 From: "Alexandria A. Johnson" Date: Mon, 8 Sep 2014 17:53:12 -0500 Subject: [PATCH 5/5] Fixes a hash/array error and pushes accuracy to 100 percent --- gutenberg.rb | 1 - lib/complex_predictor.rb | 33 +++++++++++++++++++-------------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/gutenberg.rb b/gutenberg.rb index 358e770..c4c342c 100644 --- a/gutenberg.rb +++ b/gutenberg.rb @@ -1,6 +1,5 @@ require_relative 'lib/simple_predictor' require_relative 'lib/complex_predictor' -require_relative 'spec/spec_helper.rb' def run!(predictor_klass, opts={}) puts "+----------------------------------------------------+" diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index 3f5e72b..7e5286e 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -17,10 +17,15 @@ def train! top_words: [] } books.each do |filename, tokens| + # title = find_title(tokens) + # title.each do |t| + # if good_token?(t) + # @data[category][:top_words].push(t) + # end + # end @data[category][:words] += tokens.count @data[category][:books] += 1 good_token_count(tokens).each { |x| @data[category][:top_words].push(x) } - tokens.each end end end @@ -37,18 +42,17 @@ def good_token_count(tokens) end cutoff_val = @good_token_count.values.sort[-50] top_words = @good_token_count.select {|k,v| v >= cutoff_val} - top_words.each {|x| @top_words.push(x)} + top_words.each {|word, count| @top_words.push(word)} @top_words end - def find_title(tokens) - title_start = tokens.index("title") - title_end = tokens.index("author") - title = tokens[title_start .. title_end] - title.pop - title.delete("title") - title - end + # def find_title(tokens) + # title_start = tokens.index("title") + # title_end = tokens.index("author") + # title = tokens[title_start...title_end] + # title.delete("title") + # title + # end @@ -57,6 +61,7 @@ def find_title(tokens) # tokens - A list of tokens (words). # # Returns a category. + def predict(tokens) # Always predict astronomy, for now. # :astronomy @@ -64,7 +69,7 @@ def predict(tokens) counter = 0 predictee_top_words = good_token_count(tokens) - title_test = find_title(tokens) + # title_test = find_title(tokens) @data.each do |category, cat_data| matching_words = (predictee_top_words & cat_data[:top_words]) @@ -73,9 +78,9 @@ def predict(tokens) counter = max_matches predicted_category = category end - if title_test.include?(category.to_s) - predicted_category = category - end + # if title_test.include?(category.to_s) + # predicted_category = category + # end end predicted_category end