From 8abec00875cf864b32ec60d7fae0835947504714 Mon Sep 17 00:00:00 2001
From: "Alexandria A. Johnson" <alexandria.abdallah@gmail.com>
Date: Wed, 3 Sep 2014 11:55:09 -0500
Subject: [PATCH 1/5] Sets up the train method for complex predictor

---
 lib/complex_predictor.rb | 32 ++++++++++++++++++++++++++++++++
 lib/simple_predictor.rb  |  1 +
 2 files changed, 33 insertions(+)

diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb
index b8921f3..b85aa72 100644
--- a/lib/complex_predictor.rb
+++ b/lib/complex_predictor.rb
@@ -5,8 +5,40 @@ class ComplexPredictor < Predictor
   # before the predict() method is called.
   #
   # Returns nothing.
+
   def train!
     @data = {}
+
+
+    @all_books.each do |category, books|
+      @data[category] = {
+        words: 0,
+        books: 0,
+        top_words: [] #top 10 words
+        }
+      }
+      books.each do |filename, tokens|
+        @data[category][:words] += tokens.count
+        @data[category][:books] += 1
+          cutoff_val = good_token_count(tokens).values.sort(-10)
+          @data[category][top_words] = @good_token_count.select {|k,v| v >= cutoff_val})
+        end
+        end
+      end
+    end
+  end
+  end
+
+  def good_token_count(tokens)
+    @good_token_count = { }#word1: 70
+    tokens.each do |t|
+      if good_token?(t) && @good_token_count[t] == nil
+          @good_token_count[t] = 1
+      elsif good_token?(t) && @good_token_count[t]
+          @good_token_count[t] += 1
+      end
+  end
+
   end
 
   # Public: Predicts category.
diff --git a/lib/simple_predictor.rb b/lib/simple_predictor.rb
index 6b93003..72ed759 100644
--- a/lib/simple_predictor.rb
+++ b/lib/simple_predictor.rb
@@ -18,6 +18,7 @@ def train!
     #   philosophy: {
     #     words: 1000,
     #     books: 10,
+    #     # TODO: add good tokens here
     #   },
     #   archeology: {
     #     words: 2000,

From c38e27a0d4736e559043cb0b023ab6f3d3ee6b63 Mon Sep 17 00:00:00 2001
From: "Alexandria A. Johnson" <alexandria.abdallah@gmail.com>
Date: Thu, 4 Sep 2014 09:22:11 -0500
Subject: [PATCH 2/5] Sets up the predict method to guess a book's category
 with 65 percent accuracy

---
 Gemfile                  |  5 +++++
 gutenberg.rb             |  3 ++-
 lib/complex_predictor.rb | 44 +++++++++++++++++++++++++++++-----------
 spec/Gemfile             |  5 +++++
 spec/spec_helper.rb      |  4 ++++
 5 files changed, 48 insertions(+), 13 deletions(-)
 create mode 100644 Gemfile
 create mode 100644 spec/Gemfile
 create mode 100644 spec/spec_helper.rb

diff --git a/Gemfile b/Gemfile
new file mode 100644
index 0000000..1f47892
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,5 @@
+source 'https://rubygems.org'
+ruby '2.0.0'
+
+gem 'rspec', '~> 2.14.1'
+gem 'pry-byebug'
diff --git a/gutenberg.rb b/gutenberg.rb
index 84d20f6..358e770 100644
--- a/gutenberg.rb
+++ b/gutenberg.rb
@@ -1,5 +1,6 @@
 require_relative 'lib/simple_predictor'
 require_relative 'lib/complex_predictor'
+require_relative  'spec/spec_helper.rb'
 
 def run!(predictor_klass, opts={})
   puts "+----------------------------------------------------+"
@@ -15,7 +16,7 @@ def run!(predictor_klass, opts={})
   start_time = Time.now
   predictor.train!
   puts "Training took #{Time.now - start_time} seconds."
-
+  
   puts "Predicting..."
   start_time = Time.now
   accuracy = predictor.predict_test_set(opts)
diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb
index b85aa72..6b385c4 100644
--- a/lib/complex_predictor.rb
+++ b/lib/complex_predictor.rb
@@ -5,7 +5,7 @@ class ComplexPredictor < Predictor
   # before the predict() method is called.
   #
   # Returns nothing.
-
+attr_accessor :data
   def train!
     @data = {}
 
@@ -15,30 +15,29 @@ def train!
         words: 0,
         books: 0,
         top_words: [] #top 10 words
-        }
       }
       books.each do |filename, tokens|
         @data[category][:words] += tokens.count
         @data[category][:books] += 1
-          cutoff_val = good_token_count(tokens).values.sort(-10)
-          @data[category][top_words] = @good_token_count.select {|k,v| v >= cutoff_val})
-        end
-        end
+          cutoff_val = good_token_count(tokens).values.sort[-50]
+        cutoff_words = @good_token_count.select {|k,v| v >= cutoff_val}
+        cutoff_words.each do |x|
+        @data[category][:top_words].push(x)
+      end
       end
     end
   end
-  end
 
   def good_token_count(tokens)
-    @good_token_count = { }#word1: 70
+    @good_token_count = {}
     tokens.each do |t|
       if good_token?(t) && @good_token_count[t] == nil
           @good_token_count[t] = 1
       elsif good_token?(t) && @good_token_count[t]
           @good_token_count[t] += 1
       end
-  end
-
+  end      
+  @good_token_count
   end
 
   # Public: Predicts category.
@@ -48,7 +47,28 @@ def good_token_count(tokens)
   # Returns a category.
   def predict(tokens)
     # Always predict astronomy, for now.
-    :astronomy
-  end
+    # :astronomy
+    predicted_category = nil
+    counter = 0
+
+    predictee_top_words = []
+    cutoff_val = good_token_count(tokens).values.sort[-50]
+    top_words = @good_token_count.select {|k,v| v >= cutoff_val}
+    top_words.each {|x| predictee_top_words.push(x)}
+
+    @data.each do |category, cat_data|
+    matching_words = (predictee_top_words & cat_data[:top_words])
+      max_matches = matching_words.length
+        if max_matches >= counter
+          counter = max_matches
+          predicted_category = category
+      end
+    end
+  
+    predicted_category
+    end
+    
 end
 
+
+
diff --git a/spec/Gemfile b/spec/Gemfile
new file mode 100644
index 0000000..1f47892
--- /dev/null
+++ b/spec/Gemfile
@@ -0,0 +1,5 @@
+source 'https://rubygems.org'
+ruby '2.0.0'
+
+gem 'rspec', '~> 2.14.1'
+gem 'pry-byebug'
diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb
new file mode 100644
index 0000000..a950747
--- /dev/null
+++ b/spec/spec_helper.rb
@@ -0,0 +1,4 @@
+# Require this file in your spec files to help DRY up your tests.
+require 'rspec'
+require 'pry-byebug'
+require_relative '../gutenberg.rb'

From 9e47698450837dd9fe3ab92de90f0f2aa9cfe572 Mon Sep 17 00:00:00 2001
From: "Alexandria A. Johnson" <alexandria.abdallah@gmail.com>
Date: Thu, 4 Sep 2014 09:49:26 -0500
Subject: [PATCH 3/5] Refactored good_token method

---
 lib/complex_predictor.rb | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb
index 6b385c4..9e830c3 100644
--- a/lib/complex_predictor.rb
+++ b/lib/complex_predictor.rb
@@ -14,31 +14,33 @@ def train!
       @data[category] = {
         words: 0,
         books: 0,
-        top_words: [] #top 10 words
+        top_words: [] 
       }
       books.each do |filename, tokens|
         @data[category][:words] += tokens.count
         @data[category][:books] += 1
-          cutoff_val = good_token_count(tokens).values.sort[-50]
-        cutoff_words = @good_token_count.select {|k,v| v >= cutoff_val}
-        cutoff_words.each do |x|
-        @data[category][:top_words].push(x)
-      end
+        good_token_count(tokens).each { |x| @data[category][:top_words].push(x) }
       end
     end
   end
 
   def good_token_count(tokens)
     @good_token_count = {}
+    @top_words = []
     tokens.each do |t|
       if good_token?(t) && @good_token_count[t] == nil
           @good_token_count[t] = 1
       elsif good_token?(t) && @good_token_count[t]
           @good_token_count[t] += 1
       end
+    end
+    cutoff_val = @good_token_count.values.sort[-50]
+    top_words = @good_token_count.select {|k,v| v >= cutoff_val}
+    top_words.each {|x| @top_words.push(x)}  
+    @top_words
   end      
-  @good_token_count
-  end
+    
+
 
   # Public: Predicts category.
   #
@@ -51,10 +53,7 @@ def predict(tokens)
     predicted_category = nil
     counter = 0
 
-    predictee_top_words = []
-    cutoff_val = good_token_count(tokens).values.sort[-50]
-    top_words = @good_token_count.select {|k,v| v >= cutoff_val}
-    top_words.each {|x| predictee_top_words.push(x)}
+    predictee_top_words = good_token_count(tokens)
 
     @data.each do |category, cat_data|
     matching_words = (predictee_top_words & cat_data[:top_words])
@@ -64,7 +63,6 @@ def predict(tokens)
           predicted_category = category
       end
     end
-  
     predicted_category
     end
     

From 42c365887aed4d61699c86ede2c1ece402fd5fc8 Mon Sep 17 00:00:00 2001
From: "Alexandria A. Johnson" <alexandria.abdallah@gmail.com>
Date: Thu, 4 Sep 2014 10:35:03 -0500
Subject: [PATCH 4/5] Sets up a method to pull title and see if the category is
 in the title

---
 lib/complex_predictor.rb | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb
index 9e830c3..3f5e72b 100644
--- a/lib/complex_predictor.rb
+++ b/lib/complex_predictor.rb
@@ -20,6 +20,7 @@ def train!
         @data[category][:words] += tokens.count
         @data[category][:books] += 1
         good_token_count(tokens).each { |x| @data[category][:top_words].push(x) }
+        tokens.each
       end
     end
   end
@@ -39,7 +40,16 @@ def good_token_count(tokens)
     top_words.each {|x| @top_words.push(x)}  
     @top_words
   end      
-    
+
+  def find_title(tokens)
+  title_start = tokens.index("title") 
+  title_end = tokens.index("author")
+  title = tokens[title_start .. title_end]
+  title.pop
+  title.delete("title")
+  title 
+  end   
+ 
 
 
   # Public: Predicts category.
@@ -54,6 +64,7 @@ def predict(tokens)
     counter = 0
 
     predictee_top_words = good_token_count(tokens)
+    title_test = find_title(tokens)
 
     @data.each do |category, cat_data|
     matching_words = (predictee_top_words & cat_data[:top_words])
@@ -62,7 +73,10 @@ def predict(tokens)
           counter = max_matches
           predicted_category = category
       end
-    end
+        if title_test.include?(category.to_s)
+        predicted_category = category
+    end 
+    end    
     predicted_category
     end
     

From bde7fcd979f39daffbb4c8b7ae91f67965f9942b Mon Sep 17 00:00:00 2001
From: "Alexandria A. Johnson" <alexandria.abdallah@gmail.com>
Date: Mon, 8 Sep 2014 17:53:12 -0500
Subject: [PATCH 5/5] Fixes a hash/array error and pushes accuracy to 100
 percent

---
 gutenberg.rb             |  1 -
 lib/complex_predictor.rb | 33 +++++++++++++++++++--------------
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/gutenberg.rb b/gutenberg.rb
index 358e770..c4c342c 100644
--- a/gutenberg.rb
+++ b/gutenberg.rb
@@ -1,6 +1,5 @@
 require_relative 'lib/simple_predictor'
 require_relative 'lib/complex_predictor'
-require_relative  'spec/spec_helper.rb'
 
 def run!(predictor_klass, opts={})
   puts "+----------------------------------------------------+"
diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb
index 3f5e72b..7e5286e 100644
--- a/lib/complex_predictor.rb
+++ b/lib/complex_predictor.rb
@@ -17,10 +17,15 @@ def train!
         top_words: [] 
       }
       books.each do |filename, tokens|
+        # title = find_title(tokens)
+        #   title.each do |t|
+        #     if good_token?(t)
+        #       @data[category][:top_words].push(t)
+        #     end
+        #   end
         @data[category][:words] += tokens.count
         @data[category][:books] += 1
         good_token_count(tokens).each { |x| @data[category][:top_words].push(x) }
-        tokens.each
       end
     end
   end
@@ -37,18 +42,17 @@ def good_token_count(tokens)
     end
     cutoff_val = @good_token_count.values.sort[-50]
     top_words = @good_token_count.select {|k,v| v >= cutoff_val}
-    top_words.each {|x| @top_words.push(x)}  
+    top_words.each {|word, count| @top_words.push(word)}  
     @top_words
   end      
 
-  def find_title(tokens)
-  title_start = tokens.index("title") 
-  title_end = tokens.index("author")
-  title = tokens[title_start .. title_end]
-  title.pop
-  title.delete("title")
-  title 
-  end   
+  # def find_title(tokens)
+  # title_start = tokens.index("title") 
+  # title_end = tokens.index("author")
+  # title = tokens[title_start...title_end]
+  # title.delete("title")
+  # title 
+  # end   
  
 
 
@@ -57,6 +61,7 @@ def find_title(tokens)
   # tokens - A list of tokens (words).
   #
   # Returns a category.
+  
   def predict(tokens)
     # Always predict astronomy, for now.
     # :astronomy
@@ -64,7 +69,7 @@ def predict(tokens)
     counter = 0
 
     predictee_top_words = good_token_count(tokens)
-    title_test = find_title(tokens)
+    # title_test = find_title(tokens)
 
     @data.each do |category, cat_data|
     matching_words = (predictee_top_words & cat_data[:top_words])
@@ -73,9 +78,9 @@ def predict(tokens)
           counter = max_matches
           predicted_category = category
       end
-        if title_test.include?(category.to_s)
-        predicted_category = category
-    end 
+    #     if title_test.include?(category.to_s)
+    #     predicted_category = category
+    # end 
     end    
     predicted_category
     end