From b7865b77eb66b5b9b01a23afc3e9645438717692 Mon Sep 17 00:00:00 2001 From: Bradley Anderson Date: Thu, 4 Sep 2014 09:44:36 -0500 Subject: [PATCH] Adding solution to Project Gutenberg exercise; Jeffrey Penkar and I worked on this project together using one set of code, consulting each other. --- data/stopwords.txt | 2 +- gutenberg.rb | 2 +- lib/complex_predictor.rb | 30 ++++++++++++++++++++++++++---- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/data/stopwords.txt b/data/stopwords.txt index 7336c7c..f4a7dff 100644 --- a/data/stopwords.txt +++ b/data/stopwords.txt @@ -1 +1 @@ -a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,make,being,through,here,way,true,see,time,those,place,much,without,body,whole,another,thus,set,new,given,both,above,well,part,between,end,order,each,form,gutenberg 
+light,because,been,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,in,into,just,least,like,likely,might,most,must,my,neither,often,on,only,or,other,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,being,through,here,true,see,time,those,place,much,without,body,whole,another,thus,set,given,both,above,well,part,between,end,order,each,form,gutenberg,project,should,cannot,things,without,character,little,before,called,further,together,through,therefore,certain,between,saying,themselves,things,without,answer,person,another,though,brought,neither,whether,nothing,people,number,within,houses,species,inasmuch,number,perhaps,others,indeed,having,second,course,during,different,nearly,twenty,already,possible,always,actually,others,become,really,whatsoever,question,toward,coming,purpose,present,personal,friends,friend,behind,places,picture,precious,example,action,destroy,filled,stated,surely,office,wherefore,namely,spoken,speaks,mearly,remember,individual,simply,wilderness,contrary,greater,change,necessary,broken,suppose,relation,men,man,many,feet,small,distance,observations,footnote,found,illustration,large,stone,years,world,nature,miles,itself,earth,sense,hello,planet,about,house,ancient,works,under,three,after,thing,theory,times,against,death,spirit,christ, \ No newline at end of file diff --git a/gutenberg.rb b/gutenberg.rb index 84d20f6..f28f5d0 100644 --- a/gutenberg.rb +++ b/gutenberg.rb @@ -23,6 +23,6 @@ def run!(predictor_klass, opts={}) puts "Accuracy: #{accuracy}" end -run!(SimplePredictor) +# run!(SimplePredictor) run!(ComplexPredictor, debug: true) diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index b8921f3..829528b 100644 --- 
a/lib/complex_predictor.rb
+++ b/lib/complex_predictor.rb
@@ -1,4 +1,13 @@
 require_relative 'predictor'
 
 class ComplexPredictor < Predictor
+  # Number of most frequent words kept as keywords for each category.
+  KEYWORDS_PER_CATEGORY = 7
+
+  # Only words with at least this many characters are counted in training.
+  MIN_WORD_LENGTH = 5
+
+  # Number of leading tokens of a book that predict examines.
+  SAMPLE_SIZE = 18_001
+
   # Public: Trains the predictor on books in our dataset. This method is called
@@ -7,6 +16,20 @@ class ComplexPredictor < Predictor
   # Returns nothing.
   def train!
     @data = {}
+    @all_books.each do |category, books|
+      counts = Hash.new(0)
+      books.each do |_filename, tokens|
+        tokens.each do |word|
+          counts[word] += 1 if word.length >= MIN_WORD_LENGTH && good_token?(word)
+        end
+      end
+      # sort_by is ascending, so the most frequent words are at the end;
+      # last(n) returns fewer entries instead of nil for a small vocabulary.
+      keywords = counts.sort_by { |_word, count| count }
+                       .last(KEYWORDS_PER_CATEGORY)
+                       .map(&:first)
+      @data[category] = { keys: keywords }
+    end
   end
 
   # Public: Predicts category.
@@ -15,8 +38,18 @@ def train!
   #
-  # Returns a category.
+  # Returns the category whose keywords occur most often among the first
+  # SAMPLE_SIZE tokens, or nil when no keyword occurs at all.
   def predict(tokens)
-    # Always predict astronomy, for now.
-    :astronomy
+    votes = Hash.new(0)
+    tokens.first(SAMPLE_SIZE).each do |word|
+      next unless good_token?(word)
+
+      @data.each do |category, model|
+        votes[category] += 1 if model[:keys].include?(word)
+      end
+    end
+    # max_by returns nil for an empty hash, i.e. when no keyword was seen.
+    winner = votes.max_by { |_category, count| count }
+    winner && winner.first
   end
 end
 