diff --git a/A_Midsummer_Nights_Dream.pickle b/A_Midsummer_Nights_Dream.pickle new file mode 100644 index 0000000..813f6c1 Binary files /dev/null and b/A_Midsummer_Nights_Dream.pickle differ diff --git a/Alls_Well_That_Ends_Well.pickle b/Alls_Well_That_Ends_Well.pickle new file mode 100644 index 0000000..abd192b Binary files /dev/null and b/Alls_Well_That_Ends_Well.pickle differ diff --git a/Antony_and_Cleopatra.pickle b/Antony_and_Cleopatra.pickle new file mode 100644 index 0000000..927d07c Binary files /dev/null and b/Antony_and_Cleopatra.pickle differ diff --git a/Comedy_Of_Errors.pickle b/Comedy_Of_Errors.pickle new file mode 100644 index 0000000..7ec037f Binary files /dev/null and b/Comedy_Of_Errors.pickle differ diff --git a/Coriolanus.pickle b/Coriolanus.pickle new file mode 100644 index 0000000..ed36be7 Binary files /dev/null and b/Coriolanus.pickle differ diff --git a/Cymbeline.pickle b/Cymbeline.pickle new file mode 100644 index 0000000..e10877f Binary files /dev/null and b/Cymbeline.pickle differ diff --git a/Hamlet.pickle b/Hamlet.pickle new file mode 100644 index 0000000..b5780ae Binary files /dev/null and b/Hamlet.pickle differ diff --git a/Julius_Caesar.pickle b/Julius_Caesar.pickle new file mode 100644 index 0000000..df3b547 Binary files /dev/null and b/Julius_Caesar.pickle differ diff --git a/King_Henry_IV.pickle b/King_Henry_IV.pickle new file mode 100644 index 0000000..7486ee3 Binary files /dev/null and b/King_Henry_IV.pickle differ diff --git a/King_John.pickle b/King_John.pickle new file mode 100644 index 0000000..798f9ab Binary files /dev/null and b/King_John.pickle differ diff --git a/King_Richard_II.pickle b/King_Richard_II.pickle new file mode 100644 index 0000000..cd392a9 Binary files /dev/null and b/King_Richard_II.pickle differ diff --git a/Loves_Labours_Lost.pickle b/Loves_Labours_Lost.pickle new file mode 100644 index 0000000..0f53237 Binary files /dev/null and b/Loves_Labours_Lost.pickle differ diff --git a/Measure_for_Measure.pickle 
b/Measure_for_Measure.pickle new file mode 100644 index 0000000..1c536d0 Binary files /dev/null and b/Measure_for_Measure.pickle differ diff --git a/README.md b/README.md index 8cce527..877b8c4 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,12 @@ -# TextMining +To run this program you will need to download the GitHub repository as well as the Natural Language Toolkit (NLTK), the requests package, and the Vader sentiment analysis package. -This is the base repo for the text mining and analysis project for Software Design at Olin College. +The requests package is used to request data from the internet. It can be downloaded by typing pip install requests into the Windows command line. + +The Vader sentiment analysis package contains tools for performing sentiment analysis. It can be downloaded by typing pip install vaderSentiment into the Windows command line. + +The Natural Language Toolkit has a wide variety of tools related to language. To download the NLTK, simply type pip install nltk into your Windows command line. + +After all of the appropriate packages are downloaded, go to the repository and type python minebooks2.py into the command line. 
+To run this program you will need to download the GitHub repository as well as the Natural Language Toolkit (NLTK), the requests package, and the Vader sentiment analysis package. + +The report for this project can be found at https://github.com/hthomas60/TextMining/blob/master/Report2.pdf diff --git a/Report.pdf b/Report.pdf new file mode 100644 index 0000000..8e68444 Binary files /dev/null and b/Report.pdf differ diff --git a/Report2.pdf b/Report2.pdf new file mode 100644 index 0000000..ac6ef58 Binary files /dev/null and b/Report2.pdf differ diff --git a/_config.yml b/_config.yml new file mode 100644 index 0000000..c50ff38 --- /dev/null +++ b/_config.yml @@ -0,0 +1 @@ +theme: jekyll-theme-merlot \ No newline at end of file diff --git a/loadBooks.py b/loadBooks.py new file mode 100644 index 0000000..0deef1d --- /dev/null +++ b/loadBooks.py @@ -0,0 +1,29 @@ +import pickle +import requests +import codecs + +def loadbooks(book_id=1522): + """ + Downloads the plain-text (UTF-8) edition of a book from gutenberg.org and returns it as a string. book_id is the Gutenberg ebook id (default 1522). + """ + downloaded_book = requests.get('http://www.gutenberg.org/ebooks/%s.txt.utf-8' % book_id).text + return downloaded_book + +def savebook(book_text, filename): + """ + Pickles the text of a book into the file named filename. + """ + # the with-block closes the file even if pickle.dump raises + with open(filename, 'wb') as f: + pickle.dump(book_text, f) + +def opensavedbook(file): + """ + Loads and returns the book text pickled in file. Only open pickles you created yourself: unpickling untrusted data can execute arbitrary code. + """ + with open(file, 'rb') as input_file: # with-block guarantees the handle is closed + opened_text = pickle.load(input_file) + return opened_text + + + diff --git a/minebooks2.py b/minebooks2.py new file mode 100644 index 0000000..cd9fff4 --- /dev/null +++ b/minebooks2.py @@ -0,0 +1,186 @@ +from loadBooks import * +import string +from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer +import nltk +from collections import defaultdict #frequwords +import random +import operator + + +def removegutnburg(text): + """ + Removes the Gutenberg license information so that the text can + be analyzed. 
+ + """ + + licence = "ject Gutenberg Association / Carnegie Mellon University" #tail of the last line in the license + bookstart = "by William Shakespeare" #byline that follows each play's title + bookend = "THE END" + start_pos = text.find(licence) + 60 #index 60 characters past the start of the license marker (hard-coded offset); NOTE: find() returns -1 if the marker is missing + start_reading = text.find(bookstart,start_pos) + 22 #22 == len(bookstart), so reading starts right after the first byline found at or after start_pos + end_reading = text.find(bookend,start_reading ) #first THE END after the start of the play; -1 if absent, which makes the slice below drop only the final character + return (text[start_reading:end_reading]) #returns text of the play + + +def loadjustbooks(playfiles): + """ + Takes a list of file names of pickled play texts. + Removes Gutenberg license, newlines and punctuation from each play text. + + Ex. Makes a list of all of the text of the comedic plays + + """ + + plays = [] + + + for i in playfiles: + play = opensavedbook(i) #opens play text file + just_play = removegutnburg(play) #removes Gutenberg license + just_play = RemoveAllButLettersAndSpaces(just_play) #replaces newlines and punctuation with spaces and lower-cases the text + plays.append(just_play) #adds current play to the play list + + return plays #returns list of plays + +def RemoveAllButLettersAndSpaces(mystring): + """ + Replaces newlines and common punctuation with spaces and lower-cases the text. 
+ >>> RemoveAllButLettersAndSpaces("To-be;Or") + 'to be or' + """ + toremove = ['\r', '\n','\'','[', ']','.','?', ';',':','-', '\"',','] + for c in toremove: + mystring = mystring.replace(c, ' ') + + + return mystring.lower() #lower-cases everything so that words like "The" and "the" are counted as the same word + +def RunSentAnalysis(mylist): + """ + Runs VADER sentiment analysis on the text in mylist (callers in this file pass a string) and returns [positive, negative] scores. + """ + res = [] + analyzer = SentimentIntensityAnalyzer() + answer = analyzer.polarity_scores(mylist) + res.append(answer["pos"]) + res.append(answer["neg"]) + return res + + +def Most_Common(text): + """ + takes a string and returns the 25 most common words in the string + """ + top_words = [] + freqwords = defaultdict(int) #word -> number of occurrences; missing words start at 0 + + for words in text.split(): #goes through a list of all the words in the string text + freqwords[words] += 1 #if a word is found add 1 to the counter + sortedwords = sorted(freqwords.items(), key=operator.itemgetter(1), reverse = True) #sorts the (word, count) pairs by count, most frequent first 
+ for word, _count in sortedwords[:25]: #slicing copes with texts that have fewer than 25 distinct words + top_words.append(word) #store the (up to) 25 most common words + return (top_words) #returns a list of the 25 most common words, most frequent first + +def commonOverAll(list1,list2,list3): + """ + returns a list of the words that appear in all three of the given word lists + + """ + + return list(set(list1).intersection(list2, list3)) + +def removewords(words,wordstoremove): + """ + Splits the string words on whitespace and returns a list of its words that are not in wordstoremove + """ + words = words.split() + return ([x for x in words if x not in wordstoremove]) #return elements in words that are not in wordstoremove + +def listtostring(mylist): + """ + converts a list of words to a string of space-separated words + """ + return ' '.join(mylist) + +def linklists(mylist): + """ + Joins a list of strings (one per play) into one large space-separated string + """ + res = [] + for play_text in mylist: + res.append(play_text) + return listtostring(res) + +def sampling(mylist, trials): #averages the [pos, neg] sentiment of `trials` random 10-word samples drawn from mylist + pos = 0 + neg = 0 + for i in range(trials): + sample = listtostring(random.sample(mylist,10)) #needs len(mylist) >= 10 + sentiment = RunSentAnalysis(sample) + pos += sentiment[0] + neg += sentiment[1] + + return[pos/trials, neg/trials] + + +def textmining(): # Main function that runs the textmining code. + """ + 13 of Shakespeare’s plays were saved from gutenberg.org. + I sorted the file names of all of the plays into three lists + comedies, tragedies, and histories. 
+ + """ + + comedies = ['A_Midsummer_Nights_Dream.pickle', 'Alls_Well_That_Ends_Well.pickle'] + tragedies = ['Antony_and_Cleopatra.pickle','Coriolanus.pickle','Cymbeline.pickle'] + histories = ['King_Henry_IV.pickle','King_John.pickle','King_Richard_II.pickle'] + + + colletion = [] #list to store all three types of books + colletion.append(loadjustbooks(comedies)) #loads text from the comedies into the first element + colletion.append(loadjustbooks(tragedies)) #loads text from the tragedies into the second element + colletion.append(loadjustbooks(histories)) #loads text from the histories into the third element + #collection was broken up into comedies, tragedies and + #histories to increase code readability + all_comedies = (linklists(colletion[0])) #combines all of the saved comedies into one string + all_trageties = (linklists(colletion[1])) #combines all of the saved tragedies into one string + all_histories = (linklists(colletion[2])) #combines all of the saved histories into one string + + common_comedies = (Most_Common(all_comedies)) #finds the most common words in Shakespeare’s comedies + common_trageties = (Most_Common(all_trageties)) #finds the most common words in Shakespeare’s tragedies + common_histories = (Most_Common(all_histories)) #finds the most common words in Shakespeare’s histories + + common_words = (commonOverAll(common_comedies,common_trageties, common_histories)) #make a list of the words common to all three play types + + comedy_uncommon = removewords(all_comedies,common_words) #removes the universally common words from the comedic plays + tragety_uncommon = removewords(all_trageties,common_words) #removes the universally common words from the tragic plays + history_uncommon = removewords(all_histories,common_words) #removes the universally common words from the historic plays + + + + + + print("\n") + print("Sentiment Analysis Average of Comedic Plays") + average = sampling(comedy_uncommon,500) #average [pos, neg] sentiment over 500 random samples of the comedies 
+ print (average) + print("Sentiment Analysis of Tragic Plays") + average = sampling(tragety_uncommon,500) #average [pos, neg] sentiment over 500 random samples of the tragedies + print (average) + print("Sentiment Analysis of Historic Plays") + average = sampling(history_uncommon,500) #average [pos, neg] sentiment over 500 random samples of the histories (not the tragedies again) + print (average) + #RunSentAnalysis(tragic_string) + #print("\n") + #print("Sentiment Analysis of Historic Plays") + #RunSentAnalysis(historic_string) + + + + +textmining() +#if __name__ == "__main__": +# import doctest +#doctest.testmod() +