diff --git a/README.md b/README.md
index 8cce527..7d74153 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,21 @@
 # TextMining
-This is the base repo for the text mining and analysis project for Software Design at Olin College.
+## Description
+This project analyzes philosophical texts for linguistic similarity and visualizes their relationships spatially using Metric Multidimensional Scaling (MDS).
+It also includes a Markov text synthesizer that generates a philosophical "maxim" spanning all included schools of thought.
+
+## Getting Started
+
+### Required Packages:
+pip install nltk requests vaderSentiment
+pip install matplotlib scikit-learn scipy
+
+### Usage:
+To run the text analysis, use:
+python text_mining.py
+
+### Existing Files:
+philtexts3.pickle was generated by running python pulltexts.py
+
+## Links
+[Project Reflection](Reflection.pdf)
\ No newline at end of file
diff --git a/Reflection.pdf b/Reflection.pdf
new file mode 100644
index 0000000..484a9f5
Binary files /dev/null and b/Reflection.pdf differ
diff --git a/TextCluster.png b/TextCluster.png
new file mode 100644
index 0000000..79719df
Binary files /dev/null and b/TextCluster.png differ
diff --git a/philtexts3.pickle b/philtexts3.pickle
new file mode 100644
index 0000000..9b4d19a
Binary files /dev/null and b/philtexts3.pickle differ
diff --git a/pulltexts.py b/pulltexts.py
new file mode 100644
index 0000000..db79efa
--- /dev/null
+++ b/pulltexts.py
@@ -0,0 +1,143 @@
+"""
+File: pulltexts.py
+Name: Ava Lakmazaheri
+Date: 10/11/17
+Desc: Load pickled texts from Project Gutenberg and run similarity analysis on them
+"""
+import pickle
+import numpy as np
+import math
+from sklearn.manifold import MDS
+import matplotlib.pyplot as plt
+from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+
+all_names = ['tao', 'analects', 'plato', 'aristotle', 'machiavelli', 'spinoza',
+'locke', 'hume', 'kant', 'marx', 'mill', 'cousin', 'nietzsche']
+
+num = len(all_names)
+
+all_texts = [' '] * num
+
+def clean(text):
+    """
+    Removes header and footer text from a Gutenberg document
+    Input: string
+    Output: string
+    """
+    startidx = text.find(" ***")
+    endidx = text.rfind("*** ")
+    return text[startidx:endidx]
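+
+# Example (illustrative) of the marker lines clean() relies on -- Project
+# Gutenberg e-texts wrap the body of the book like so:
+#   *** START OF THIS PROJECT GUTENBERG EBOOK THE PRINCE ***
+#   ...body of the book...
+#   *** END OF THIS PROJECT GUTENBERG EBOOK THE PRINCE ***
+# clean() slices from the first " ***" to the last "*** ", dropping the
+# boilerplate outside those markers.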
+
+def load_texts(filename):
+    """
+    Loads in all books from a .pickle file and stores each as a string element in a list
+    Input: name of .pickle file as a string
+    Output: list of strings
+    """
+    with open(filename, 'rb') as input_file:
+        reloaded_copy_of_texts = pickle.load(input_file)
+
+    for i in range(num):
+        all_texts[i] = clean(reloaded_copy_of_texts[i])
+
+def histogram(text):
+    """
+    Counts occurrences of each word in text
+    Input: string
+    Output: dict
+    """
+    d = dict()
+
+    # break the giant string of text into a list of words
+    words = text.split()
+    for word in words:
+        d[word] = d.get(word, 0) + 1
+    return d
+
+def all_unique_words(all_texts):
+    """
+    Collects all unique words in all texts provided, to assist with similarity analysis
+    Input: list of strings
+    Output: list of strings
+    """
+    allwords = []
+
+    for text in all_texts:
+        wordlist = text.split()
+        for word in wordlist:
+            if word not in allwords:
+                allwords.append(word)
+    return allwords
+
+def gen_vector(text, wordbank):
+    """
+    Generate an n-dimensional vector of word counts (where n is the total number of unique words)
+    Inputs: string, list of strings
+    Output: list of values (in this case, floats >= 0)
+    """
+    v = []
+    h = histogram(text)
+
+    for word in wordbank:
+        v.append(h.get(word, 0))
+
+    return v
+
+def comp_cos(vec1, vec2):
+    """
+    Compute the cosine similarity between two vectors
+    Inputs: two lists of floats
+    Output: float
+    """
+    dot_product = np.dot(vec1, vec2)
+    norm_1 = np.linalg.norm(vec1)
+    norm_2 = np.linalg.norm(vec2)
+    cos_val = dot_product / (norm_1 * norm_2)
+    if math.isnan(cos_val):
+        cos_val = 0
+    return cos_val
+
+def similarity():
+    """
+    Run linguistic similarity analysis on philosophy texts. Print the raw
+    similarity comparisons and plot their relationships spatially.
+    """
+    wordbank = all_unique_words(all_texts)
+
+    vecs = [[]] * num
+    for i in range(num):
+        vecs[i] = gen_vector(all_texts[i], wordbank)
+
+    sim = np.zeros((num, num))
+    for i in range(num):
+        for j in range(num):
+            sim[i][j] = comp_cos(vecs[i], vecs[j])
+            #print(sim[i][j])
+
+    dissimilarities = 1 - sim
+    coord = MDS(dissimilarity='precomputed').fit_transform(dissimilarities)
+
+    plt.scatter(coord[:,0], coord[:,1])
+
+    # Label the points
+    for i in range(coord.shape[0]):
+        plt.annotate(str(i), (coord[i,:]))
+
+    plt.show()
+
+def sentiment(text):
+    """
+    Run valence sentiment analysis on text
+    Input: string
+    Output: dict
+    """
+    analyzer = SentimentIntensityAnalyzer()
+    f = analyzer.polarity_scores(text)
+    return f
+
+if __name__ == "__main__":
+    load_texts('philtexts2.pickle')
+    similarity()
+    # for i in range(num):
+    #     print(all_names[i])
+    #     print(sentiment(all_texts[i]))
diff --git a/text_mining.py b/text_mining.py
new file mode 100644
index 0000000..7e8362a
--- /dev/null
+++ b/text_mining.py
@@ -0,0 +1,266 @@
+"""
+File: text_mining.py
+Name: Ava Lakmazaheri
+Date: 10/11/17
+Desc: Reads in book texts from a .pickle file and runs word-frequency analysis to determine the cosine similarity between them.
+      Also uses a compilation of all the books for Markov text synthesis!
+"""
+import math
+import random
+import pickle
+import doctest
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.manifold import MDS
+from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+
+# Global variables:
+all_names = ['Lao-Tze', 'Plato', 'Aristotle', 'Machiavelli', 'Spinoza', 'Locke', 'Hume', 'Kant', 'Marx', 'Mill', 'Cousin', 'Nietzsche']
+num = len(all_names)
+all_texts = [' '] * num
+
+suffix_map = {} # map prefixes to lists of suffixes
+prefix = ()     # current prefix window for Markov processing
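+
+# Example (illustrative) of the Markov state built by process_word() below, with order=2:
+# after processing "the quick brown fox jumps", suffix_map holds
+#   {('the', 'quick'): ['brown'], ('quick', 'brown'): ['fox'], ('brown', 'fox'): ['jumps']}
+# i.e. each two-word prefix maps to every word observed to follow it.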
+
+def load_texts(filename):
+    """
+    Load in all books from a .pickle file and store each as a string element in a list.
+
+    Note that instead of returning the list, the function saves it to a global variable.
+
+    Args:
+        filename: name of .pickle file as a string
+    """
+    with open(filename, 'rb') as input_file:
+        reloaded_copy_of_texts = pickle.load(input_file)
+
+    for i in range(num):
+        all_texts[i] = clean(reloaded_copy_of_texts[i])
+
+def clean(text):
+    """
+    Cleans text of Project Gutenberg header and footer
+
+    Args:
+        text: Gutenberg text as a string
+
+    Returns:
+        Book-only text as a string
+    """
+    startidx = text.find(' ***')
+    endidx = text.rfind("*** END")
+    return text[startidx:endidx]
+
+
+def run_similarity():
+    """
+    Run linguistic similarity analysis on philosophy texts and plot their relationships spatially using MDS
+    """
+    wordbank = all_unique_words(all_texts)
+
+    vecs = [[]] * num
+    for i in range(num):
+        vecs[i] = gen_vector(all_texts[i], wordbank)
+
+    sim = np.zeros((num, num))
+    for i in range(num):
+        for j in range(num):
+            sim[i][j] = comp_cos(vecs[i], vecs[j])
+            #print(sim[i][j])
+
+    dissimilarities = 1 - sim
+    coord = MDS(dissimilarity='precomputed').fit_transform(dissimilarities)
+
+    plt.scatter(coord[:,0], coord[:,1])
+
+    # Label the points
+    for i in range(coord.shape[0]):
+        plt.annotate(all_names[i], (coord[i,:]))
+
+    plt.show()
+
+def histogram(text):
+    """
+    Counts occurrences of each word in text
+    Args:
+        text: book text as string
+
+    Returns:
+        dict (key: words in the text, value: frequency of occurrence)
+    """
+    d = dict()
+
+    # break the giant string of text into a list of words
+    words = text.split()
+    for word in words:
+        d[word] = d.get(word, 0) + 1
+    return d
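+
+# Example (illustrative): histogram("to be or not to be")
+# returns {'to': 2, 'be': 2, 'or': 1, 'not': 1}.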
+
+def all_unique_words(all_texts):
+    """
+    Compiles all unique words that appear in all loaded texts
+    Args:
+        all_texts: list of all book-texts as strings
+
+    Returns:
+        List of all unique words (strings) in all loaded texts -- no repeats!
+    """
+    allwords = []
+
+    for text in all_texts:
+        wordlist = text.split()
+        for word in wordlist:
+            if word not in allwords:
+                allwords.append(word)
+    return allwords
+
+def gen_vector(text, wordbank):
+    """
+    Generate an n-dimensional vector for word count (where n is the total number of unique words)
+    Args:
+        text: book text as string
+        wordbank: single list of all unique words in all texts
+
+    Returns:
+        List of frequencies (numbers >= 0)
+    """
+    v = []
+    h = histogram(text)
+
+    for word in wordbank:
+        # If a word does not appear in the text, store it as 0 frequency
+        v.append(h.get(word, 0))
+
+    return v
+
+def comp_cos(vec1, vec2):
+    """
+    Compute the cosine similarity between two vectors
+    Args:
+        vec1: list of floats (word frequencies) for text 1
+        vec2: list of floats (word frequencies) for text 2
+
+    Returns:
+        Cosine similarity as float between texts 1 and 2
+
+    >>> comp_cos([1,0], [-1,0])
+    -1.0
+    """
+    dot_product = np.dot(vec1, vec2)
+    norm_1 = np.linalg.norm(vec1)
+    norm_2 = np.linalg.norm(vec2)
+
+    prod = norm_1 * norm_2
+
+    if prod == 0: # avoid dividing by 0
+        return 0.0
+    return dot_product/prod
+
+
+def run_sentiments(all_texts):
+    """
+    Print names of texts and their corresponding sentiment analysis
+    Args:
+        all_texts: list of all book-texts as strings
+    """
+    for i in range(num):
+        print(all_names[i])
+        print(sentiment(all_texts[i]))
+
+def sentiment(text):
+    """
+    Run valence sentiment analysis on text
+    Args:
+        text: book contents as string
+
+    Returns:
+        dict of negative, neutral, positive, and compound valence scores
+    """
+    analyzer = SentimentIntensityAnalyzer()
+    f = analyzer.polarity_scores(text)
+    return f
+
+
+
+def run_markov(n=100):
+    """
+    Combine all of the books into one giant string. Split it up and process
+    each word individually. Then generate a random string of text using this
+    processing!
+    """
+    concatall = ' '.join(all_texts)
+
+    for word in concatall.rstrip().split():
+        process_word(word)
+
+    gen_text(n)
+
+def process_word(word, order=2):
+    """
+    Take each word and add corresponding entries to the Markov dictionary
+
+    Args:
+        word: string
+        order: integer length of tuple
+
+    """
+    global prefix
+
+    if len(prefix) < order: # keep adding words until order length is fulfilled
+        prefix += (word,)
+        return
+
+    try:
+        suffix_map[prefix].append(word)
+    except KeyError:
+        suffix_map[prefix] = [word] # if there is no entry for this prefix, make one
+
+    prefix = shift(prefix, word) # slide the prefix window forward
+
+def shift(t, word):
+    """
+    Creates a tuple by removing the existing head and adding word to the tail
+
+    Args:
+        t: tuple of strings
+        word: string
+
+    Returns:
+        Tuple of strings
+    """
+    return t[1:] + (word,)
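+
+# Example (illustrative): shift(('quick', 'brown'), 'fox') returns ('brown', 'fox'),
+# sliding the two-word prefix window forward by one word.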
+
+def gen_text(n):
+    """
+    Generate sentence(s) of n random words based on the analyzed text.
+
+    Start with a random prefix from the dictionary!
+
+    Args:
+        n: number of words to generate
+    """
+    # Start with a random prefix
+    start = random.choice(list(suffix_map.keys()))
+
+    # While there are still words to be written...
+    for i in range(n):
+        suffixes = suffix_map.get(start, None) # find an appropriate suffix
+        if suffixes is None:
+            gen_text(n-i) # if the prefix isn't in the map, start again from a new random prefix
+            return
+
+        word = random.choice(suffixes) # choose a random suffix
+        print(word, end=' ') # print the next word
+        start = shift(start, word)
+
+
+if __name__ == "__main__":
+    doctest.testmod()
+
+    load_texts('philtexts3.pickle')
+    run_markov(100)
+    run_similarity()
+
+    # Sentiment analysis didn't end up being as interesting as I hoped, so I left it out of the final analysis!
+    #run_sentiments(all_texts)
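+
+    # For reference, sentiment() returns a VADER polarity-score dict of the form
+    # {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...} (values illustrative).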