9 changes: 7 additions & 2 deletions README.md
@@ -1,3 +1,8 @@
# TextMining
This is the base repo for the text mining and analysis project for Software Design at Olin College.

## Required Libraries
This code relies on a small set of Python libraries: `requests`, `bs4` (BeautifulSoup), and `lxml` need to be installed, for example with `pip install requests beautifulsoup4 lxml`, while `sys`, `string`, `random`, `urllib`, `glob`, and `os` are part of the Python standard library.

## Running the Code
Run fetch_files.py first; it creates a folder in your workspace called `religious documents` containing on the order of 1000 religious text files. You only need to run this once. Then run mining.py to randomly generate text from the downloaded corpus.
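
Both steps can also be driven from a single Python session; the snippet below is a minimal sketch and assumes the two scripts sit in the current working directory:

```python
import runpy

# download the corpus into 'religious documents/' (only needed once)
runpy.run_path('fetch_files.py', run_name='__main__')

# build the Markov model from the downloaded files and print random text
runpy.run_path('mining.py', run_name='__main__')
```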
16,632 changes: 16,632 additions & 0 deletions emma.txt


76 changes: 76 additions & 0 deletions fetch_files.py
@@ -0,0 +1,76 @@
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import urllib.request
import os


def find_all_religions(url):
    """Returns a list of URLs for all listed religions."""
    url_string = requests.get(url)
    soup = BeautifulSoup(url_string.text, 'lxml')  # parse the HTML as a string

    a_tags = soup.find_all('a')
    links = [urljoin(url, a['href']) for a in a_tags]

    # keep only the first sixteen links
    links = links[:16]
    return links

def find_text_files(url):
    """Returns a list of text-file URLs found on a single religion page."""
    url_string = requests.get(url)
    soup = BeautifulSoup(url_string.text, 'lxml')  # parse the HTML as a string

    a_tags = soup.find_all('a')

    url += '/'
    files = [urljoin(url, a['href']) for a in a_tags]

    # keep only links that point at .txt files
    texts = []
    for a in files:
        if str(a).endswith('.txt'):
            texts.append(a)
    return texts


def get_file_name(url):
    """Returns the file name portion (everything after the last '/') of a URL."""
    for i in range(len(url)):
        x = len(url) - 1 - i
        if url[x] == "/":
            return url[x+1:]

def get_content(url):
    """Saves the content at url to a text file in the folder 'religious documents'."""
    data = urllib.request.urlopen(url)
    path = 'religious documents'

    if not os.path.exists(path):
        os.makedirs(path)

    file_name = get_file_name(url)

    # the with-block closes the file automatically
    with open(os.path.join(path, file_name), 'wb') as temp_file:
        for line in data:
            temp_file.write(line)

def download_files(links):
    """Saves the text file behind each link to the 'religious documents' folder."""
    for t in links:
        get_content(t)


if __name__ == '__main__':
    url = 'http://textfiles.com/occult/'
    religions = find_all_religions(url)
    text_urls = []
    for r in religions:
        for i in find_text_files(r):
            text_urls.append(i)
    download_files(text_urls)
36 changes: 36 additions & 0 deletions get_content.py
@@ -0,0 +1,36 @@
import urllib.request
import os

def get_file_name(url):
    """Returns the file name portion (everything after the last '/') of a URL."""
    for i in range(len(url)):
        x = len(url) - 1 - i
        if url[x] == "/":
            return url[x+1:]

def get_content(url):
    """Saves the content at url to a text file in the folder 'religious documents'."""
    print(url)
    data = urllib.request.urlopen(url)  # a file-like object that can be iterated line by line

    path = 'religious documents'

    if not os.path.exists(path):
        os.makedirs(path)

    file_name = get_file_name(url)

    # the with-block closes the file automatically
    with open(os.path.join(path, file_name), 'wb') as temp_file:
        for line in data:
            temp_file.write(line)

if __name__ == '__main__':
    url = 'http://textfiles.com/occult/ATHEISM/aaffirmative.txt'
    print(get_file_name(url))
    get_content(url)
132 changes: 132 additions & 0 deletions mining.py
@@ -0,0 +1,132 @@
from __future__ import print_function, division

import random
import glob

# global variables
suffix_map = {}  # map from each prefix tuple to a list of possible suffixes,
                 # i.e. {prefix_tuple: [suffix1, suffix2, ..., suffixn]}
prefix = ()      # current tuple of words
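# For example, after analysing the text "the quick brown fox" with order 2, the
# map contains {('the', 'quick'): ['brown'], ('quick', 'brown'): ['fox']}
# (an illustrative sketch of the structure, not output from a real run).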

def get_book(file_name):
    """Opens the file at file_name and loads it.

    returns: list of raw (bytes) lines of the file
    """
    with open(file_name, 'rb') as f:
        lines = f.readlines()
    return lines

def process_file(f, order):
    """Reads the lines of a file and performs Markov analysis.

    f: list of lines
    order: integer number of words in the prefix

    Builds the module-level map from prefix to list of possible suffixes.
    """
    for line in f:
        for word in line.rstrip().split():
            process_word(word, order)

def clean_words(word):
    """Returns False for words that should be skipped: words containing errant
    punctuation or digits, all-caps words longer than three characters
    (character names), and bare numbers."""
    punctuation = ['[', ']', "'", ":", "@", "*", "/", "(", ")",
                   '1', '2', '3', '4', '5', '6', '7', '8', '9']
    for letter in word:
        if letter in punctuation:
            return False
    # remove character names (all-caps words)
    if word == word.upper() and len(word) > 3:
        return False
    # remove bare numbers
    try:
        int(word)
        return False
    except ValueError:
        return True
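# Illustrative behaviour (not output from a real run): clean_words('hello')
# returns True, while clean_words('NARRATOR') and clean_words('(3:16)') return False.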

def process_word(word, order):
    """Processes one word, mapping the current prefix to all of its observed suffixes."""
    global prefix

    # the words come from a binary file read, so decode them first
    try:
        word = word.decode("utf-8")
    except UnicodeDecodeError:
        return

    # skip words that clean_words rejects
    if not clean_words(word):
        return

    # build up the first prefix until it reaches the requested order
    if len(prefix) < order:
        prefix += (word,)
        return

    # check if the current prefix already has a chain started
    try:
        suffix_map[prefix].append(word)
    except KeyError:
        # if not, start one
        suffix_map[prefix] = [word]

    prefix = shift(prefix, word)

def random_text(n):
    """Generates n random words from the analyzed text.

    Starts with a random prefix from the dictionary.

    n: number of words to generate
    """
    # choose a random prefix (not weighted by frequency)
    start = random.choice(list(suffix_map.keys()))

    for i in range(n):
        suffixes = suffix_map.get(start, None)
        if suffixes is None:
            # if the prefix isn't in the map, we reached the end of the
            # original text, so start again with a new random prefix
            random_text(n - i)
            return

        # choose a random suffix
        word = random.choice(suffixes)
        print(word, end=' ')
        start = shift(start, word)

def shift(t, word):
    """Forms a new tuple by removing the head and adding word to the tail.

    t: tuple of strings
    word: string

    Returns: tuple of strings
    """
    return t[1:] + (word,)
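# For example, shift(('the', 'quick'), 'brown') returns ('quick', 'brown').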

def main(files, n, order):
    n = int(n)
    order = int(order)
    for t in files:
        f = get_book(t)
        process_file(f, order)
    random_text(n)
    print()


if __name__ == '__main__':
    files = glob.glob('religious documents/*.txt')
    # files = ['emma.txt']

    main(files, 500, 3)