9 changes: 7 additions & 2 deletions README.md
@@ -1,3 +1,8 @@
# TextMining
This is the base repo for the text mining and analysis project for Software Design at Olin College.

## Required Libraries
This code relies on a small set of Python libraries: `requests`, `bs4` (BeautifulSoup), and `lxml` need to be installed, for example with `pip install requests beautifulsoup4 lxml`, while `sys`, `string`, `random`, `urllib`, `glob`, and `os` are part of the Python standard library.

## Running the Code
Run fetch_files.py first; it creates a folder in your workspace called `religious documents` containing on the order of 1000 religious text files. You only need to run this once. Then run mining.py to randomly generate text from the downloaded corpus.
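
Both steps can also be driven from a single Python session; the snippet below is a minimal sketch and assumes the two scripts sit in the current working directory:

```python
import runpy

# download the corpus into 'religious documents/' (only needed once)
runpy.run_path('fetch_files.py', run_name='__main__')

# build the Markov model from the downloaded files and print random text
runpy.run_path('mining.py', run_name='__main__')
```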
16,632 changes: 16,632 additions & 0 deletions emma.txt


76 changes: 76 additions & 0 deletions fetch_files.py
@@ -0,0 +1,76 @@
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import urllib.request
import os


def find_all_religions(url):
    """Returns a list of URLs for all listed religions."""
    url_string = requests.get(url)
    soup = BeautifulSoup(url_string.text, 'lxml')  # parse the HTML as a string

    a_tags = soup.find_all('a')
    links = [urljoin(url, a['href']) for a in a_tags]

    # keep only the first sixteen links
    links = links[:16]
    return links

def find_text_files(url):
    """Returns a list of text-file URLs found on a single religion page."""
    url_string = requests.get(url)
    soup = BeautifulSoup(url_string.text, 'lxml')  # parse the HTML as a string

    a_tags = soup.find_all('a')

    url += '/'
    files = [urljoin(url, a['href']) for a in a_tags]

    # keep only links that point at .txt files
    texts = []
    for a in files:
        if str(a).endswith('.txt'):
            texts.append(a)
    return texts


def get_file_name(url):
    """Returns the file name portion (everything after the last '/') of a URL."""
    for i in range(len(url)):
        x = len(url) - 1 - i
        if url[x] == "/":
            return url[x+1:]

def get_content(url):
    """Saves the content at url to a text file in the folder 'religious documents'."""
    data = urllib.request.urlopen(url)
    path = 'religious documents'

    if not os.path.exists(path):
        os.makedirs(path)

    file_name = get_file_name(url)

    # the with-block closes the file automatically
    with open(os.path.join(path, file_name), 'wb') as temp_file:
        for line in data:
            temp_file.write(line)

def download_files(links):
    """Saves the text file behind each link to the 'religious documents' folder."""
    for t in links:
        get_content(t)


if __name__ == '__main__':
    url = 'http://textfiles.com/occult/'
    religions = find_all_religions(url)
    text_urls = []
    for r in religions:
        for i in find_text_files(r):
            text_urls.append(i)
    download_files(text_urls)
36 changes: 36 additions & 0 deletions get_content.py
@@ -0,0 +1,36 @@
import urllib.request
import os

def get_file_name(url):
    """Returns the file name portion (everything after the last '/') of a URL."""
    for i in range(len(url)):
        x = len(url) - 1 - i
        if url[x] == "/":
            return url[x+1:]

def get_content(url):
    """Saves the content at url to a text file in the folder 'religious documents'."""
    print(url)
    data = urllib.request.urlopen(url)  # a file-like object that can be iterated line by line

    path = 'religious documents'

    if not os.path.exists(path):
        os.makedirs(path)

    file_name = get_file_name(url)

    # the with-block closes the file automatically
    with open(os.path.join(path, file_name), 'wb') as temp_file:
        for line in data:
            temp_file.write(line)

if __name__ == '__main__':
    url = 'http://textfiles.com/occult/ATHEISM/aaffirmative.txt'
    print(get_file_name(url))
    get_content(url)
132 changes: 132 additions & 0 deletions mining.py
@@ -0,0 +1,132 @@
from __future__ import print_function, division

import random
import glob

# global variables
suffix_map = {}  # map from each prefix tuple to a list of possible suffixes,
                 # i.e. {prefix_tuple: [suffix1, suffix2, ..., suffixn]}
prefix = ()      # current tuple of words
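# For example, after analysing the text "the quick brown fox" with order 2, the
# map contains {('the', 'quick'): ['brown'], ('quick', 'brown'): ['fox']}
# (an illustrative sketch of the structure, not output from a real run).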

def get_book(file_name):
    """Opens the file at file_name and loads it.

    returns: list of raw (bytes) lines of the file
    """
    with open(file_name, 'rb') as f:
        lines = f.readlines()
    return lines

def process_file(f, order):
    """Reads the lines of a file and performs Markov analysis.

    f: list of lines
    order: integer number of words in the prefix

    Builds the module-level map from prefix to list of possible suffixes.
    """
    for line in f:
        for word in line.rstrip().split():
            process_word(word, order)

def clean_words(word):
    """Returns False for words that should be skipped: words containing errant
    punctuation or digits, all-caps words longer than three characters
    (character names), and bare numbers."""
    punctuation = ['[', ']', "'", ":", "@", "*", "/", "(", ")",
                   '1', '2', '3', '4', '5', '6', '7', '8', '9']
    for letter in word:
        if letter in punctuation:
            return False
    # remove character names (all-caps words)
    if word == word.upper() and len(word) > 3:
        return False
    # remove bare numbers
    try:
        int(word)
        return False
    except ValueError:
        return True
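# Illustrative behaviour (not output from a real run): clean_words('hello')
# returns True, while clean_words('NARRATOR') and clean_words('(3:16)') return False.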

def process_word(word, order):
    """Processes one word, mapping the current prefix to all of its observed suffixes."""
    global prefix

    # the words come from a binary file read, so decode them first
    try:
        word = word.decode("utf-8")
    except UnicodeDecodeError:
        return

    # skip words that clean_words rejects
    if not clean_words(word):
        return

    # build up the first prefix until it reaches the requested order
    if len(prefix) < order:
        prefix += (word,)
        return

    # check if the current prefix already has a chain started
    try:
        suffix_map[prefix].append(word)
    except KeyError:
        # if not, start one
        suffix_map[prefix] = [word]

    prefix = shift(prefix, word)

def random_text(n):
    """Generates n random words from the analyzed text.

    Starts with a random prefix from the dictionary.

    n: number of words to generate
    """
    # choose a random prefix (not weighted by frequency)
    start = random.choice(list(suffix_map.keys()))

    for i in range(n):
        suffixes = suffix_map.get(start, None)
        if suffixes is None:
            # if the prefix isn't in the map, we reached the end of the
            # original text, so start again with a new random prefix
            random_text(n - i)
            return

        # choose a random suffix
        word = random.choice(suffixes)
        print(word, end=' ')
        start = shift(start, word)

def shift(t, word):
    """Forms a new tuple by removing the head and adding word to the tail.

    t: tuple of strings
    word: string

    Returns: tuple of strings
    """
    return t[1:] + (word,)
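# For example, shift(('the', 'quick'), 'brown') returns ('quick', 'brown').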

def main(files, n, order):
    n = int(n)
    order = int(order)
    for t in files:
        f = get_book(t)
        process_file(f, order)
    random_text(n)
    print()


if __name__ == '__main__':
    files = glob.glob('religious documents/*.txt')
    # files = ['emma.txt']

    main(files, 500, 3)