2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
# created by virtualenv automatically

34 changes: 34 additions & 0 deletions app.py
@@ -11,6 +11,8 @@
from audio_utils import convert_audio_to_text
from file_knowledge import FileKnowledge

from crawler import crawl_page

def main():
    load_dotenv()
    st.set_page_config(page_title="Journalist's Toolbox", page_icon=":smiley:")
@@ -68,6 +70,7 @@ def initialize_sidebar(session):
with st.expander("Upload files"):
    process_files("pdf", get_splitter(), session)
    process_files("m4a", get_splitter(), session)
    crawl_site(get_splitter(), session)

st.header("Journalist toolbox")
st.write("Upload your PDF file or audio file")
@@ -95,6 +98,37 @@ def process_files(file_type, splitter, session):

add_document_to_vector_store(file_knowledge)

def crawl_site(splitter, session):
    """
    Ask the user for a URL, let crawl_page fetch the files linked
    from that page, and add each one to the Streamlit session and
    the vector store.
    """
    # user input
    url = st.text_input("scrape a url :articulated_lorry:")
    # try to scrape
    page_data = crawl_page(url)
    # check if we have data
    if page_data:
        # we may have multiple results
        for datum in page_data:
            # buffer the downloaded bytes to a temporary file
            path = '/tmp/' + datum['name']
            with open(path, 'wb') as bufferwb:
                bufferwb.write(datum['content'])
            # reopen for reading and build the knowledge object
            with open(path, 'rb') as buffer:
                try:
                    file_knowledge = FileKnowledge(
                        name=datum['name'],
                        file=buffer,
                        filetype=datum['ext'],
                        splitter=splitter,
                    )
                    # add to session info
                    session[datum['name']] = file_knowledge
                    # add to vector store
                    add_document_to_vector_store(file_knowledge)
                except Exception as e:
                    st.error(f"Could not process {datum['name']}: {e}")
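For context, each datum that crawl_page hands to crawl_site is a dict with the keys sketched below; the values shown are illustrative only, not real data.

    # illustrative shape of one crawl_page result (example values, not real data)
    datum = {
        'name': 'report.pdf',        # filename taken from the link URL
        'content': b'%PDF-1.4 ...',  # raw bytes fetched with requests
        'ext': 'pdf',                # passed to FileKnowledge as filetype
    }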

def get_vector_store():
    if not hasattr(st.session_state, "vector_store"):
        raise ValueError("No vector store found")
64 changes: 64 additions & 0 deletions crawler.py
@@ -0,0 +1,64 @@
import requests
from bs4 import BeautifulSoup

# only these link patterns are supported for now ('download' links are treated as PDFs below)
crawlable_exts = ['pdf', 'download']
# save $$$: cap how many files get downloaded per crawl
limit = 2
def crawl_page(url, file_exts=None, keywords=None):
    """
    Given a web page URL, and optional lists of file extensions and
    keywords, crawl the page for matching file links, download them,
    and return a list of {'name', 'content', 'ext'} dicts.
    """
    # links queued for download
    get_list = []
    # name, content, ext payloads to return
    payload = []
    # no url will be given on initial page load
    if url:
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url
        # get the page
        page = requests.get(url, timeout=30).content
        # parse with Beautiful Soup
        soup = BeautifulSoup(page, 'html.parser')
        # collect the links on the page
        links = soup.find_all('a')
        # keep the ones that look like downloadable files
        for link in links:
            for href_ext in crawlable_exts:
                # check if this link's href matches an acceptable pattern
                if 'href' in link.attrs and href_ext in link.attrs['href']:
                    get_list.append(link.attrs['href'])
                    break  # don't queue the same link twice

        # keep track of how many files have been downloaded
        counter = 0
        # download the acceptable links we've collected
        for link in get_list:
            # file name
            name = link.split('/')[-1]
            # file extension
            ext = name.split('.')[-1]
            # hack TODO use .html or .txt if it's not a supported ext
            if 'download' in link:
                ext = 'pdf'
                link = url + link
                link = link.replace('capitol-breach-cases/usao-dc/', '')
            # binary content
            content = requests.get(link, timeout=30).content
            # collect as a dict
            payload.append({'name': name, 'content': content, 'ext': ext})
            # stop once we hit the download cap
            counter += 1
            if counter == limit:
                break
    # zero, one or more name/content/ext dicts
    return payload[0:limit]  # save $$$
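For reference, a minimal sketch of exercising crawl_page on its own, outside the Streamlit app; the URL is a placeholder and the /tmp destination simply mirrors what app.py does.

    # hypothetical standalone use of crawl_page; example.org is a placeholder URL
    from crawler import crawl_page

    results = crawl_page('https://example.org/reports')
    for datum in results:  # each datum is {'name': ..., 'content': ..., 'ext': ...}
        with open('/tmp/' + datum['name'], 'wb') as fh:
            fh.write(datum['content'])
        print(datum['name'], datum['ext'], len(datum['content']), 'bytes')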