diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0d017ea
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+# created by virtualenv automatically
+
diff --git a/app.py b/app.py
index 7021c1c..07491dd 100644
--- a/app.py
+++ b/app.py
@@ -11,6 +11,8 @@
 from audio_utils import convert_audio_to_text
 from file_knowledge import FileKnowledge
 
+from crawler import crawl_page
+
 def main():
     load_dotenv()
     st.set_page_config(page_title="Journalist's Toolbox", page_icon=":smiley:")
@@ -68,6 +70,7 @@ def initialize_sidebar(session):
     with st.expander("Upload files"):
         process_files("pdf", get_splitter(), session)
         process_files("m4a", get_splitter(), session)
+        crawl_site(get_splitter(),session)
 
     st.header("Journalist toolbox")
     st.write("Upload your PDF file or audio file")
@@ -95,6 +98,37 @@ def process_files(file_type, splitter, session):
             add_document_to_vector_store(file_knowledge)
 
 
+def crawl_site(splitter,session):
+    """
+    user enters a url
+    crawl_page gets files linked from that page
+    and returns them to st session
+    """
+    # user input
+    url = st.text_input("scrape a url :articulated_lorry:")
+    # try to scrape
+    page_data = crawl_page(url)
+    # check if we have data
+    if page_data:
+        # we may have multiple results
+        for datum in page_data:
+            # buffer file
+            bufferwb = open('/tmp/' + datum['name'],'wb')
+            bufferwb.write(datum['content'])
+            bufferwb.close()
+            buffer = open('/tmp/' + datum['name'],'rb')
+            # get file knowledge
+            try:
+                file_knowledge = FileKnowledge(name=datum['name'],file=buffer,filetype=datum['ext'],splitter=splitter)
+                # add to session info
+                session[datum['name']] = file_knowledge
+                # add to vector store
+                add_document_to_vector_store(file_knowledge)
+
+            except Exception as e:
+                print(e)
+                import ipdb; ipdb.set_trace()
+
 def get_vector_store():
     if not hasattr(st.session_state, "vector_store"):
         raise ValueError("No vector store found")
diff --git a/crawler.py b/crawler.py
new file mode 100644
index 0000000..3821de3
--- /dev/null
+++ b/crawler.py
@@ -0,0 +1,64 @@
+import requests
+import streamlit as st
+from bs4 import BeautifulSoup
+
+# we can only support certain filetypes rn
+crawlable_exts = ['pdf','download']
+# save $$$
+limit = 2
+
+
+def crawl_page(url,file_exts=None,keywords=None):
+    """
+    given a web page url -
+    and optional list of file exts and keywords -
+    crawl page for matching files
+    and return download urls
+    """
+    # keep track of files to download
+    get_list = []
+    # return name, content, ext payload list
+    payload = []
+    # no url will be given on page load
+    if url:
+        if 'http' not in url and 'https' not in url:
+            url = 'http://' + url
+        # get page
+        page = requests.get(url).content
+        # parse with beautiful soup
+        soup = BeautifulSoup(page,'html.parser')
+        # get links on page
+        links = soup.find_all('a')
+        # get pdf links
+        for link in links:
+            # check if ext is in accept list
+            for href_ext in crawlable_exts:
+                # check if this link contains href to acceptable ext
+                if 'href' in link.attrs and href_ext in link.attrs['href']:
+                    # if so get it
+                    get_list.append(link.attrs['href'])
+
+        # keep track of how many files you've scraped
+        counter = 0
+        # get the acceptable links we've collected
+        for link in get_list:
+            # file name
+            name = link.split('/')[-1]
+            # file ext
+            ext = name.split('.')[-1]
+            # hack TODO use .html or .txt if it's not a supported ext
+            if 'download' in link:
+                ext = 'pdf'
+                link = url + link
+                link = link.replace('capitol-breach-cases/usao-dc/','')
+            print('zyx',link)
+            # binary content
+            content = requests.get(link).content
+            # return as dict
+            payload.append({'name':name,'content':content,'ext':ext})
+            # increment
+            counter += 1
+            if counter == limit:
+                break
+    # zero, one or more file name, content exts
+    return payload[0:limit] # save $$$
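
A quick way to sanity-check the new crawler outside Streamlit is to call crawl_page directly. A minimal sketch follows, assuming the module layout above; the URL is a placeholder, not one taken from this change:

# sketch: exercise crawl_page standalone; the URL below is a placeholder
from crawler import crawl_page

if __name__ == "__main__":
    # crawl_page returns at most `limit` dicts with 'name', 'content', and 'ext' keys
    for datum in crawl_page("https://example.com/press-releases"):
        print(datum["name"], datum["ext"], len(datum["content"]), "bytes")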
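
crawl_site round-trips every download through /tmp (write, close, reopen) and falls into an ipdb breakpoint on failure. If FileKnowledge accepts any binary file-like object (an assumption; the diff only shows it being handed an open file handle), an in-memory buffer would avoid the temp files and the unclosed read handle. A sketch of a drop-in alternative inside app.py, relying on the imports already present there:

# sketch: in-memory buffers instead of /tmp files, assuming FileKnowledge
# accepts any binary file-like object (not confirmed by this diff)
import io

def crawl_site(splitter, session):
    url = st.text_input("scrape a url :articulated_lorry:")
    for datum in crawl_page(url):
        buffer = io.BytesIO(datum['content'])
        buffer.name = datum['name']  # some loaders look for a .name attribute
        try:
            file_knowledge = FileKnowledge(name=datum['name'], file=buffer,
                                           filetype=datum['ext'], splitter=splitter)
            session[datum['name']] = file_knowledge
            add_document_to_vector_store(file_knowledge)
        except Exception as e:
            # surface the failure in the UI instead of a debugger breakpoint
            st.error(f"Could not index {datum['name']}: {e}")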