diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6769e21
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
\ No newline at end of file
diff --git a/README.md b/README.md
index e1423aa..96b194c 100644
--- a/README.md
+++ b/README.md
@@ -7,3 +7,11 @@ Harnessing cutting-edge artificial intelligence, QuickTrace accelerates the inve
 providing journalists with lightning-fast access to critical information.
 Maximize efficiency, uncover the truth, and elevate your investigations with QuickTrace—the
 trusted companion of every journalist committed to impactful and comprehensive reporting.
+
+## Connecting to Google Drive
+Enable the Google Drive API by following the instructions [here](https://developers.google.com/drive/api/quickstart/python), then save the resulting `credentials.json` file in the top-level directory of this repository.
+
+To download all files from your Google Drive account so they can be uploaded to QuickTrace,
+run `python google_drive.py`.
+
+To search for files of a specific type, pass a MIME type to `search_filetype`, e.g. `python -c "from google_drive import search_filetype; search_filetype('application/pdf')"`.
diff --git a/app.py b/app.py
index 7021c1c..61d888e 100644
--- a/app.py
+++ b/app.py
@@ -7,6 +7,7 @@
 from langchain.chat_models import ChatOpenAI
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.vectorstores import FAISS
+from pathlib import Path
 
 from audio_utils import convert_audio_to_text
 from file_knowledge import FileKnowledge
@@ -66,11 +67,11 @@ def initialize_sidebar(session):
     with st.sidebar:
         show_all_konwledge = st.button("Show all knowledge", key="show_all_konwledge")
         with st.expander("Upload files"):
-            process_files("pdf", get_splitter(), session)
-            process_files("m4a", get_splitter(), session)
+            process_files(get_splitter(), session)
+
 
         st.header("Journalist toolbox")
-        st.write("Upload your PDF file or audio file")
+        st.write("Upload your PDF, audio, text, or CSV files")
         st.write("Then ask a question and get an answer")
         st.write("You can also download the text of the uploaded files")
         st.divider()
@@ -86,9 +87,10 @@ def get_splitter():
         length_function=len,
     )
 
-def process_files(file_type, splitter, session):
-    files = st.file_uploader(f"Upload your {file_type} file", type=[file_type], accept_multiple_files=True)
+def process_files(splitter, session):
+    files = st.file_uploader("Upload your files!", accept_multiple_files=True)
     for file in files:
+        file_type = Path(file.name).suffix.lstrip('.').lower()
         if file.name not in st.session_state["knowledge"].keys():
             file_knowledge = FileKnowledge(name=file.name, file=file, filetype=file_type, splitter=splitter)
             session[file.name] = file_knowledge
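With `process_files` now accepting every upload, the file's type is inferred from its name rather than from a per-type uploader. A minimal sketch of that routing assumption follows; the `detect_file_type` helper is illustrative only and not part of the diff:

```python
from pathlib import Path

def detect_file_type(filename: str) -> str:
    # same idea as the new process_files(): extension without the leading dot, lower-cased
    return Path(filename).suffix.lstrip('.').lower()

# values FileKnowledge will receive as `filetype`
assert detect_file_type("interview.m4a") == "m4a"
assert detect_file_type("Report.PDF") == "pdf"
assert detect_file_type("notes") == ""  # no extension: FileKnowledge will reject it
```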
diff --git a/file_knowledge.py b/file_knowledge.py
index 3abf7fb..b836bb8 100644
--- a/file_knowledge.py
+++ b/file_knowledge.py
@@ -2,6 +2,7 @@ import tempfile
 
 from dataclasses import dataclass, field
 from typing import Any, List, TypeVar
+from io import StringIO
 
 from langchain.text_splitter import CharacterTextSplitter
 from PyPDF2 import PdfReader
@@ -9,6 +10,7 @@
 from audio_utils import convert_audio_to_text
 
 UploadedFile = TypeVar('UploadedFile', bound=Any)
+SUPPORTED_FILE_TYPES = ['pdf', 'csv', 'txt', 'html', 'm4a', 'eml', 'msg', 'mbox']
 
 
 @dataclass
@@ -23,6 +25,7 @@ class FileKnowledge:
     def __post_init__(self):
         self.content = self.extract_text()
         self.chunks = self.splitter.split_text(self.content)
+
 
     @property
     def content(self):
@@ -42,6 +45,7 @@
     def chunks(self, value):
         self._chunks = value
         self.save_to_session_state()
+
 
     def save_to_session_state(self):
         st.session_state.knowledge[self.name] = self
@@ -50,8 +54,14 @@ def extract_text(self):
             return self.extract_text_from_pdf()
         elif self.filetype == 'm4a':
             return self.extract_text_from_audio()
+        elif self.filetype == 'txt':
+            return self.extract_text_generic()
+        elif self.filetype == 'csv':
+            return self.extract_text_generic()
+
         else:
-            raise ValueError(f'Unsupported filetype: {self.filetype}')
+            if self.filetype not in SUPPORTED_FILE_TYPES:
+                raise ValueError(f'Unsupported filetype: {self.filetype}')
 
     def extract_text_from_pdf(self):
         # Add your code here to extract text from a PDF file
@@ -60,7 +70,11 @@ def extract_text_from_pdf(self):
         for page in pdf_reader.pages:
             text += page.extract_text()
         return text
-    
+
+    def extract_text_generic(self):
+        stringio = StringIO(self.file.getvalue().decode("utf-8"))
+        return stringio.read()
+
     def extract_text_from_audio(self):
         with tempfile.NamedTemporaryFile(delete=False, suffix=".m4a") as tmp:
             tmp.write(self.file.read())
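One gap worth noting: `SUPPORTED_FILE_TYPES` declares 'html', 'eml', 'msg' and 'mbox', but `extract_text` has no branch for them, so those uploads fall through the `else` and return `None`. If decoding them as plain text is acceptable, the routing could close that gap along the lines below; this is a sketch under that assumption, not part of the diff, and `extract_text_fallback` / `TEXT_LIKE_TYPES` are hypothetical names:

```python
from io import StringIO

# hypothetical grouping of supported types that have no dedicated extractor
TEXT_LIKE_TYPES = {'csv', 'txt', 'html', 'eml', 'msg', 'mbox'}

def extract_text_fallback(filetype: str, raw: bytes) -> str:
    # decoding mirrors FileKnowledge.extract_text_generic(): treat the upload as UTF-8 text
    if filetype not in TEXT_LIKE_TYPES:
        raise ValueError(f'No generic extraction for filetype: {filetype}')
    return StringIO(raw.decode("utf-8")).read()

print(extract_text_fallback('csv', b"name,source\nLena,court records\n"))
```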
diff --git a/google_drive.py b/google_drive.py
new file mode 100644
index 0000000..d8171ba
--- /dev/null
+++ b/google_drive.py
@@ -0,0 +1,192 @@
+# Taken from https://www.thepythoncode.com/article/using-google-drive--api-in-python
+from __future__ import print_function
+
+import re
+import pickle
+import os
+from googleapiclient.discovery import build
+from google_auth_oauthlib.flow import InstalledAppFlow
+from google.auth.transport.requests import Request
+from tabulate import tabulate
+import requests
+from tqdm import tqdm
+
+import os.path
+from googleapiclient.http import MediaFileUpload
+# If modifying these scopes, delete the file token.pickle.
+SCOPES = ['https://www.googleapis.com/auth/drive.metadata',
+          'https://www.googleapis.com/auth/drive',
+          'https://www.googleapis.com/auth/drive.file'
+          ]
+
+def get_gdrive_service():
+    creds = None
+    # The file token.pickle stores the user's access and refresh tokens, and is
+    # created automatically when the authorization flow completes for the first
+    # time.
+    if os.path.exists('token.pickle'):
+        with open('token.pickle', 'rb') as token:
+            creds = pickle.load(token)
+    # If there are no (valid) credentials available, let the user log in.
+    if not creds or not creds.valid:
+        if creds and creds.expired and creds.refresh_token:
+            creds.refresh(Request())
+        else:
+            flow = InstalledAppFlow.from_client_secrets_file(
+                'credentials.json', SCOPES)
+            creds = flow.run_local_server(port=0)
+        # Save the credentials for the next run
+        with open('token.pickle', 'wb') as token:
+            pickle.dump(creds, token)
+    # return Google Drive API service
+    return build('drive', 'v3', credentials=creds)
+
+def download_all_files(n: int = 50):
+    """Lists the first n files the user has access to
+    and downloads each of them from Google Drive.
+    """
+    service = get_gdrive_service()
+    # Call the Drive v3 API
+    results = service.files().list(
+        pageSize=n, fields="nextPageToken, files(id, name, mimeType, size, parents, modifiedTime)").execute()
+    # get the results
+    items = results.get('files', [])
+    list_files(items)
+    for file in items:
+        download_file_from_google_drive(file['id'], file['name'])
+
+
+def search_filetype(filetype: str = "text/plain"):
+    # filetype is a MIME type, e.g. "text/plain" or "application/pdf"
+    # authenticate Google Drive API
+    service = get_gdrive_service()
+    # search for files that have the given MIME type
+    search_result = search(service, query=f"mimeType='{filetype}'")
+    # convert to table to print well
+    table = tabulate(search_result, headers=["ID", "Name", "Type"])
+    print(table)
+
+
+def list_files(items):
+    """given items returned by Google Drive API, prints them in a tabular way"""
+    rows = []
+    if not items:
+        # empty drive
+        print('No files found.')
+    else:
+        for item in items:
+            # get the File ID
+            id = item["id"]
+            # get the name of file
+            name = item["name"]
+            try:
+                # parent directory ID
+                parents = item["parents"]
+            except KeyError:
+                # has no parents
+                parents = "N/A"
+            try:
+                # get the size in nice bytes format (KB, MB, etc.)
+                size = get_size_format(int(item["size"]))
+            except KeyError:
+                # not a file, may be a folder
+                size = "N/A"
+            # get the Google Drive type of file
+            mime_type = item["mimeType"]
+            # get last modified date time
+            modified_time = item["modifiedTime"]
+            # append everything to the list
+            rows.append((id, name, parents, size, mime_type, modified_time))
+        print("Files:")
+        # convert to a human readable table
+        table = tabulate(rows, headers=["ID", "Name", "Parents", "Size", "Type", "Modified Time"])
+        # print the table
+        print(table)
+    return rows
+
+def get_size_format(b, factor=1024, suffix="B"):
+    """
+    Scale a byte count to a human-readable format,
+    e.g.:
+        1253656 => '1.20MB'
+        1253656678 => '1.17GB'
+    """
+    for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]:
+        if b < factor:
+            return f"{b:.2f}{unit}{suffix}"
+        b /= factor
+    return f"{b:.2f}Y{suffix}"
+
+def search(service, query):
+    # search for the file
+    result = []
+    page_token = None
+    while True:
+        response = service.files().list(q=query,
+                                        spaces="drive",
+                                        fields="nextPageToken, files(id, name, mimeType)",
+                                        pageToken=page_token).execute()
+        # iterate over filtered files
+        for file in response.get("files", []):
+            result.append((file["id"], file["name"], file["mimeType"]))
+        page_token = response.get('nextPageToken', None)
+        if not page_token:
+            # no more files
+            break
+    return result
+
+def download(filename):
+    service = get_gdrive_service()
+    # the name of the file you want to download from Google Drive
+    # search for the file by name
+    search_result = search(service, query=f"name='{filename}'")
+    # get the GDrive ID of the file
+    file_id = search_result[0][0]
+    # make it shareable
+    service.permissions().create(body={"role": "reader", "type": "anyone"}, fileId=file_id).execute()
+    # download file
+    download_file_from_google_drive(file_id, filename)
+
+def download_file_from_google_drive(id, destination):
+    def get_confirm_token(response):
+        for key, value in response.cookies.items():
+            if key.startswith('download_warning'):
+                return value
+        return None
+
+    def save_response_content(response, destination):
+        CHUNK_SIZE = 32768
+        # get the file size from Content-length response header
+        file_size = int(response.headers.get("Content-Length", 0))
+        # extract Content disposition from response headers
+        content_disposition = response.headers.get("content-disposition")
+        # parse filename
+        filename = re.findall("filename=\"(.+)\"", content_disposition)[0]
+        print("[+] File size:", file_size)
+        print("[+] File name:", filename)
+        progress = tqdm(response.iter_content(CHUNK_SIZE), f"Downloading {filename}", total=file_size, unit="Byte", unit_scale=True, unit_divisor=1024)
+        with open(destination, "wb") as f:
+            for chunk in progress:
+                if chunk: # filter out keep-alive new chunks
+                    f.write(chunk)
+                    # update the progress bar
+                    progress.update(len(chunk))
+        progress.close()
+
+    # base URL for download
+    URL = "https://docs.google.com/uc?export=download"
+    # init a HTTP session
+    session = requests.Session()
+    # make a request
+    response = session.get(URL, params={'id': id}, stream=True)
+    print("[+] Downloading", response.url)
+    # get confirmation token
+    token = get_confirm_token(response)
+    if token:
+        params = {'id': id, 'confirm': token}
+        response = session.get(URL, params=params, stream=True)
+    # download to disk
+    save_response_content(response, destination)
+
+if __name__ == '__main__':
+    download_all_files()
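For context on how the helpers above compose: `search()` passes its `query` argument straight through as the Drive API's `q` filter, which is why `search_filetype` can filter by MIME type while `download` filters by exact name. A hedged usage sketch follows; it assumes `credentials.json` is in place and the one-time OAuth consent flow has been completed, and the file name used is made up:

```python
from google_drive import get_gdrive_service, search, search_filetype, download

service = get_gdrive_service()   # opens a browser for OAuth consent on first run

# Drive query language: filter by MIME type ...
pdfs = search(service, query="mimeType='application/pdf'")
# ... or by exact file name (hypothetical name)
notes = search(service, query="name='meeting-notes.txt'")

print(pdfs[:3])                  # each hit is an (id, name, mimeType) tuple

search_filetype("text/plain")    # prints a table of matching files
download("meeting-notes.txt")    # looks the file up by name and saves it locally
```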
diff --git a/requirements.txt b/requirements.txt
index c6af25d..39f8b2b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,4 +6,10 @@ openai==0.27.6
 tiktoken==0.4.0
 faiss_cpu==1.7.4
 pydub==0.25.1
-ffmpeg-python==0.2.0
\ No newline at end of file
+ffmpeg-python==0.2.0
+google-api-python-client==2.88.0
+google-auth-httplib2==0.1.0
+google-auth-oauthlib==1.0.0
+tabulate==0.9.0
+requests==2.31.0
+tqdm==4.65.0
\ No newline at end of file
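Finally, a quick way to confirm the new dependencies and credentials are wired up before touching the Streamlit app is to list a handful of Drive files with the same service object the module uses. A small sketch, assuming `pip install -r requirements.txt` has been run and `credentials.json` exists in the repository root:

```python
from google_drive import get_gdrive_service, list_files

service = get_gdrive_service()
results = service.files().list(
    pageSize=5,
    fields="files(id, name, mimeType, size, parents, modifiedTime)",
).execute()
list_files(results.get("files", []))   # prints an ID / Name / Size / Type table
```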