11import json
22import os
33import urllib .parse
4- import urllib .request
54import requests
6- from typing import Literal , List , Dict , Any , Union , Optional
5+ from typing import Literal , List , Dict , Any , Optional
76from clients import openai_client , groq_client
87from openai import OpenAI
98import fitz
1312import ast
1413import re
1514import hal9 as h9
15+ from replicate import Client
1616
17- # Define the allowed client types.
17+ # Define the allowed client types.
1818ClientType = Literal ["openai" , "groq" ]
1919
2020def get_client (client_type : ClientType ) -> OpenAI :
@@ -262,41 +262,46 @@ def process_chunk(chunk_info):
262262 "page" : page_num + 1 # Page numbers start from 1
263263 }
264264
def generate_text_embeddings_parquet(
    url,
    model="text-embedding-3-small",
    client_type="openai",
    n_words=300,
    overlap=0,
    max_threads=8,
    storage_path="./.storage/.text_files.parquet"
):
    """Download a PDF, embed its text in chunks, and append rows to a Parquet store.

    Each page is split into overlapping word chunks via ``split_text``; every
    chunk is embedded in parallel by ``process_chunk`` and the resulting rows
    are appended to any existing Parquet file at ``storage_path``.

    Args:
        url: HTTP(S) URL of the PDF to index.
        model: Embedding model name passed through to ``process_chunk``.
        client_type: Which API client to use ("openai" or "groq").
        n_words: Target words per chunk.
        overlap: Words of overlap between consecutive chunks.
        max_threads: Worker threads for parallel embedding.
        storage_path: Parquet file that accumulates rows across calls.
    """
    # Download the PDF; fail fast on HTTP errors rather than handing an
    # error page's bytes to fitz, which would die with a cryptic message.
    resp = requests.get(url)
    resp.raise_for_status()
    doc = fitz.open(stream=BytesIO(resp.content))

    # Build one task tuple per chunk — the exact shape process_chunk expects.
    # try/finally guarantees the document is closed even if chunking raises
    # (the original skipped doc.close() on error).
    try:
        tasks = []
        for page_index in range(len(doc)):
            page_text = doc[page_index].get_text()
            for chunk in split_text(page_text, n_words=n_words, overlap=overlap):
                tasks.append((chunk, page_index, model, client_type))
    finally:
        doc.close()

    # Context-manage the executor so worker threads are always joined;
    # the original created the pool inline and leaked it on exceptions.
    with ThreadPoolExecutor(max_threads) as executor:
        rows = list(executor.map(process_chunk, tasks))

    df_new = pd.DataFrame(rows)
    # NOTE(review): chunk_id restarts at 0 for every file, so ids repeat
    # across appended documents — confirm downstream lookups key on
    # (filename, chunk_id), not chunk_id alone.
    df_new['chunk_id'] = range(len(df_new))
    df_new['filename'] = os.path.basename(url)

    os.makedirs(os.path.dirname(storage_path), exist_ok=True)

    # Append to any existing store so earlier documents stay searchable.
    if os.path.exists(storage_path):
        df_old = pd.read_parquet(storage_path, engine="pyarrow")
        df = pd.concat([df_old, df_new], ignore_index=True)
    else:
        df = df_new

    df.to_parquet(storage_path, engine="pyarrow", index=False)
300305
301306def load_json_file (json_path ):
302307 if os .path .exists (json_path ):
@@ -307,4 +312,76 @@ def load_json_file(json_path):
def extract_code_block(code: str, language: str) -> str:
    """Return the body of the first ```<language> fenced block in *code*.

    Falls back to an empty string when no matching fence exists.
    """
    fence = re.search(rf"```{language}\n(.*?)```", code, re.DOTALL)
    if fence is None:
        return ""
    return fence.group(1)
316+
317+
def is_url_list(prompt):
    """Return True iff every comma-separated token in *prompt* is an absolute URL.

    A token counts as a URL only when urlparse finds both a scheme and a
    network location, so bare words and empty tokens fail the check.
    """
    for candidate in prompt.split(","):
        parsed = urllib.parse.urlparse(candidate.strip())
        if not (parsed.scheme and parsed.netloc):
            return False
    return True
325+
def add_images_descriptions(image_path):
    """Describe the image at *image_path* and log the result on disk.

    Appends an ``{"image_path", "image_description"}`` record to the JSON
    list at ``./.storage/.images_description.json`` (creating it on first
    use) and returns the generated description text.
    """
    description = generate_description(image_path)

    file_name = './.storage/.images_description.json'

    # Start from the existing log when present, otherwise a fresh list.
    data = []
    if os.path.exists(file_name):
        with open(file_name, 'r') as file:
            data = json.load(file)

    data.append({
        "image_path": image_path,
        "image_description": description,
    })

    # Rewrite the whole log; pretty-print so it stays human-inspectable.
    with open(file_name, 'w') as file:
        json.dump(data, file, indent=4)

    return description
348+
349+ replicate = Client (api_token = os .environ ['HAL9_TOKEN' ], base_url = "https://api.hal9.com/proxy/server=https://api.replicate.com" )
350+
def generate_description(image_path):
    """Stream a detailed visual description of *image_path* from LLaVA-13B.

    Feeds the image plus a fixed prompt to the Replicate-hosted model and
    concatenates the streamed tokens.

    Returns:
        The description with all '{' and '}' characters stripped, or a
        human-readable error string if anything fails (best-effort: callers
        get a message instead of an exception).
    """
    try:
        # `with` guarantees the file handle is closed even when the streaming
        # call raises — the original only closed it on the success path.
        with open(image_path, 'rb') as file_input:
            # Renamed from `input` to avoid shadowing the builtin.
            model_input = {
                "image": file_input,
                "prompt": """Generate a detailed image prompt that includes all specific visual details in the image. This should include precise descriptions of colors, textures, lighting, positions of all elements, proportions, background details,
        foreground details, and any unique stylistic choices. Ensure the description is exhaustive enough to allow an artist or AI to recreate the image accurately without visual reference."""
            }

            description = ""
            for event in replicate.stream(
                "yorickvp/llava-13b:80537f9eead1a5bfa72d5ac6ea6414379be41d4d4f6679fd776e9535d1eb58bb",
                input=model_input
            ):
                description += event.data
    except Exception as e:
        return (f"Couldn't describe that image. -> Error: {e}")

    # Strip braces so the text can be embedded in templates/f-strings safely.
    return description.replace("{", "").replace("}", "")
371+
def process_url(url, messages):
    """Download an uploaded file and prime the chat history to discuss it.

    Records an analytics event, downloads *url* into ./.storage, injects
    system/assistant messages pointing at the local copy, then indexes the
    content: embeddings for PDFs, generated descriptions for images.
    Returns the updated messages list.
    """
    h9.event("Uploaded File", f"{url}")

    filename = url.split("/")[-1]
    if "." in filename:
        file_extension = filename.split(".")[-1]
    else:
        file_extension = "No extension"

    download_file(url)
    messages = insert_message(messages, "system", f"Consider use the file available at path: './.storage/.{filename}' for the following questions.")
    messages = insert_message(messages, "assistant", f"I'm ready to answer questions about your file: {filename}")

    # Index by type so later questions can be answered from local artifacts.
    lowered = file_extension.lower()
    if lowered == "pdf":
        generate_text_embeddings_parquet(url)
    elif lowered in ('jpg', 'jpeg', 'png', 'webp'):
        add_images_descriptions(f"./.storage/.{filename}")

    print(f"I'm ready to answer questions about your file: {filename}")
    return messages
0 commit comments