From 92cf702ce49f2066110c589d97b57b0ed3ab8fa4 Mon Sep 17 00:00:00 2001
From: Prins Kumar
Date: Mon, 17 Nov 2025 17:07:34 +0530
Subject: [PATCH 01/15] Add S3 Bedrock BDA ingestion support with user
confirmation and pymupdf4llm integration
---
common/requirements.txt | 3 +-
common/utils/image_data_extractor.py | 163 +++---------
common/utils/markdown_parsing.py | 63 +++++
common/utils/text_extractors.py | 254 +++++++++---------
graphrag-ui/src/pages/Setup.tsx | 370 +++++++++++++--------------
5 files changed, 403 insertions(+), 450 deletions(-)
create mode 100644 common/utils/markdown_parsing.py
diff --git a/common/requirements.txt b/common/requirements.txt
index 562c2f6..f0022f3 100644
--- a/common/requirements.txt
+++ b/common/requirements.txt
@@ -110,7 +110,8 @@ packaging==24.2
pandas==2.2.3
#pathtools==0.1.2
pillow==11.2.1
-PyMuPDF==1.26.4
+#PyMuPDF==1.26.4
+pymupdf4llm==0.2.0
platformdirs==4.3.8
pluggy==1.6.0
prometheus_client==0.22.1
diff --git a/common/utils/image_data_extractor.py b/common/utils/image_data_extractor.py
index bde9c97..74e8d2f 100644
--- a/common/utils/image_data_extractor.py
+++ b/common/utils/image_data_extractor.py
@@ -11,155 +11,54 @@
logger = logging.getLogger(__name__)
-
-
-def describe_image_with_llm(image_input):
+def describe_image_with_llm(file_path):
"""
- Send image (pixmap or PIL image) to LLM vision model and return description.
- Uses multimodal_service from config if available, otherwise falls back to completion_service.
- Currently supports: OpenAI, Azure OpenAI, Google GenAI, and Google VertexAI
+ Read an image file from disk, convert it to base64, and send it to the LLM vision model for a description.
"""
try:
+ from PIL import Image as PILImage
+
client = get_multimodal_service()
if not client:
return "[Image: Failed to create multimodal LLM client]"
-
+
+ # Read image and convert to base64
+ pil_image = PILImage.open(file_path)
buffer = io.BytesIO()
- # Convert to RGB if needed for better compatibility
- if image_input.mode != 'RGB':
- image_input = image_input.convert('RGB')
- image_input.save(buffer, format="JPEG", quality=95)
- b64_img = base64.b64encode(buffer.getvalue()).decode("utf-8")
+ if pil_image.mode != 'RGB':
+ pil_image = pil_image.convert('RGB')
+ pil_image.save(buffer, format="JPEG", quality=95)
+ image_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
- # Build messages (system + human)
messages = [
- SystemMessage(
- content="You are a helpful assistant that describes images concisely for document analysis."
- ),
- HumanMessage(
- content=[
- {
- "type": "text",
- "text": (
- "Please describe what you see in this image and "
- "if the image has scanned text then extract all the text. "
- "if the image has any logo, icon, or branding element, try to describe it with text. "
- "Focus on any text, diagrams, charts, or other visual elements."
- "If the image is purely a logo, icon, or branding element, start your response with 'LOGO:' or 'ICON:'."
- ),
- },
- {
- "type": "image_url",
- "image_url": {"url": f"data:image/jpeg;base64,{b64_img}"},
- },
- ]
- ),
+ SystemMessage(
+ content="You are a helpful assistant that describes images concisely for document analysis."
+ ),
+ HumanMessage(
+ content=[
+ {
+ "type": "text",
+ "text": (
+ "Please describe what you see in this image and "
+ "if the image has scanned text then extract all the text. "
+ "If the image has any graph, chart, table, or other diagram, describe it. "
+ ),
+ },
+ {
+ "type": "image_url",
+ "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
+ },
+ ],
+ ),
]
- # Get response from LangChain LLM client
- # Access the underlying LangChain client
langchain_client = client.llm
response = langchain_client.invoke(messages)
- return response.content if hasattr(response, 'content') else str(response)
+ return response.content if hasattr(response, "content") else str(response)
except Exception as e:
logger.error(f"Failed to describe image with LLM: {str(e)}")
return "[Image: Error processing image description]"
-def save_image_and_get_markdown(image_input, context_info="", graphname=None):
- """
- Save image locally to static/images/ folder and return markdown reference with description.
-
- LEGACY/OLD APPROACH: Used for backward compatibility with JSONL-based loading.
- Images are saved as files and served via /ui/images/ endpoint with img:// protocol.
-
- For NEW direct loading approach, images are stored in Image vertex as base64
- and served via /ui/image_vertex/ endpoint with image:// protocol.
-
- Args:
- image_input: PIL Image object
- context_info: Optional context (e.g., "page 3 of invoice.pdf")
- graphname: Graph name to organize images by graph (optional)
-
- Returns:
- dict with:
- - 'markdown': Markdown string with img:// reference
- - 'image_id': Unique identifier for the saved image
- - 'image_path': Path where image was saved to static/images/
- """
- try:
- # FIRST: Get description from LLM to check if it's a logo
- description = describe_image_with_llm(image_input)
-
- # Check if the image is a logo, icon, or decorative element BEFORE saving
- # These should be filtered out as they're not content-relevant
- description_lower = description.lower()
- logo_indicators = ['logo', 'icon', 'branding', 'watermark', 'trademark', 'company logo', 'brand logo']
-
- if any(indicator in description_lower for indicator in logo_indicators):
- logger.info(f"Detected logo/icon in image, skipping: {description[:100]}")
- return None
-
- # If not a logo, proceed with saving the image
- # Generate unique image ID using hash of image content
- buffer = io.BytesIO()
- if image_input.mode != 'RGB':
- image_input = image_input.convert('RGB')
- image_input.save(buffer, format="JPEG", quality=95)
- image_bytes = buffer.getvalue()
-
- # Create hash-based ID (deterministic for same image)
- image_hash = hashlib.sha256(image_bytes).hexdigest()[:16]
- image_id = f"{image_hash}.jpg"
-
- # Save image to local storage directory organized by graphname
- project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
- # If graphname is provided, organize images by graph
- if graphname:
- images_dir = os.path.join(project_root, "static", "images", graphname)
- # Include graphname in the image reference for URL construction
- image_reference = f"{graphname}/{image_id}"
- else:
- images_dir = os.path.join(project_root, "static", "images")
- image_reference = image_id
-
- os.makedirs(images_dir, exist_ok=True)
-
- image_path = os.path.join(images_dir, image_id)
-
- # Save image file (skip if already exists with same hash)
- if not os.path.exists(image_path):
- with open(image_path, 'wb') as f:
- f.write(image_bytes)
- logger.info(f"Saved content image to: {image_path}")
- else:
- logger.debug(f"Image already exists: {image_path}")
-
- # Generate markdown with custom img:// protocol (will be replaced later)
- # Format: ![description](img://image_id) or ![description](img://graphname/image_id)
- markdown = f"![{description}](img://{image_reference})"
-
- logger.info(f"Created image reference: {image_reference} with description")
-
- return {
- 'markdown': markdown,
- 'image_id': image_reference,
- 'image_path': image_path,
- 'description': description
- }
-
- except Exception as e:
- logger.error(f"Failed to save image and generate markdown: {str(e)}")
- # Fallback to text description only
- fallback_desc = f"[Image: {context_info} - processing failed]"
- return {
- 'markdown': fallback_desc,
- 'image_id': None,
- 'image_path': None,
- 'description': fallback_desc
- }
-
-
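For context, a minimal sketch of how the reworked describe_image_with_llm is meant to be called now that it takes a file path; the wrapper below is illustrative only, and the bracketed sentinel strings come from the function's own error paths:

    from common.utils.image_data_extractor import describe_image_with_llm

    def describe_or_skip(image_path):
        # describe_image_with_llm now reads the file itself and returns a text description.
        description = describe_image_with_llm(image_path)
        # Failures come back as bracketed "[Image: ...]" strings rather than exceptions.
        if description.startswith("[Image:"):
            return None
        return description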
diff --git a/common/utils/markdown_parsing.py b/common/utils/markdown_parsing.py
new file mode 100644
index 0000000..7c8c476
--- /dev/null
+++ b/common/utils/markdown_parsing.py
@@ -0,0 +1,63 @@
+import re
+import os
+import pymupdf4llm
+
+class MarkdownProcessor:
+ """
+ A helper class to extract markdown image entries and
+ update descriptions based on image_id.
+ """
+
+ # regex for markdown images: ![alt](path)
+ _pattern = re.compile(r'!\[([^\]]*)\]\(([^)\s]+)\)')
+
+ @classmethod
+ def extract_images(cls, md_text):
+ """
+ Returns list of {"path": path, "image_id": image_id}
+ image_id = basename without extension
+ """
+ images = []
+ for m in cls._pattern.finditer(md_text):
+ path = m.group(2)
+ basename = os.path.basename(path)
+ image_id = os.path.splitext(basename)[0]
+ images.append({"path": path, "image_id": image_id})
+ return images
+
+ @classmethod
+ def insert_description_by_id(cls, md_text, image_id, description):
+ """
+ Replace the description for an image whose basename == image_id.
+ """
+
+ def repl(m):
+ old_path = m.group(2)
+ candidate_id = os.path.splitext(os.path.basename(old_path))[0]
+
+ if candidate_id == image_id:
+ # Insert new description
+ return f'![{description}]({old_path})'
+
+ return m.group(0)
+
+ return cls._pattern.sub(repl, md_text)
+
+ @classmethod
+ def replace_path_with_tg_protocol(cls, md_text, image_id, tg_reference):
+ """
+ Replace the file path for an image whose basename == image_id with tg:// protocol reference.
+ tg_reference should be like 'Graphs_image_1'
+ """
+ def repl(m):
+ old_path = m.group(2)
+ candidate_id = os.path.splitext(os.path.basename(old_path))[0]
+
+ if candidate_id == image_id:
+ # Replace path with tg:// protocol reference
+ alt_text = m.group(1)
+ return f'![{alt_text}](tg://{tg_reference})'
+
+ return m.group(0)
+
+ return cls._pattern.sub(repl, md_text)
\ No newline at end of file
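A short usage sketch of how these three helpers chain together on pymupdf4llm output; the sample markdown, image file name, and IDs below are made up for illustration:

    from common.utils.markdown_parsing import MarkdownProcessor

    md = "Intro text\n\n![](tg_temp_ab12cd34ef56/report.pdf-1-0.png)\n"

    # 1. Find image references and derive image_ids from the file basenames.
    refs = MarkdownProcessor.extract_images(md)
    # -> [{"path": "tg_temp_ab12cd34ef56/report.pdf-1-0.png", "image_id": "report.pdf-1-0"}]

    # 2. Put the LLM description into the alt text of the matching image.
    md = MarkdownProcessor.insert_description_by_id(md, refs[0]["image_id"], "Quarterly revenue chart")

    # 3. Swap the on-disk path for a tg:// vertex reference before loading.
    md = MarkdownProcessor.replace_path_with_tg_protocol(md, refs[0]["image_id"], "report_image_1")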
diff --git a/common/utils/text_extractors.py b/common/utils/text_extractors.py
index da3e22d..b900cae 100644
--- a/common/utils/text_extractors.py
+++ b/common/utils/text_extractors.py
@@ -183,137 +183,154 @@ def extract_text_from_file_with_images_as_docs(file_path, graphname=None):
def _extract_pdf_with_images_as_docs(file_path, base_doc_id, graphname=None):
"""
- Extract PDF as ONE markdown document with inline image references.
+ Extract PDF as ONE markdown document with inline image references using pymupdf4llm.
+ Uses a unique temporary folder per PDF so multiple PDFs can be processed in parallel;
+ the extracted image folder is deleted once processing finishes.
"""
+ # Use unique folder per PDF to allow parallel processing without conflicts
+ unique_folder_id = uuid.uuid4().hex[:12]
+ image_output_folder = Path(f"tg_temp_{unique_folder_id}")
+
try:
- import fitz # PyMuPDF
+ import pymupdf4llm
from PIL import Image as PILImage
+ from common.utils.image_data_extractor import describe_image_with_llm
+ from common.utils.markdown_parsing import MarkdownProcessor
+
+ # Ensure clean slate - remove folder if it exists from failed previous run
+ if image_output_folder.exists():
+ shutil.rmtree(image_output_folder, ignore_errors=True)
+
+ # Convert PDF to markdown with extracted image files
+ try:
+ markdown_content = pymupdf4llm.to_markdown(
+ file_path,
+ write_images=True,
+ image_path=str(image_output_folder), # unique folder per PDF
+ force_text=False,
+ margins=0,
+ image_size_limit=0.08,
+ )
+ except Exception as e:
+ logger.error(f"pymupdf4llm failed for {file_path}: {e}")
+ # Cleanup folder if it was created
+ if image_output_folder.exists():
+ shutil.rmtree(image_output_folder, ignore_errors=True)
+ return [{
+ "doc_id": base_doc_id,
+ "doc_type": "markdown",
+ "content": f"[PDF extraction failed: {e}]",
+ "position": 0
+ }]
+
+ if not markdown_content or not markdown_content.strip():
+ logger.warning(f"No content extracted from PDF: {file_path}")
+
+ # Extract image references from markdown
+ image_refs = MarkdownProcessor.extract_images(markdown_content)
+
+ if not image_refs:
+ # cleanup folder anyway
+ if image_output_folder.exists():
+ shutil.rmtree(image_output_folder, ignore_errors=True)
+
+ return [{
+ "doc_id": base_doc_id,
+ "doc_type": "markdown",
+ "content": markdown_content,
+ "position": 0
+ }]
- doc = fitz.open(file_path)
- markdown_parts = []
image_entries = []
image_counter = 0
- for page_num, page in enumerate(doc, start=1):
- if page_num > 1:
- markdown_parts.append("\n\n")
- markdown_parts.append(f"--- Page {page_num} ---\n") #Avoid to be splitted as a single chunk
-
- blocks = page.get_text("blocks", sort=True)
- text_blocks_with_pos = []
-
- for block in blocks:
- block_type = block[6] if len(block) > 6 else 0
- if block_type == 0:
- text = block[4].strip()
- if text:
- y_pos = block[1]
- text_blocks_with_pos.append({'type': 'text', 'content': text, 'y_pos': y_pos})
-
- image_list = page.get_images(full=True)
- images_with_pos = []
-
- if image_list:
- for img_index, img_info in enumerate(image_list):
- try:
- xref = img_info[0]
- base_image = doc.extract_image(xref)
- image_bytes = base_image["image"]
- image_ext = base_image["ext"]
-
- img_rects = page.get_image_rects(xref)
- y_pos = img_rects[0].y0 if img_rects else 999999
-
- pil_image = PILImage.open(io.BytesIO(image_bytes))
- if pil_image.width < 100 or pil_image.height < 100:
- continue
-
- from common.utils.image_data_extractor import describe_image_with_llm
- description = describe_image_with_llm(pil_image)
- description_lower = description.lower()
- logo_indicators = [
- 'logo:', 'icon:', 'logo', 'icon', 'branding',
- 'watermark', 'trademark', 'stylized letter',
- 'stylized text', 'word "', "word '"
- ]
- if any(indicator in description_lower for indicator in logo_indicators):
- continue
-
- buffer = io.BytesIO()
- if pil_image.mode != 'RGB':
- pil_image = pil_image.convert('RGB')
- pil_image.save(buffer, format="JPEG", quality=95)
- image_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
-
- image_counter += 1
- image_doc_id = f"{base_doc_id}_image_{image_counter}"
-
- images_with_pos.append({
- 'type': 'image',
- 'image_doc_id': image_doc_id,
- 'description': description,
- 'y_pos': y_pos,
- 'image_data': image_base64,
- 'image_format': image_ext,
- 'width': pil_image.width,
- 'height': pil_image.height
- })
- except Exception as img_error:
- logger.warning(f"Failed to extract image on page {page_num}: {img_error}")
-
- all_elements = text_blocks_with_pos + images_with_pos
- all_elements.sort(key=lambda x: x['y_pos'])
-
- for element in all_elements:
- if element['type'] == 'text':
- markdown_parts.append(element['content'])
- markdown_parts.append("\n\n")
- else:
- # Add image description as text, then markdown image reference
- # Use short alt text in markdown, full description as regular text
- markdown_parts.append(f"![{element['description']}](tg://{element['image_doc_id']})\n\n")
-
- image_entries.append({
- "doc_id": element['image_doc_id'],
- "doc_type": "image",
- "image_description": element['description'],
- "image_data": element['image_data'],
- "image_format": element['image_format'],
- "parent_doc": base_doc_id,
- "page_number": page_num,
- "width": element['width'],
- "height": element['height'],
- "position": int(element['image_doc_id'].split('_')[-1])
- })
-
- doc.close()
-
- markdown_content = "".join(markdown_parts) if markdown_parts else "" #No content extracted from PDF
- if not markdown_content:
- return []
+ for img_ref in image_refs:
+ try:
+ img_path = Path(img_ref["path"]) # convert to Path
+ image_id = img_ref["image_id"]
+
+ # Image description
+ description = describe_image_with_llm(str(img_path))
+
+ markdown_content = MarkdownProcessor.insert_description_by_id(
+ markdown_content,
+ image_id,
+ description
+ )
+
+ # Convert image to base64
+ pil_image = PILImage.open(img_path)
+ buffer = io.BytesIO()
+
+ if pil_image.mode != "RGB":
+ pil_image = pil_image.convert("RGB")
+
+ pil_image.save(buffer, format="JPEG", quality=95)
+ image_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
+
+ image_counter += 1
+ image_doc_id = f"{base_doc_id}_image_{image_counter}"
+
+ # Replace file path with tg:// protocol reference in markdown
+ markdown_content = MarkdownProcessor.replace_path_with_tg_protocol(
+ markdown_content,
+ image_id,
+ image_doc_id
+ )
+
+ image_entries.append({
+ "doc_id": image_doc_id,
+ "doc_type": "image",
+ "image_description": description,
+ "image_data": image_base64,
+ "image_format": "jpg",
+ "parent_doc": base_doc_id,
+ "page_number": 0,
+ "width": pil_image.width,
+ "height": pil_image.height,
+ "position": image_counter
+ })
+
+ except Exception as img_error:
+ logger.warning(f"Failed to process image {img_ref.get('path')}: {img_error}")
+
+ # FINAL CLEANUP — delete folder after processing everything
+ if image_output_folder.exists() and image_output_folder.is_dir():
+ try:
+ shutil.rmtree(image_output_folder)
+ logger.debug(f"Deleted image folder: {image_output_folder}")
+ except Exception as delete_err:
+ logger.warning(f"Failed to delete folder {image_output_folder}: {delete_err}")
+ # Build final result
result = [{
"doc_id": base_doc_id,
- "doc_type": "",
+ "doc_type": "markdown",
"content": markdown_content,
"position": 0
}]
result.extend(image_entries)
+
return result
- except ImportError:
- logger.error("PyMuPDF not available")
+ except ImportError as import_err:
+ logger.error(f"Required library missing: {import_err}")
+ # Cleanup on import error
+ if image_output_folder.exists():
+ shutil.rmtree(image_output_folder, ignore_errors=True)
return [{
"doc_id": base_doc_id,
- "doc_type": "",
- "content": "[PDF extraction requires PyMuPDF]",
+ "doc_type": "markdown",
+ "content": "[PDF extraction requires pymupdf4llm and PyMuPDF]",
"position": 0
}]
except Exception as e:
logger.error(f"Error extracting PDF: {e}")
+ # Cleanup on any other error
+ if image_output_folder.exists():
+ shutil.rmtree(image_output_folder, ignore_errors=True)
raise
-
def _extract_standalone_image_as_doc(file_path, base_doc_id, graphname=None):
"""
Extract standalone image file as ONE markdown document with inline image reference.
@@ -324,25 +341,15 @@ def _extract_standalone_image_as_doc(file_path, base_doc_id, graphname=None):
pil_image = PILImage.open(file_path)
if pil_image.width < 100 or pil_image.height < 100:
- return [{
- "doc_id": base_doc_id,
- "doc_type": "",
- "content": f"[Skipped small image: {file_path.name}]",
- "position": 0
- }]
+ pass
- description = describe_image_with_llm(pil_image)
+ description = describe_image_with_llm(str(Path(file_path).absolute()))
description_lower = description.lower()
logo_indicators = ['logo:', 'icon:', 'logo', 'icon', 'branding',
'watermark', 'trademark', 'stylized letter',
'stylized text', 'word "', "word '"]
if any(indicator in description_lower for indicator in logo_indicators):
- return [{
- "doc_id": base_doc_id,
- "doc_type": "",
- "content": f"[Skipped logo/icon: {file_path.name}]",
- "position": 0
- }]
+ return []
buffer = io.BytesIO()
if pil_image.mode != 'RGB':
@@ -353,7 +360,6 @@ def _extract_standalone_image_as_doc(file_path, base_doc_id, graphname=None):
image_id = f"{base_doc_id}_image_1"
# Put description as text, then markdown image reference with short alt text
content = f""
-
return [
{
"doc_id": base_doc_id,
@@ -379,7 +385,7 @@ def _extract_standalone_image_as_doc(file_path, base_doc_id, graphname=None):
logger.error(f"Error extracting image: {e}")
return [{
"doc_id": base_doc_id,
- "doc_type": "",
+ "doc_type": "markdown",
"content": f"[Image extraction failed: {str(e)}]",
"position": 0
}]
@@ -441,12 +447,10 @@ def get_doc_type_from_extension(extension):
if extension in ['.html', '.htm']:
return 'html'
- elif extension in ['.md']:
- return 'markdown'
elif extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp']:
return 'image'
else:
- return ''
+ return 'markdown'
def get_supported_extensions():
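The extraction path above relies on pymupdf4llm writing page images to disk and embedding relative references in the returned markdown. A minimal standalone sketch with the same arguments (the input file name is a placeholder):

    import shutil
    import uuid
    from pathlib import Path
    import pymupdf4llm

    out_dir = Path(f"tg_temp_{uuid.uuid4().hex[:12]}")  # unique folder per PDF, as in the extractor

    md = pymupdf4llm.to_markdown(
        "sample.pdf",
        write_images=True,           # save embedded/rendered images as files
        image_path=str(out_dir),     # markdown references point into this folder
        force_text=False,
        margins=0,
        image_size_limit=0.08,       # ignore very small images relative to the page size
    )
    print(md[:500])                  # markdown text with ![](...) references into out_dir

    shutil.rmtree(out_dir, ignore_errors=True)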
diff --git a/graphrag-ui/src/pages/Setup.tsx b/graphrag-ui/src/pages/Setup.tsx
index b7d357d..2aaee99 100644
--- a/graphrag-ui/src/pages/Setup.tsx
+++ b/graphrag-ui/src/pages/Setup.tsx
@@ -2,7 +2,7 @@ import React, { useState, useEffect } from "react";
import { useNavigate } from "react-router-dom";
import { Button } from "@/components/ui/button";
import { Input } from "@/components/ui/input";
-import { Database, Upload, RefreshCw, Loader2, Trash2, FolderUp, Cloud, ArrowLeft, CloudDownload, CloudLightning } from "lucide-react";
+import { Database, Upload, RefreshCw, Loader2, Trash2, FolderUp, Cloud, ArrowLeft, CloudDownload, CloudCog } from "lucide-react";
import {
Dialog,
DialogContent,
@@ -56,7 +56,6 @@ const Setup = () => {
const [uploadMessage, setUploadMessage] = useState("");
const [isIngesting, setIsIngesting] = useState(false);
const [ingestMessage, setIngestMessage] = useState("");
- const [activeTab, setActiveTab] = useState("upload");
// Refresh state
const [refreshOpen, setRefreshOpen] = useState(false);
@@ -67,12 +66,13 @@ const Setup = () => {
const [isCheckingStatus, setIsCheckingStatus] = useState(false);
// S3 state
+ const [fileFormat, setFileFormat] = useState<"json" | "multi">("json");
const [awsAccessKey, setAwsAccessKey] = useState("");
const [awsSecretKey, setAwsSecretKey] = useState("");
+ const [dataPath, setDataPath] = useState("");
const [inputBucket, setInputBucket] = useState("");
const [outputBucket, setOutputBucket] = useState("");
const [regionName, setRegionName] = useState("");
- const [skipBDAProcessing, setSkipBDAProcessing] = useState(false);
// Cloud Download state
const [cloudProvider, setCloudProvider] = useState<"s3" | "gcs" | "azure">("s3");
@@ -458,7 +458,7 @@ const Setup = () => {
}
const createData = await createResponse.json();
- //console.log("Create ingest response:", createData);
+ console.log("Create ingest response:", createData);
// Step 2: Run ingest
setIngestMessage("Step 2/2: Running document ingest...");
@@ -484,7 +484,7 @@ const Setup = () => {
}
const ingestData = await ingestResponse.json();
- //console.log("Ingest response:", ingestData);
+ console.log("Ingest response:", ingestData);
setIngestMessage(`✅ Data ingested successfully! Processed documents from ${folderPath}/`);
} catch (error: any) {
@@ -495,8 +495,8 @@ const Setup = () => {
}
};
- // Ingest files from S3 with Amazon BDA
- const handleAmazonBDAIngest = async () => {
+ // Ingest files from S3 with Bedrock BDA
+ const handleS3BedrockIngest = async () => {
if (!ingestGraphName) {
setIngestMessage("Please select a graph");
return;
@@ -508,112 +508,92 @@ const Setup = () => {
return;
}
- if (skipBDAProcessing) {
- // When skipping BDA, only output bucket and region are required
- if (!outputBucket || !regionName) {
- setIngestMessage("❌ Please provide Output Bucket and Region Name");
- return;
- }
- } else {
- // When using BDA, all fields are required
+ if (fileFormat === "multi") {
if (!inputBucket || !outputBucket || !regionName) {
setIngestMessage("❌ Please provide Input Bucket, Output Bucket, and Region Name");
return;
}
- }
- // Ask for confirmation
- const confirmMessage = skipBDAProcessing
- ? `You're skipping Amazon BDA processing and will ingest directly from the output bucket (${outputBucket}). Please confirm to proceed.`
- : `You're using Amazon BDA for multimodal document processing. This will trigger Amazon BDA to process your documents from the input bucket (${inputBucket}) and store the results in the output bucket (${outputBucket}) and then ingest them into your knowledge graph. Please confirm to proceed.`;
-
- const shouldProceed = await confirm(confirmMessage);
- if (!shouldProceed) {
- setIngestMessage("Operation cancelled by user.");
- return;
+ // Ask for confirmation if using Bedrock (multi format)
+ const shouldProceed = await confirm(
+ `You're using AWS Bedrock for multimodal document processing. This will trigger AWS Bedrock BDA to process your documents from the input bucket (${inputBucket}), store the results in the output bucket (${outputBucket}), and then ingest them into your knowledge graph. Please confirm to proceed.`
+ );
+ if (!shouldProceed) {
+ setIngestMessage("Operation cancelled by user.");
+ return;
+ }
+ } else if (fileFormat === "json") {
+ if (!dataPath) {
+ setIngestMessage("❌ Please provide Data Path (e.g., s3://bucket-name/path/to/data)");
+ return;
+ }
}
setIsIngesting(true);
+ setIngestMessage("Step 1/2: Creating ingest job...");
try {
const creds = localStorage.getItem("creds");
- let loadingInfo: any = {};
- if (skipBDAProcessing) {
- // Skip BDA processing - create ingest job that reads directly from output bucket
- const runIngestConfig: any = {
- data_source: "bda",
+ // Step 1: Create ingest job
+ const createIngestConfig: any = {
+ data_source: "s3",
+ data_source_config: {
aws_access_key: awsAccessKey,
aws_secret_key: awsSecretKey,
- output_bucket: outputBucket,
- region_name: regionName,
- bda_jobs:[],
- loader_config: {
- doc_id_field: "doc_id",
- content_field: "content",
- doc_type: "markdown",
- },
- file_format: "multi"
- };
-
- setIngestMessage("Step 1/2: Creating ingest job from output bucket...");
-
- // Run ingest directly
- loadingInfo = {
- load_job_id: "load_documents_content_json",
- data_source_id: runIngestConfig,
- file_path: outputBucket,
- };
- setIngestMessage(`Step 2/2: Running document ingestion for all files in ${outputBucket}...`);
- } else {
- // Step 1: Create ingest job with BDA processing
- const createIngestConfig: any = {
- data_source: "bda",
- data_source_config: {
- aws_access_key: awsAccessKey,
- aws_secret_key: awsSecretKey,
- input_bucket: inputBucket,
- output_bucket: outputBucket,
- region_name: regionName,
- },
- loader_config: {
- doc_id_field: "doc_id",
- content_field: "content",
- doc_type: "markdown",
- },
- file_format: "multi"
- };
+ },
+ loader_config: {
+ doc_id_field: "doc_id",
+ content_field: "content",
+ doc_type: fileFormat === "multi" ? "markdown" : "",
+ },
+ file_format: fileFormat
+ };
- setIngestMessage("Step 1/2: Triggering Amazon BDA processing and creating ingest job...");
+ // Add format-specific configuration
+ if (fileFormat === "multi") {
+ createIngestConfig.data_source_config.input_bucket = inputBucket;
+ createIngestConfig.data_source_config.output_bucket = outputBucket;
+ createIngestConfig.data_source_config.region_name = regionName;
+ setIngestMessage("Step 1/2: Creating ingest job and triggering AWS Bedrock BDA processing...");
+ } else if (fileFormat === "json") {
+ createIngestConfig.loader_config.doc_id_field = "url";
+ }
- const createResponse = await fetch(`/ui/${ingestGraphName}/create_ingest`, {
- method: "POST",
- headers: {
- "Content-Type": "application/json",
- Authorization: `Basic ${creds}`,
- },
- body: JSON.stringify(createIngestConfig),
- });
+ const createResponse = await fetch(`/ui/${ingestGraphName}/create_ingest`, {
+ method: "POST",
+ headers: {
+ "Content-Type": "application/json",
+ Authorization: `Basic ${creds}`,
+ },
+ body: JSON.stringify(createIngestConfig),
+ });
- if (!createResponse.ok) {
- const errorData = await createResponse.json();
- throw new Error(errorData.detail || `Failed to create ingest job: ${createResponse.statusText}`);
- }
+ if (!createResponse.ok) {
+ const errorData = await createResponse.json();
+ throw new Error(errorData.detail || `Failed to create ingest job: ${createResponse.statusText}`);
+ }
- const createData = await createResponse.json();
- //console.log("Create ingest response:", createData);
+ const createData = await createResponse.json();
+ console.log("Create ingest response:", createData);
- // Step 2: Run ingest
- loadingInfo = {
- load_job_id: createData.load_job_id,
- data_source_id: createData.data_source_id,
- file_path: outputBucket,
- };
+ // Step 2: Run ingest
+ setIngestMessage("Step 2/2: Running document ingest...");
- const filesToIngest = createData.data_source_id.bda_jobs.map((job: any) => job.jobId.split("/")[-1]);
- setIngestMessage(`Step 2/2: Running document ingest for ${filesToIngest.length} files in ${outputBucket}...`);
+ // Determine file path based on format
+ let filePath = "";
+ if (fileFormat === "multi") {
+ filePath = outputBucket; // For multi format, use output bucket
+ } else if (fileFormat === "json") {
+ filePath = dataPath; // For json format, use the provided data path
}
+ const loadingInfo = {
+ load_job_id: createData.load_job_id,
+ data_source_id: createData.data_source_id,
+ file_path: filePath,
+ };
+
const ingestResponse = await fetch(`/ui/${ingestGraphName}/ingest`, {
method: "POST",
headers: {
@@ -629,13 +609,15 @@ const Setup = () => {
}
const ingestData = await ingestResponse.json();
- //console.log("Ingest response:", ingestData);
- const filesIngested = ingestData.summary.map((file: any) => file.file_path);
-
- setIngestMessage(`✅ Document ingestion completed successfully! Ingested ${filesIngested.length} into your knowledge graph.`);
+ console.log("Ingest response:", ingestData);
+ if (fileFormat === "multi") {
+ setIngestMessage(`✅ Data ingested successfully! AWS Bedrock BDA processed documents from ${inputBucket} and loaded results from ${outputBucket}.`);
+ } else {
+ setIngestMessage(`✅ Data ingested successfully! Processed documents from ${dataPath}.`);
+ }
} catch (error: any) {
- console.error("Error ingesting files:", error);
+ console.error("Error ingesting S3 data:", error);
setIngestMessage(`❌ Error: ${error.message}`);
} finally {
setIsIngesting(false);
@@ -1121,8 +1103,8 @@ const Setup = () => {
-
- {/* Direct Ingestion Checkbox */}
-
- setDirectIngestion(e.target.checked)}
- className="mr-2 h-4 w-4 rounded border-gray-300 text-blue-600 focus:ring-blue-500"
- />
-
-
-
handleIngestDocuments("downloaded")}
- disabled={isIngesting}
+ onClick={handleRunIngest}
+ disabled={isIngesting || !tempSessionId}
className="gradient text-white w-full"
>
{isIngesting ? (
@@ -1813,71 +1910,6 @@ const Setup = () => {
{ingestMessage}
)}
-
- {/* Processed Temp Files - Review before ingesting */}
- {showTempFiles && tempFiles.length > 0 && (
-
-
-
- Processed Files ({tempFiles.length})
-
-
-
- Clear All
-
-
-
- Review the processed files below. You can delete any file before ingesting.
-
-
- {tempFiles.map((file, index) => (
-
-
-
- {file.doc_id}
-
-
- {(file.size / 1024).toFixed(2)} KB
-
-
-
handleDeleteTempFile(file.filename)}
- variant="outline"
- size="sm"
- className="ml-2 dark:border-[#3D3D3D]"
- >
-
-
-
- ))}
-
-
- {isIngesting ? (
- <>
-
- Ingesting...
- >
- ) : (
- <>
-
- Run Final Ingest
- >
- )}
-
-
- )}
)}
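For reference, the two create_ingest payload shapes the new handleS3BedrockIngest sends, written out as Python dictionaries for brevity; the bucket names, keys, and region are placeholders:

    # "multi": AWS Bedrock BDA processes raw documents from the input bucket
    # and the results are loaded from the output bucket.
    multi_payload = {
        "data_source": "s3",
        "data_source_config": {
            "aws_access_key": "<AWS_ACCESS_KEY>",
            "aws_secret_key": "<AWS_SECRET_KEY>",
            "input_bucket": "my-input-bucket",
            "output_bucket": "my-output-bucket",
            "region_name": "us-east-1",
        },
        "loader_config": {"doc_id_field": "doc_id", "content_field": "content", "doc_type": "markdown"},
        "file_format": "multi",
    }

    # "json": ingest pre-built JSON directly from an s3:// data path.
    json_payload = {
        "data_source": "s3",
        "data_source_config": {
            "aws_access_key": "<AWS_ACCESS_KEY>",
            "aws_secret_key": "<AWS_SECRET_KEY>",
        },
        "loader_config": {"doc_id_field": "url", "content_field": "content", "doc_type": ""},
        "file_format": "json",
    }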
From 37ccf7089b0acff64cf9aba7827061df097af4ac Mon Sep 17 00:00:00 2001
From: Prins Kumar
Date: Tue, 18 Nov 2025 18:32:55 +0530
Subject: [PATCH 09/15] Update README for OpenAI and Bedrock config, add
pymupdf4llm license
---
README.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 13c88b3..9469ad6 100644
--- a/README.md
+++ b/README.md
@@ -482,7 +482,7 @@ In addition to the `OPENAI_API_KEY`, `llm_model` and `model_name` can be edited
"model_kwargs": {
"temperature": 0
},
- "prompt_path": "./common/prompts/openai_gpt4/"
+ "prompt_path": "./app/prompts/openai_gpt4/"
},
"multimodal_service": {
"llm_service": "openai",
@@ -614,7 +614,7 @@ In addition to the `AZURE_OPENAI_ENDPOINT`, `AZURE_OPENAI_API_KEY`, and `azure_d
"temperature": 0,
"max_tokens": 4096
},
- "prompt_path": "./common/prompts/openai_gpt4/"
+ "prompt_path": "./app/prompts/aws_bedrock_claude3haiku/"
},
"multimodal_service": {
"llm_service": "bedrock",
From 74ce839bc08266433ba26d867eb39cd882a17f69 Mon Sep 17 00:00:00 2001
From: Prins Kumar
Date: Tue, 18 Nov 2025 22:55:22 +0530
Subject: [PATCH 10/15] Fix prompt_path to use ./common/prompts/ for OpenAI and
Bedrock
---
README.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 9469ad6..13c88b3 100644
--- a/README.md
+++ b/README.md
@@ -482,7 +482,7 @@ In addition to the `OPENAI_API_KEY`, `llm_model` and `model_name` can be edited
"model_kwargs": {
"temperature": 0
},
- "prompt_path": "./app/prompts/openai_gpt4/"
+ "prompt_path": "./common/prompts/openai_gpt4/"
},
"multimodal_service": {
"llm_service": "openai",
@@ -614,7 +614,7 @@ In addition to the `AZURE_OPENAI_ENDPOINT`, `AZURE_OPENAI_API_KEY`, and `azure_d
"temperature": 0,
"max_tokens": 4096
},
- "prompt_path": "./app/prompts/aws_bedrock_claude3haiku/"
+ "prompt_path": "./common/prompts/openai_gpt4/"
},
"multimodal_service": {
"llm_service": "bedrock",
From 5090a71ded00a672bf248e3af5143ce8842bbe18 Mon Sep 17 00:00:00 2001
From: Prins Kumar
Date: Fri, 21 Nov 2025 20:57:53 +0530
Subject: [PATCH 11/15] Bug fixes: write processed documents to a single JSONL and load them in one call
---
graphrag/app/supportai/supportai.py | 79 +++++++++--------------------
1 file changed, 23 insertions(+), 56 deletions(-)
diff --git a/graphrag/app/supportai/supportai.py b/graphrag/app/supportai/supportai.py
index 88542dc..8dea43a 100644
--- a/graphrag/app/supportai/supportai.py
+++ b/graphrag/app/supportai/supportai.py
@@ -501,18 +501,17 @@ def create_ingest(
documents = server_processing_result.get("documents", [])
doc_count = len(documents)
- # Save each document as a separate JSON file
- for idx, doc_data in enumerate(documents):
- doc_filename = f"doc_{idx}_{doc_data.get('doc_id', 'unknown')}.json"
- doc_filepath = os.path.join(temp_folder, doc_filename)
- with open(doc_filepath, 'w', encoding='utf-8') as f:
- json.dump(doc_data, f, ensure_ascii=False, indent=2)
+ # Save all documents to a single JSONL file
+ jsonl_filepath = os.path.join(temp_folder, "processed_documents.jsonl")
+ with open(jsonl_filepath, 'w', encoding='utf-8') as f:
+ for doc_data in documents:
+ f.write(json.dumps(doc_data, ensure_ascii=False) + '\n')
# Clear documents from memory immediately after saving
documents.clear()
server_processing_result.clear()
- logger.info(f"Saved {doc_count} processed documents to {temp_folder}")
+ logger.info(f"Saved {doc_count} processed documents to {jsonl_filepath}")
res_ingest_config["temp_session_id"] = temp_session_id
res_ingest_config["temp_folder"] = temp_folder
@@ -671,7 +670,6 @@ def ingest(
}
elif ingest_config.get("data_source") == "server":
try:
- processed_files = []
data_source_id = ingest_config.get("data_source_id", "DocumentContent")
# Read from temporary folder
@@ -679,54 +677,23 @@ def ingest(
if not temp_folder or not os.path.exists(temp_folder):
raise Exception(f"Temporary folder not found: {temp_folder}")
- # Read all JSON files from temp folder
- json_files = [f for f in os.listdir(temp_folder) if f.endswith('.json')]
- logger.info(f"Reading {len(json_files)} documents from {temp_folder}")
+ # Read the processed_documents.jsonl file written by create_ingest
+ jsonl_file = os.path.join(temp_folder, "processed_documents.jsonl")
+ if not os.path.exists(jsonl_file):
+ raise Exception(f"Processed documents file not found: {jsonl_file}")
- for json_filename in json_files:
- json_filepath = os.path.join(temp_folder, json_filename)
- try:
- with open(json_filepath, 'r', encoding='utf-8') as f:
- doc_data = json.load(f)
-
- if not doc_data.get("doc_id"):
- logger.warning(f"Skipping invalid document: {json_filename}")
- continue
- # Skip documents with neither content nor image_data
- if not doc_data.get("content") and not doc_data.get("image_data"):
- logger.warning(f"Skipping document with no content: {json_filename}")
- continue
-
- if doc_data.get("image_data"):
- payload = {
- "doc_id": doc_data.get("doc_id", ""),
- "doc_type": "image",
- "image_data": doc_data.get("image_data", ""),
- "image_format": doc_data.get("image_format", "jpg"),
- "image_description": doc_data.get("image_description", ""),
- "parent_doc": doc_data.get("parent_doc", ""),
- "page_number": doc_data.get("page_number", 0),
- "width": doc_data.get("width", 0),
- "height": doc_data.get("height", 0),
- "position": doc_data.get("position", 0),
- "content": ""
- }
- else:
- payload = {
- "doc_id": doc_data.get("doc_id", ""),
- "doc_type": doc_data.get("doc_type", "markdown"),
- "content": doc_data.get("content", "")
- }
- payload_json = json.dumps(payload)
- conn.runLoadingJobWithData(payload_json, data_source_id, loader_info.load_job_id)
- processed_files.append({
- 'file_path': doc_data.get("doc_id", ""),
- 'parent_doc': doc_data.get("parent_doc", ""),
- })
- logger.info(f"Data uploading done for doc_id: {doc_data.get('doc_id', 'unknown')}")
- except Exception as file_error:
- logger.error(f"Error processing file {json_filename}: {file_error}")
- continue
+ # Read entire JSONL content as a single string
+ with open(jsonl_file, 'r', encoding='utf-8') as f:
+ jsonl_content = f.read()
+
+ # Count documents for logging
+ document_count = jsonl_content.count('\n') if jsonl_content.strip() else 0
+ logger.info(f"Ingesting {document_count} documents from {jsonl_file}")
+
+ # Pass entire JSONL content in ONE call (efficient!)
+ conn.runLoadingJobWithData(jsonl_content, data_source_id, loader_info.load_job_id)
+
+ logger.info(f"Successfully ingested {document_count} documents")
# Clean up temp folder after successful ingestion
try:
@@ -740,7 +707,7 @@ def ingest(
raise Exception(f"Error during server markdown extraction and TigerGraph loading: {e}")
return {
"job_name": loader_info.load_job_id,
- "summary": processed_files
+ "summary": f"Data ingestion successful - processed {document_count} documents"
}
else:
raise Exception("Data source and file format combination not implemented")
From 5417f8872fd55ef7d2598b95314b6e7d05eb9fd0 Mon Sep 17 00:00:00 2001
From: Prins Kumar
Date: Mon, 1 Dec 2025 17:16:28 +0530
Subject: [PATCH 12/15] Merge latest main and consolidate markdown_parsing.py
into text_extractors.py
---
common/requirements.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/common/requirements.txt b/common/requirements.txt
index f0022f3..3bbd096 100644
--- a/common/requirements.txt
+++ b/common/requirements.txt
@@ -108,7 +108,7 @@ ordered-set==4.1.0
orjson==3.10.18
packaging==24.2
pandas==2.2.3
-#pathtools==0.1.2
+pathtools==0.1.2
pillow==11.2.1
#PyMuPDF==1.26.4
pymupdf4llm==0.2.0
From eefcd67d012fd787a2555ba2f7825c585ae0a211 Mon Sep 17 00:00:00 2001
From: Prins Kumar
Date: Wed, 3 Dec 2025 15:23:46 +0530
Subject: [PATCH 13/15] Redesign temp file storage: save immediately during
file processing instead of after
---
common/utils/text_extractors.py | 77 ++++++++++++++++++++++++-----
graphrag/app/supportai/supportai.py | 14 +++++-
2 files changed, 77 insertions(+), 14 deletions(-)
diff --git a/common/utils/text_extractors.py b/common/utils/text_extractors.py
index ec5b140..4f22822 100644
--- a/common/utils/text_extractors.py
+++ b/common/utils/text_extractors.py
@@ -42,10 +42,11 @@ def __init__(self):
'.jpg': 'image/jpeg'
}
- async def _process_file_async(self, file_path, folder_path_obj, graphname):
+ async def _process_file_async(self, file_path, folder_path_obj, graphname, temp_folder=None, file_counter=None):
"""
Async helper to process a single file.
Runs in thread pool to avoid blocking on I/O operations.
+ If temp_folder is provided, saves documents immediately and returns metadata only.
"""
try:
loop = asyncio.get_event_loop()
@@ -57,6 +58,27 @@ async def _process_file_async(self, file_path, folder_path_obj, graphname):
graphname
)
+ # If temp_folder provided, save immediately and return metadata only
+ if temp_folder and doc_entries:
+ saved_files = []
+ for idx, doc_data in enumerate(doc_entries):
+ # Use file_counter for unique naming across all files
+ counter_val = next(file_counter) if file_counter else idx
+ doc_filename = f"doc_{counter_val}_{doc_data.get('doc_id', 'unknown')}.json"
+ doc_filepath = os.path.join(temp_folder, doc_filename)
+ with open(doc_filepath, 'w', encoding='utf-8') as f:
+ json.dump(doc_data, f, ensure_ascii=False, indent=2)
+ saved_files.append(doc_filename)
+
+ # Return metadata only, not full documents (memory efficient)
+ return {
+ 'success': True,
+ 'file_path': str(file_path),
+ 'saved_files': saved_files,
+ 'num_documents': len(doc_entries)
+ }
+
+ # No temp_folder - return documents in memory (legacy behavior)
return {
'success': True,
'file_path': str(file_path),
@@ -72,10 +94,11 @@ async def _process_file_async(self, file_path, folder_path_obj, graphname):
logger.warning(f"Failed to process file {file_path}: {e}")
return {'success': False, 'file_path': str(file_path), 'error': str(e)}
- async def _process_folder_async(self, folder_path, graphname=None, max_concurrent=10):
+ async def _process_folder_async(self, folder_path, graphname=None, max_concurrent=10, temp_folder=None):
"""
Async version of process_folder for parallel file processing.
This prevents conflicts when multiple users process folders simultaneously.
+ If temp_folder is provided, saves documents immediately to disk instead of holding in memory.
"""
logger.info(f"Processing local folder ASYNC: {folder_path} for graph: {graphname} (max_concurrent={max_concurrent})")
@@ -87,6 +110,11 @@ async def _process_folder_async(self, folder_path, graphname=None, max_concurren
if not folder_path_obj.is_dir():
raise Exception(f"Path is not a directory: {folder_path}")
+ # Create temp folder if provided
+ if temp_folder:
+ os.makedirs(temp_folder, exist_ok=True)
+ logger.info(f"Saving processed documents to: {temp_folder}")
+
def safe_walk(path):
try:
for item in path.iterdir():
@@ -111,16 +139,20 @@ def safe_walk(path):
logger.info(f"Found {len(files_to_process)} files to process")
semaphore = asyncio.Semaphore(max_concurrent)
+
+ # Thread-safe counter for unique file naming
+ file_counter = iter(range(100000)) if temp_folder else None
async def process_with_semaphore(file_path):
async with semaphore:
- return await self._process_file_async(file_path, folder_path_obj, graphname)
+ return await self._process_file_async(file_path, folder_path_obj, graphname, temp_folder, file_counter)
tasks = [process_with_semaphore(fp) for fp in files_to_process]
results = await asyncio.gather(*tasks, return_exceptions=True)
all_documents = []
processed_files_info = []
+ total_saved_files = []
for result in results:
if isinstance(result, Exception):
@@ -128,10 +160,15 @@ async def process_with_semaphore(file_path):
continue
if result.get('success'):
- all_documents.extend(result.get('documents', []))
+ # If temp_folder was used, documents are saved to disk
+ if temp_folder:
+ total_saved_files.extend(result.get('saved_files', []))
+ else:
+ all_documents.extend(result.get('documents', []))
+
processed_files_info.append({
'file_path': result['file_path'],
- 'num_documents': result.get('num_documents', len(result.get('documents', []))),
+ 'num_documents': result.get('num_documents', 0),
'status': 'success'
})
else:
@@ -141,23 +178,39 @@ async def process_with_semaphore(file_path):
'error': result.get('error', 'Unknown error')
})
- logger.info(f"Processed {len(processed_files_info)} files, extracted {len(all_documents)} total documents")
+ total_docs = len(total_saved_files) if temp_folder else len(all_documents)
+ logger.info(f"Processed {len(processed_files_info)} files, extracted {total_docs} total documents")
- return {
+ response = {
'statusCode': 200,
- 'message': f'Processed {len(processed_files_info)} files, {len(all_documents)} documents',
- 'documents': all_documents,
+ 'message': f'Processed {len(processed_files_info)} files, {total_docs} documents',
'files': processed_files_info,
- 'num_documents': len(all_documents)
+ 'num_documents': total_docs
}
+
+ # Only include documents in response if NOT saving to temp_folder
+ if temp_folder:
+ response['saved_to_temp'] = True
+ response['temp_folder'] = temp_folder
+ response['saved_files'] = total_saved_files
+ else:
+ response['documents'] = all_documents
+
+ return response
- def process_folder(self, folder_path, graphname=None):
+ def process_folder(self, folder_path, graphname=None, temp_folder=None):
"""
Process local folder with multiple file formats and extract text content.
Uses async processing internally for parallel file handling.
+
+ Args:
+ folder_path: Path to the folder containing files to process
+ graphname: Name of the graph (for context)
+ temp_folder: Optional path to save processed documents immediately.
+ If provided, documents are saved to disk instead of returned in memory.
"""
logger.info(f"Processing local folder: {folder_path} for graph: {graphname}")
- return asyncio.run(self._process_folder_async(folder_path, graphname))
+ return asyncio.run(self._process_folder_async(folder_path, graphname, temp_folder=temp_folder))
def extract_text_from_file_with_images_as_docs(file_path, graphname=None):
diff --git a/graphrag/app/supportai/supportai.py b/graphrag/app/supportai/supportai.py
index 8dea43a..63cfa3d 100644
--- a/graphrag/app/supportai/supportai.py
+++ b/graphrag/app/supportai/supportai.py
@@ -485,12 +485,22 @@ def create_ingest(
if data_path is None:
raise Exception("Folder path not provided for server processing")
try:
+ # Create temp folder BEFORE processing so extractor can save directly
+ temp_session_id = str(uuid.uuid4())
+ temp_folder = os.path.join("uploads", "ingestion_temp", graphname, temp_session_id)
+
+ # Process files and save immediately to temp folder (memory efficient)
extractor = TextExtractor()
- server_processing_result = extractor.process_folder(data_path, graphname=graphname)
+ server_processing_result = extractor.process_folder(
+ data_path,
+ graphname=graphname,
+ temp_folder=temp_folder # Extractor saves files as it processes
+ )
+
if server_processing_result.get("statusCode") != 200:
raise Exception(f"Server folder processing failed: {server_processing_result}")
- # Log only summary, NOT the full documents to avoid memory logging
+ doc_count = server_processing_result.get("num_documents", 0)
logger.info(f"Server folder processing completed: {server_processing_result.get('message')}")
# Save processed documents to temporary folder instead of keeping in memory
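Put together, the new create_ingest flow in this patch looks roughly like the following; graphname and data_path are placeholders, error handling is omitted, and the import path assumes TextExtractor lives in common.utils.text_extractors:

    import os
    import uuid
    from common.utils.text_extractors import TextExtractor

    graphname, data_path = "MyGraph", "uploads/MyGraph/downloaded"

    # One temp session per ingest; the extractor saves documents here as it processes files.
    temp_session_id = str(uuid.uuid4())
    temp_folder = os.path.join("uploads", "ingestion_temp", graphname, temp_session_id)

    extractor = TextExtractor()
    result = extractor.process_folder(data_path, graphname=graphname, temp_folder=temp_folder)

    assert result.get("statusCode") == 200
    print(result["num_documents"], "documents saved under", temp_folder)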
From 555a500f3d24bec8fafd556847c324b5f8f5761f Mon Sep 17 00:00:00 2001
From: Prins Kumar
Date: Thu, 4 Dec 2025 21:02:50 +0530
Subject: [PATCH 14/15] Process files into a single JSONL during extraction, then
ingest them with one loading job
---
common/utils/text_extractors.py | 191 +++++++++++++++++++---------
graphrag-ui/src/pages/Setup.tsx | 107 ++++++++--------
graphrag/app/routers/ui.py | 140 +++++++++++++++-----
graphrag/app/supportai/supportai.py | 43 ++-----
4 files changed, 310 insertions(+), 171 deletions(-)
diff --git a/common/utils/text_extractors.py b/common/utils/text_extractors.py
index 4f22822..9b5b652 100644
--- a/common/utils/text_extractors.py
+++ b/common/utils/text_extractors.py
@@ -42,11 +42,11 @@ def __init__(self):
'.jpg': 'image/jpeg'
}
- async def _process_file_async(self, file_path, folder_path_obj, graphname, temp_folder=None, file_counter=None):
+ async def _process_file_async(self, file_path, folder_path_obj, graphname, temp_folder, jsonl_file, jsonl_lock):
"""
Async helper to process a single file.
Runs in thread pool to avoid blocking on I/O operations.
- If temp_folder is provided, saves documents immediately and returns metadata only.
+ Appends documents immediately to JSONL file.
"""
try:
loop = asyncio.get_event_loop()
@@ -58,31 +58,21 @@ async def _process_file_async(self, file_path, folder_path_obj, graphname, temp_
graphname
)
- # If temp_folder provided, save immediately and return metadata only
- if temp_folder and doc_entries:
- saved_files = []
- for idx, doc_data in enumerate(doc_entries):
- # Use file_counter for unique naming across all files
- counter_val = next(file_counter) if file_counter else idx
- doc_filename = f"doc_{counter_val}_{doc_data.get('doc_id', 'unknown')}.json"
- doc_filepath = os.path.join(temp_folder, doc_filename)
- with open(doc_filepath, 'w', encoding='utf-8') as f:
- json.dump(doc_data, f, ensure_ascii=False, indent=2)
- saved_files.append(doc_filename)
-
- # Return metadata only, not full documents (memory efficient)
- return {
- 'success': True,
- 'file_path': str(file_path),
- 'saved_files': saved_files,
- 'num_documents': len(doc_entries)
- }
+ # Append each document to JSONL file immediately
+ if doc_entries:
+ # Use lock to ensure thread-safe writing to JSONL file
+ async with jsonl_lock:
+ await loop.run_in_executor(
+ None,
+ self._append_to_jsonl,
+ jsonl_file,
+ doc_entries
+ )
- # No temp_folder - return documents in memory (legacy behavior)
+ # Return metadata only, documents already saved to JSONL
return {
'success': True,
'file_path': str(file_path),
- 'documents': doc_entries,
'num_documents': len(doc_entries)
}
@@ -93,12 +83,21 @@ async def _process_file_async(self, file_path, folder_path_obj, graphname, temp_
except Exception as e:
logger.warning(f"Failed to process file {file_path}: {e}")
return {'success': False, 'file_path': str(file_path), 'error': str(e)}
+
+ def _append_to_jsonl(self, jsonl_file, doc_entries):
+ """
+ Append document entries to JSONL file.
+ Each document is written as a separate line.
+ """
+ with open(jsonl_file, 'a', encoding='utf-8') as f:
+ for doc_data in doc_entries:
+ json_line = json.dumps(doc_data, ensure_ascii=False)
+ f.write(json_line + '\n')
- async def _process_folder_async(self, folder_path, graphname=None, max_concurrent=10, temp_folder=None):
+ async def _process_folder_async(self, folder_path, graphname, temp_folder, max_concurrent=10):
"""
Async version of process_folder for parallel file processing.
- This prevents conflicts when multiple users process folders simultaneously.
- If temp_folder is provided, saves documents immediately to disk instead of holding in memory.
+ Saves all documents immediately to a single JSONL file as they are processed.
"""
logger.info(f"Processing local folder ASYNC: {folder_path} for graph: {graphname} (max_concurrent={max_concurrent})")
@@ -110,10 +109,12 @@ async def _process_folder_async(self, folder_path, graphname=None, max_concurren
if not folder_path_obj.is_dir():
raise Exception(f"Path is not a directory: {folder_path}")
- # Create temp folder if provided
- if temp_folder:
- os.makedirs(temp_folder, exist_ok=True)
- logger.info(f"Saving processed documents to: {temp_folder}")
+ # Create temp folder and JSONL file
+ os.makedirs(temp_folder, exist_ok=True)
+ jsonl_file = os.path.join(temp_folder, "processed_documents.jsonl")
+ # Create async lock for thread-safe JSONL writing
+ jsonl_lock = asyncio.Lock()
+ logger.info(f"Saving processed documents to: {jsonl_file}")
def safe_walk(path):
try:
@@ -139,20 +140,16 @@ def safe_walk(path):
logger.info(f"Found {len(files_to_process)} files to process")
semaphore = asyncio.Semaphore(max_concurrent)
-
- # Thread-safe counter for unique file naming
- file_counter = iter(range(100000)) if temp_folder else None
async def process_with_semaphore(file_path):
async with semaphore:
- return await self._process_file_async(file_path, folder_path_obj, graphname, temp_folder, file_counter)
+ return await self._process_file_async(file_path, folder_path_obj, graphname, temp_folder, jsonl_file, jsonl_lock)
tasks = [process_with_semaphore(fp) for fp in files_to_process]
results = await asyncio.gather(*tasks, return_exceptions=True)
- all_documents = []
processed_files_info = []
- total_saved_files = []
+ total_docs = 0
for result in results:
if isinstance(result, Exception):
@@ -160,15 +157,12 @@ async def process_with_semaphore(file_path):
continue
if result.get('success'):
- # If temp_folder was used, documents are saved to disk
- if temp_folder:
- total_saved_files.extend(result.get('saved_files', []))
- else:
- all_documents.extend(result.get('documents', []))
+ num_docs = result.get('num_documents', 0)
+ total_docs += num_docs
processed_files_info.append({
'file_path': result['file_path'],
- 'num_documents': result.get('num_documents', 0),
+ 'num_documents': num_docs,
'status': 'success'
})
else:
@@ -178,39 +172,118 @@ async def process_with_semaphore(file_path):
'error': result.get('error', 'Unknown error')
})
- total_docs = len(total_saved_files) if temp_folder else len(all_documents)
logger.info(f"Processed {len(processed_files_info)} files, extracted {total_docs} total documents")
- response = {
+ return {
'statusCode': 200,
'message': f'Processed {len(processed_files_info)} files, {total_docs} documents',
'files': processed_files_info,
- 'num_documents': total_docs
+ 'num_documents': total_docs,
+ 'temp_folder': temp_folder,
+ 'jsonl_file': jsonl_file
}
-
- # Only include documents in response if NOT saving to temp_folder
- if temp_folder:
- response['saved_to_temp'] = True
- response['temp_folder'] = temp_folder
- response['saved_files'] = total_saved_files
- else:
- response['documents'] = all_documents
-
- return response
- def process_folder(self, folder_path, graphname=None, temp_folder=None):
+ def process_folder(self, folder_path, graphname, temp_folder):
"""
Process local folder with multiple file formats and extract text content.
Uses async processing internally for parallel file handling.
+ Saves all documents to JSONL file immediately as they are processed.
Args:
folder_path: Path to the folder containing files to process
graphname: Name of the graph (for context)
- temp_folder: Optional path to save processed documents immediately.
- If provided, documents are saved to disk instead of returned in memory.
+ temp_folder: Path to save processed documents as JSONL file
"""
logger.info(f"Processing local folder: {folder_path} for graph: {graphname}")
- return asyncio.run(self._process_folder_async(folder_path, graphname, temp_folder=temp_folder))
+ return asyncio.run(self._process_folder_async(folder_path, graphname, temp_folder))
+
+ def delete_file_from_jsonl(self, temp_folder, filename):
+ """
+ Delete all documents related to a specific file from the JSONL file.
+
+ Args:
+ temp_folder: Path to the temp folder containing processed_documents.jsonl
+ filename: Original filename (e.g., "report.pdf", "stock_gs200.jpg")
+
+ Returns:
+ dict with status and number of documents removed
+ """
+ jsonl_file = os.path.join(temp_folder, "processed_documents.jsonl")
+
+ if not os.path.exists(jsonl_file):
+ logger.warning(f"JSONL file not found: {jsonl_file}")
+ return {'success': False, 'error': 'JSONL file not found'}
+
+ # Get base name without extension to match doc_id
+ base_name = Path(filename).stem
+ logger.info(f"Deleting documents for file: {filename} (base_name: '{base_name}')")
+
+ # Read all lines and filter out ones matching this file
+ remaining_lines = []
+ removed_count = 0
+ removed_doc_ids = []
+
+ try:
+ with open(jsonl_file, 'r', encoding='utf-8') as f:
+ for line_num, line in enumerate(f, 1):
+ line = line.strip()
+ if not line:
+ continue
+
+ try:
+ doc_data = json.loads(line)
+ doc_id = doc_data.get('doc_id', '')
+
+ # Check if doc_id matches the base_name or starts with base_name_
+ # Handles: "stock_gs200" == "stock_gs200" or "stock_gs200_image_1".startswith("stock_gs200_")
+ if doc_id == base_name or doc_id.startswith(f"{base_name}_"):
+ removed_count += 1
+ removed_doc_ids.append(doc_id)
+ logger.info(f"Removing document: {doc_id}")
+ else:
+ remaining_lines.append(line)
+ except json.JSONDecodeError as e:
+ logger.warning(f"Skipping invalid JSON at line {line_num}: {e}")
+ # Keep invalid lines in case they're important
+ remaining_lines.append(line)
+
+ if removed_count == 0:
+ logger.warning(f"No documents found matching base_name: '{base_name}'")
+ return {
+ 'success': False,
+ 'error': f'No documents found for {filename}',
+ 'removed_count': 0
+ }
+
+ # If no lines remain, delete the entire temp folder
+ if not remaining_lines:
+ logger.info(f"No documents remaining, deleting temp folder: {temp_folder}")
+ import shutil
+ shutil.rmtree(temp_folder, ignore_errors=True)
+ return {
+ 'success': True,
+ 'removed_count': removed_count,
+ 'removed_doc_ids': removed_doc_ids,
+ 'temp_folder_deleted': True
+ }
+
+ # Write remaining lines back to JSONL
+ with open(jsonl_file, 'w', encoding='utf-8') as f:
+ for line in remaining_lines:
+ f.write(line + '\n')
+
+ logger.info(f"Removed {removed_count} documents ({', '.join(removed_doc_ids)}), {len(remaining_lines)} remaining")
+ return {
+ 'success': True,
+ 'removed_count': removed_count,
+ 'removed_doc_ids': removed_doc_ids,
+ 'remaining_count': len(remaining_lines),
+ 'temp_folder_deleted': False
+ }
+
+ except Exception as e:
+ logger.error(f"Error deleting from JSONL: {e}")
+ return {'success': False, 'error': str(e)}
def extract_text_from_file_with_images_as_docs(file_path, graphname=None):
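The concurrency-sensitive piece of this change is the shared JSONL file. A stripped-down sketch of the lock-plus-executor pattern used above, with dummy documents standing in for real extraction output:

    import asyncio
    import json

    def _append_to_jsonl(jsonl_file, doc_entries):
        # Blocking file append; one JSON document per line.
        with open(jsonl_file, "a", encoding="utf-8") as f:
            for doc in doc_entries:
                f.write(json.dumps(doc, ensure_ascii=False) + "\n")

    async def append_documents(jsonl_file, docs, lock):
        loop = asyncio.get_event_loop()
        # Serialize writers with an asyncio.Lock, then push the blocking I/O off the event loop.
        async with lock:
            await loop.run_in_executor(None, _append_to_jsonl, jsonl_file, docs)

    async def main():
        lock = asyncio.Lock()
        await asyncio.gather(*[
            append_documents("processed_documents.jsonl", [{"doc_id": f"doc_{i}", "content": "..."}], lock)
            for i in range(5)
        ])

    asyncio.run(main())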
diff --git a/graphrag-ui/src/pages/Setup.tsx b/graphrag-ui/src/pages/Setup.tsx
index 17f952c..c3e4689 100644
--- a/graphrag-ui/src/pages/Setup.tsx
+++ b/graphrag-ui/src/pages/Setup.tsx
@@ -398,11 +398,18 @@ const Setup = () => {
const data = await response.json();
if (data.status === "success") {
- setDownloadMessage(`✅ ${data.message}. Processing...`);
+ setDownloadMessage(`✅ ${data.message}. Processed ${data.doc_count || 0} document(s)`);
await fetchDownloadedFiles();
- // Step 2: Call create_ingest to process downloaded files
- await handleCreateIngestAfterUpload("downloaded");
+ // Save session ID from automatic processing
+ if (data.temp_session_id) {
+ setTempSessionId(data.temp_session_id);
+ await fetchTempFiles(data.temp_session_id);
+ }
+ } else if (data.status === "partial_success") {
+ setDownloadMessage(`⚠️ ${data.message}`);
+ await fetchDownloadedFiles();
+ // Don't call create_ingest if processing already attempted
} else if (data.status === "warning") {
setDownloadMessage(`⚠️ ${data.message}`);
} else {
@@ -421,22 +428,26 @@ const Setup = () => {
try {
const creds = localStorage.getItem("creds");
- const response = await fetch(
- `/ui/${ingestGraphName}/cloud/delete?filename=${encodeURIComponent(filename)}`,
- {
- method: "DELETE",
- headers: { Authorization: `Basic ${creds}` },
- }
- );
- const data = await response.json();
- // Also delete corresponding temp files if session exists
+ // Build URL with session_id if available
+ let deleteUrl = `/ui/${ingestGraphName}/cloud/delete?filename=${encodeURIComponent(filename)}`;
if (tempSessionId) {
- await handleDeleteTempFilesForOriginal(filename);
+ deleteUrl += `&session_id=${encodeURIComponent(tempSessionId)}`;
}
+ const response = await fetch(deleteUrl, {
+ method: "DELETE",
+ headers: { Authorization: `Basic ${creds}` },
+ });
+ const data = await response.json();
+
setDownloadMessage(`✅ ${data.message}`);
await fetchDownloadedFiles();
+
+ // Refresh temp files list if session exists
+ if (tempSessionId) {
+ await fetchTempFiles(tempSessionId);
+ }
} catch (error: any) {
setDownloadMessage(`❌ Error: ${error.message}`);
}
@@ -451,13 +462,26 @@ const Setup = () => {
try {
const creds = localStorage.getItem("creds");
- const response = await fetch(`/ui/${ingestGraphName}/cloud/delete`, {
+
+ // Build URL with session_id if available
+ let deleteUrl = `/ui/${ingestGraphName}/cloud/delete`;
+ if (tempSessionId) {
+ deleteUrl += `?session_id=${encodeURIComponent(tempSessionId)}`;
+ }
+
+ const response = await fetch(deleteUrl, {
method: "DELETE",
headers: { Authorization: `Basic ${creds}` },
});
const data = await response.json();
setDownloadMessage(`✅ ${data.message}`);
await fetchDownloadedFiles();
+
+ // Clear session ID and refresh temp files
+ if (tempSessionId) {
+ setTempSessionId(null);
+ setTempFiles([]);
+ }
} catch (error: any) {
setDownloadMessage(`❌ Error: ${error.message}`);
}
@@ -541,44 +565,24 @@ const Setup = () => {
}
try {
- // Extract base name without extension (e.g., "document.pdf" -> "document")
- const baseName = originalFilename.replace(/\.[^/.]+$/, "");
- console.log("Base name:", baseName);
-
const creds = localStorage.getItem("creds");
- // Fetch temp files to find matches
- const response = await fetch(`/ui/${ingestGraphName}/ingestion_temp/list?session_id=${tempSessionId}`, {
- headers: { Authorization: `Basic ${creds}` },
- });
- const data = await response.json();
- console.log("Temp files list response:", data);
-
- if (data.status === "success" && data.sessions.length > 0) {
- const files = data.sessions[0].files || [];
- console.log("All temp files:", files.map((f: any) => f.filename));
-
- // Find temp files matching pattern: doc_{idx}_{baseName}*.json
- const matchingFiles = files.filter((f: any) => f.filename.includes(`_${baseName}`));
- console.log("Matching files to delete:", matchingFiles.map((f: any) => f.filename));
-
- // Delete each matching file
- for (const file of matchingFiles) {
- console.log("Deleting temp file:", file.filename);
- const deleteResponse = await fetch(
- `/ui/${ingestGraphName}/ingestion_temp/delete?session_id=${tempSessionId}&filename=${encodeURIComponent(file.filename)}`,
- {
- method: "DELETE",
- headers: { Authorization: `Basic ${creds}` },
- }
- );
- const deleteData = await deleteResponse.json();
- console.log("Delete response:", deleteData);
+ // Call the delete endpoint with the original filename
+ // The backend will handle removing all related documents from the JSONL file
+ const deleteResponse = await fetch(
+ `/ui/${ingestGraphName}/ingestion_temp/delete?session_id=${tempSessionId}&filename=${encodeURIComponent(originalFilename)}`,
+ {
+ method: "DELETE",
+ headers: { Authorization: `Basic ${creds}` },
}
-
- console.log(`Successfully deleted ${matchingFiles.length} temp file(s)`);
+ );
+ const deleteData = await deleteResponse.json();
+ console.log("Delete temp files response:", deleteData);
+
+ if (deleteData.status === "success") {
+ console.log(`Successfully deleted processed documents for ${originalFilename}`);
} else {
- console.log("No temp files found or empty sessions");
+ console.error("Failed to delete temp files:", deleteData);
}
} catch (error: any) {
console.error("Error deleting temp files:", error);
@@ -621,7 +625,8 @@ const Setup = () => {
const ingestData = await ingestResponse.json();
console.log("Ingest response:", ingestData);
- setIngestMessage(`✅ Data ingested successfully! Processed ${tempFiles.length} documents.`);
+ const docCount = ingestData.document_count || tempFiles.length;
+ setIngestMessage(`✅ Data ingested successfully! Processed ${docCount} document(s).`);
// Clear temp state
setTempFiles([]);
@@ -691,7 +696,7 @@ const Setup = () => {
data_source_id: createData.data_source_id,
data_path: createData.data_path || createData.file_path,
});
- setIngestMessage(`✅ Processed ${createData.data_source_id.file_count} files. Review them below before ingesting.`);
+ setIngestMessage(`✅ Files processed successfully. Review them below before ingesting.`);
await fetchTempFiles(sessionId);
setIsIngesting(false);
} else {
@@ -802,7 +807,7 @@ const Setup = () => {
await handleRunIngest();
} else {
// Save for later - files ready for ingestion
- setUploadMessage(`✅ Successfully processed ${createData.data_source_id.file_count} files. Ready for ingestion.`);
+ setUploadMessage(`✅ Files processed successfully. Ready for ingestion.`);
}
} else {
console.warn("No session ID returned from create_ingest");
diff --git a/graphrag/app/routers/ui.py b/graphrag/app/routers/ui.py
index 9b012ec..9d8cc33 100644
--- a/graphrag/app/routers/ui.py
+++ b/graphrag/app/routers/ui.py
@@ -1058,7 +1058,8 @@ async def download_from_cloud(
request_body: dict = Body(...),
):
"""
- Download files from cloud storage (S3, GCS, or Azure) to local directory.
+ Download files from cloud storage (S3, GCS, or Azure) to local directory
+ and automatically process them to create JSONL files for ingestion.
Parameters:
- graphname: The graph name to associate downloaded files with
@@ -1252,14 +1253,49 @@ async def download_from_cloud(
logger.info(f"Downloaded {len(downloaded_files)} file(s) from {provider} for graph {graphname}")
- return {
- "status": "success",
- "message": f"Successfully downloaded {len(downloaded_files)} file(s) from {provider}",
- "graphname": graphname,
- "provider": provider,
- "downloaded_files": downloaded_files,
- "local_path": download_dir,
- }
+ # Automatically process downloaded files to create JSONL
+ from common.utils.text_extractors import TextExtractor
+ temp_session_id = str(uuid.uuid4())
+ temp_folder = os.path.join("uploads", "ingestion_temp", graphname, temp_session_id)
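+        # process_folder is expected to write the extracted documents to
+        # processed_documents.jsonl inside this session folder for later review and ingest.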
+
+ try:
+ extractor = TextExtractor()
+ processing_result = extractor.process_folder(
+ download_dir,
+ graphname=graphname,
+ temp_folder=temp_folder
+ )
+
+ if processing_result.get("statusCode") != 200:
+ logger.error(f"Cloud file processing failed: {processing_result}")
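+                # Raising here is caught by the except below and reported as partial_success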
+ raise Exception(f"Failed to process downloaded files: {processing_result}")
+
+ doc_count = processing_result.get("num_documents", 0)
+ logger.info(f"Processed {doc_count} documents from downloaded files")
+
+ return {
+ "status": "success",
+ "message": f"Successfully downloaded and processed {len(downloaded_files)} file(s) from {provider}",
+ "graphname": graphname,
+ "provider": provider,
+ "downloaded_files": downloaded_files,
+ "local_path": download_dir,
+ "temp_session_id": temp_session_id,
+ "temp_folder": temp_folder,
+ "doc_count": doc_count,
+ }
+ except Exception as e:
+ logger.error(f"Error processing downloaded files: {e}")
+            # The download itself succeeded, so report partial_success and surface the processing error
+ return {
+ "status": "partial_success",
+ "message": f"Downloaded {len(downloaded_files)} file(s) but processing failed: {str(e)}",
+ "graphname": graphname,
+ "provider": provider,
+ "downloaded_files": downloaded_files,
+ "local_path": download_dir,
+ "processing_error": str(e),
+ }
except HTTPException:
raise
@@ -1322,13 +1358,16 @@ async def delete_cloud_downloads(
graphname: str,
credentials: Annotated[HTTPBase, Depends(security)],
filename: str = None,
+ session_id: str = None,
):
"""
Delete downloaded cloud files for a specific graph.
+ Also deletes corresponding processed documents from the JSONL file.
Parameters:
- graphname: The graph name whose downloaded files to clear
- filename: If provided, only delete this specific file. Otherwise, delete all files.
+ - session_id: The session ID for the temp folder containing processed JSONL
"""
try:
download_dir = os.path.join("downloaded_files_cloud", graphname)
@@ -1343,9 +1382,26 @@ async def delete_cloud_downloads(
deleted_files = []
if filename:
- # Delete specific file
+ # Delete specific file AND its processed documents from JSONL
file_path = os.path.join(download_dir, filename)
if os.path.exists(file_path) and os.path.isfile(file_path):
+ # If session_id provided, also delete from JSONL
+ if session_id:
+ from common.utils.text_extractors import TextExtractor
+ extractor = TextExtractor()
+
+ temp_folder = os.path.join("uploads", "ingestion_temp", graphname, session_id)
+ if os.path.exists(temp_folder):
+ # Delete from JSONL first
+ delete_result = extractor.delete_file_from_jsonl(temp_folder, filename)
+ logger.info(f"JSONL delete result for {filename}: {delete_result}")
+
+                        # If the JSONL cleanup failed, log a warning but still delete the original file
+                        if not delete_result.get('success'):
+                            logger.warning(f"Failed to delete from JSONL: {delete_result.get('error')}")
+
+ # Delete the original downloaded file
os.remove(file_path)
deleted_files.append(filename)
logger.info(f"Deleted cloud download {filename} for graph {graphname}")
@@ -1353,13 +1409,21 @@ async def delete_cloud_downloads(
raise HTTPException(status_code=404, detail=f"File {filename} not found")
else:
# Delete all files in the directory
- for filename in os.listdir(download_dir):
- file_path = os.path.join(download_dir, filename)
+ for fname in os.listdir(download_dir):
+ file_path = os.path.join(download_dir, fname)
if os.path.isfile(file_path):
os.remove(file_path)
- deleted_files.append(filename)
+ deleted_files.append(fname)
- # Remove the directory if it's empty
+ # If session_id provided, delete the entire temp folder
+ if session_id:
+ temp_folder = os.path.join("uploads", "ingestion_temp", graphname, session_id)
+ if os.path.exists(temp_folder):
+ import shutil
+ shutil.rmtree(temp_folder, ignore_errors=True)
+ logger.info(f"Deleted temp folder for session {session_id}")
+
+ # Remove the download directory if it's empty
if not os.listdir(download_dir):
os.rmdir(download_dir)
@@ -1478,25 +1542,41 @@ async def delete_ingestion_temp_files(
deleted_files = []
if filename:
- # Delete specific file
- file_path = os.path.join(session_dir, filename)
- if os.path.exists(file_path) and os.path.isfile(file_path):
- os.remove(file_path)
- deleted_files.append(filename)
- logger.info(f"Deleted temp file {filename} from session {session_id}")
-
- # If session folder is now empty, remove it
- if not os.listdir(session_dir):
- os.rmdir(session_dir)
- logger.info(f"Removed empty session folder {session_id}")
+            # Delete the processed documents for this original filename from the JSONL.
+            # Note: the session folder holds only processed_documents.jsonl, not the original files.
+ from common.utils.text_extractors import TextExtractor
+ extractor = TextExtractor()
+
+ # Delete from JSONL - MUST succeed
+ delete_result = extractor.delete_file_from_jsonl(session_dir, filename)
+ logger.info(f"JSONL delete result for {filename}: {delete_result}")
+
+ # If JSONL delete failed, return error
+ if not delete_result.get('success'):
+ error_msg = delete_result.get('error', 'Unknown error')
+ logger.error(f"Failed to delete from JSONL: {error_msg}")
+ raise HTTPException(status_code=500, detail=f"Failed to delete processed documents: {error_msg}")
+
+ deleted_files.append(filename)
+ logger.info(f"Deleted {delete_result.get('removed_count', 0)} processed documents for {filename} from JSONL")
+
+ # Check if temp folder was deleted by JSONL cleanup
+ if delete_result.get('temp_folder_deleted'):
+ logger.info(f"Session folder {session_id} was automatically deleted (no documents remaining)")
+ elif not os.path.exists(session_dir):
+ logger.info(f"Session folder {session_id} was deleted")
+ elif not os.listdir(session_dir):
+ # Clean up empty session folder
+ os.rmdir(session_dir)
+ logger.info(f"Removed empty session folder {session_id}")
else:
- raise HTTPException(status_code=404, detail=f"File {filename} not found")
+ logger.info(f"Removed {delete_result.get('removed_count', 0)} documents from JSONL, {delete_result.get('remaining_count', 0)} remaining")
else:
- # Delete entire session folder
+ # Delete entire session folder (including JSONL)
import shutil
- for filename in os.listdir(session_dir):
- if os.path.isfile(os.path.join(session_dir, filename)):
- deleted_files.append(filename)
+ for fname in os.listdir(session_dir):
+ if os.path.isfile(os.path.join(session_dir, fname)):
+ deleted_files.append(fname)
shutil.rmtree(session_dir)
logger.info(f"Deleted session folder {session_id} for graph {graphname}")
diff --git a/graphrag/app/supportai/supportai.py b/graphrag/app/supportai/supportai.py
index 63cfa3d..a9dbbe0 100644
--- a/graphrag/app/supportai/supportai.py
+++ b/graphrag/app/supportai/supportai.py
@@ -502,26 +502,6 @@ def create_ingest(
doc_count = server_processing_result.get("num_documents", 0)
logger.info(f"Server folder processing completed: {server_processing_result.get('message')}")
-
- # Save processed documents to temporary folder instead of keeping in memory
- temp_session_id = str(uuid.uuid4())
- temp_folder = os.path.join("uploads", "ingestion_temp", graphname, temp_session_id)
- os.makedirs(temp_folder, exist_ok=True)
-
- documents = server_processing_result.get("documents", [])
- doc_count = len(documents)
-
- # Save all documents to a single JSONL file (our new logic)
- jsonl_filepath = os.path.join(temp_folder, "processed_documents.jsonl")
- with open(jsonl_filepath, 'w', encoding='utf-8') as f:
- for doc_data in documents:
- f.write(json.dumps(doc_data, ensure_ascii=False) + '\n')
-
- # Clear documents from memory immediately after saving
- documents.clear()
- server_processing_result.clear()
-
- logger.info(f"Saved {doc_count} processed documents to {jsonl_filepath}")
res_ingest_config["temp_session_id"] = temp_session_id
res_ingest_config["temp_folder"] = temp_folder
@@ -682,28 +662,28 @@ def ingest(
try:
data_source_id = ingest_config.get("data_source_id", "DocumentContent")
- # Read from temporary folder
+ # Read from temporary folder's JSONL file
temp_folder = ingest_config.get("temp_folder")
if not temp_folder or not os.path.exists(temp_folder):
raise Exception(f"Temporary folder not found: {temp_folder}")
- # Read the processed_documents.jsonl file (our new logic)
+            # Locate the processed_documents.jsonl file in the temp folder
jsonl_file = os.path.join(temp_folder, "processed_documents.jsonl")
if not os.path.exists(jsonl_file):
- raise Exception(f"Processed documents file not found: {jsonl_file}")
+ raise Exception(f"JSONL file not found: {jsonl_file}")
- # Read entire JSONL content as a single string
+ logger.info(f"Reading JSONL file: {jsonl_file}")
+
+ # Read entire JSONL content
with open(jsonl_file, 'r', encoding='utf-8') as f:
jsonl_content = f.read()
- # Count documents for logging
- document_count = jsonl_content.count('\n') if jsonl_content.strip() else 0
- logger.info(f"Ingesting {document_count} documents from {jsonl_file}")
-
- # Pass entire JSONL content in ONE call (efficient!)
+ # Load all documents in one call - runLoadingJobWithData supports JSONL format
conn.runLoadingJobWithData(jsonl_content, data_source_id, loader_info.load_job_id)
- logger.info(f"Successfully ingested {document_count} documents")
+ # Count documents for reporting
+ doc_count = sum(1 for line in jsonl_content.strip().split('\n') if line.strip())
+ logger.info(f"Successfully ingested {doc_count} documents from JSONL")
# Clean up temp folder after successful ingestion
try:
@@ -717,7 +697,8 @@ def ingest(
raise Exception(f"Error during server markdown extraction and TigerGraph loading: {e}")
return {
"job_name": loader_info.load_job_id,
- "summary": f"Data ingestion successful - processed {document_count} documents"
+ "summary": f"Successfully ingested {doc_count} documents from JSONL",
+ "document_count": doc_count
}
else:
raise Exception("Data source and file format combination not implemented")
From 2d54b02a766ab209c65b8d27bd0c2741f906efb4 Mon Sep 17 00:00:00 2001
From: Prins Kumar
Date: Fri, 5 Dec 2025 21:52:01 +0530
Subject: [PATCH 15/15] Fix: Delete processed content from JSONL before
deleting original files
---
graphrag-ui/src/pages/Setup.tsx | 364 ++++++++++++++++----------------
graphrag/app/routers/ui.py | 170 ++++++---------
2 files changed, 238 insertions(+), 296 deletions(-)
diff --git a/graphrag-ui/src/pages/Setup.tsx b/graphrag-ui/src/pages/Setup.tsx
index c3e4689..0e15939 100644
--- a/graphrag-ui/src/pages/Setup.tsx
+++ b/graphrag-ui/src/pages/Setup.tsx
@@ -40,7 +40,7 @@ const Setup = () => {
const navigate = useNavigate();
const [confirm, confirmDialog, isConfirmDialogOpen] = useConfirm();
const [availableGraphs, setAvailableGraphs] = useState([]);
-
+
const [initializeGraphOpen, setInitializeGraphOpen] = useState(false);
const [graphName, setGraphName] = useState("");
const [isInitializing, setIsInitializing] = useState(false);
@@ -71,7 +71,7 @@ const Setup = () => {
const [refreshGraphName, setRefreshGraphName] = useState("");
const [isRebuildRunning, setIsRebuildRunning] = useState(false);
const [isCheckingStatus, setIsCheckingStatus] = useState(false);
-
+
// S3 state
const [fileFormat, setFileFormat] = useState<"json" | "multi">("json");
const [awsAccessKey, setAwsAccessKey] = useState("");
@@ -129,7 +129,7 @@ const Setup = () => {
}
const filesArray = Array.from(selectedFiles);
-
+
// Check if any single file exceeds the server limit
const oversizedFiles = filesArray.filter((file) => file.size > MAX_UPLOAD_SIZE_BYTES);
if (oversizedFiles.length > 0) {
@@ -169,7 +169,7 @@ const Setup = () => {
setUploadMessage(`✅ ${data.message} Processing...`);
setSelectedFiles(null);
await fetchUploadedFiles();
-
+
// Step 2: Call create_ingest to process uploaded files
console.log("Calling handleCreateIngestAfterUpload from main upload...");
await handleCreateIngestAfterUpload("uploaded");
@@ -200,9 +200,9 @@ const Setup = () => {
for (let i = 0; i < filesArray.length; i++) {
const file = filesArray[i];
const fileNumber = i + 1;
-
+
setUploadMessage(`Uploading file ${fileNumber}/${totalFiles}: ${file.name} (${formatBytes(file.size)})...`);
-
+
const formData = new FormData();
formData.append("files", file);
@@ -236,10 +236,10 @@ const Setup = () => {
} else {
setUploadMessage(`⚠️ Uploaded ${uploadedCount} files successfully, ${failedCount} failed. Processing...`);
}
-
+
setSelectedFiles(null);
await fetchUploadedFiles();
-
+
// Step 2: Call create_ingest to process uploaded files
console.log("Calling handleCreateIngestAfterUpload...");
await handleCreateIngestAfterUpload("uploaded");
@@ -261,25 +261,25 @@ const Setup = () => {
try {
const creds = localStorage.getItem("creds");
-
- // Also delete corresponding temp files FIRST if session exists
- if (tempSessionId) {
- console.log("Calling handleDeleteTempFilesForOriginal...");
- await handleDeleteTempFilesForOriginal(filename);
- }
-
- // Then delete original file
- const response = await fetch(
- `/ui/${ingestGraphName}/uploads?filename=${encodeURIComponent(filename)}`,
- {
- method: "DELETE",
- headers: { Authorization: `Basic ${creds}` },
- }
- );
+
+ // Delete original file (backend will also delete processed content from JSONL if session_id is provided)
+ const url = tempSessionId
+ ? `/ui/${ingestGraphName}/uploads?filename=${encodeURIComponent(filename)}&session_id=${tempSessionId}`
+ : `/ui/${ingestGraphName}/uploads?filename=${encodeURIComponent(filename)}`;
+
+ const response = await fetch(url, {
+ method: "DELETE",
+ headers: { Authorization: `Basic ${creds}` },
+ });
const data = await response.json();
-
+
setUploadMessage(`✅ ${data.message}`);
await fetchUploadedFiles();
+
+ // Refresh temp files list if session exists
+ if (tempSessionId) {
+ await fetchTempFiles(tempSessionId);
+ }
} catch (error: any) {
console.error("Delete error:", error);
setUploadMessage(`❌ Error: ${error.message}`);
@@ -300,12 +300,12 @@ const Setup = () => {
headers: { Authorization: `Basic ${creds}` },
});
const data = await response.json();
-
+
// Also clear temp session
if (tempSessionId) {
await handleDeleteAllTempFiles();
}
-
+
setUploadMessage(`✅ ${data.message}`);
await fetchUploadedFiles();
} catch (error: any) {
@@ -341,7 +341,7 @@ const Setup = () => {
try {
const creds = localStorage.getItem("creds");
-
+
// Prepare request body based on provider
let requestBody: any = { provider: cloudProvider };
@@ -398,18 +398,11 @@ const Setup = () => {
const data = await response.json();
if (data.status === "success") {
- setDownloadMessage(`✅ ${data.message}. Processed ${data.doc_count || 0} document(s)`);
+ setDownloadMessage(`✅ ${data.message}. Processing...`);
await fetchDownloadedFiles();
-
- // Save session ID from automatic processing
- if (data.temp_session_id) {
- setTempSessionId(data.temp_session_id);
- await fetchTempFiles(data.temp_session_id);
- }
- } else if (data.status === "partial_success") {
- setDownloadMessage(`⚠️ ${data.message}`);
- await fetchDownloadedFiles();
- // Don't call create_ingest if processing already attempted
+
+ // Step 2: Call create_ingest to process downloaded files
+ await handleCreateIngestAfterUpload("downloaded");
} else if (data.status === "warning") {
setDownloadMessage(`⚠️ ${data.message}`);
} else {
@@ -428,22 +421,21 @@ const Setup = () => {
try {
const creds = localStorage.getItem("creds");
-
- // Build URL with session_id if available
- let deleteUrl = `/ui/${ingestGraphName}/cloud/delete?filename=${encodeURIComponent(filename)}`;
- if (tempSessionId) {
- deleteUrl += `&session_id=${encodeURIComponent(tempSessionId)}`;
- }
-
- const response = await fetch(deleteUrl, {
+
+ // Delete original file (backend will also delete processed content from JSONL if session_id is provided)
+ const url = tempSessionId
+ ? `/ui/${ingestGraphName}/cloud/delete?filename=${encodeURIComponent(filename)}&session_id=${tempSessionId}`
+ : `/ui/${ingestGraphName}/cloud/delete?filename=${encodeURIComponent(filename)}`;
+
+ const response = await fetch(url, {
method: "DELETE",
headers: { Authorization: `Basic ${creds}` },
});
const data = await response.json();
-
+
setDownloadMessage(`✅ ${data.message}`);
await fetchDownloadedFiles();
-
+
// Refresh temp files list if session exists
if (tempSessionId) {
await fetchTempFiles(tempSessionId);
@@ -462,26 +454,13 @@ const Setup = () => {
try {
const creds = localStorage.getItem("creds");
-
- // Build URL with session_id if available
- let deleteUrl = `/ui/${ingestGraphName}/cloud/delete`;
- if (tempSessionId) {
- deleteUrl += `?session_id=${encodeURIComponent(tempSessionId)}`;
- }
-
- const response = await fetch(deleteUrl, {
+ const response = await fetch(`/ui/${ingestGraphName}/cloud/delete`, {
method: "DELETE",
headers: { Authorization: `Basic ${creds}` },
});
const data = await response.json();
setDownloadMessage(`✅ ${data.message}`);
await fetchDownloadedFiles();
-
- // Clear session ID and refresh temp files
- if (tempSessionId) {
- setTempSessionId(null);
- setTempFiles([]);
- }
} catch (error: any) {
setDownloadMessage(`❌ Error: ${error.message}`);
}
@@ -558,31 +537,51 @@ const Setup = () => {
// Delete temp files matching original filename
const handleDeleteTempFilesForOriginal = async (originalFilename: string) => {
console.log("handleDeleteTempFilesForOriginal called with:", originalFilename);
-
+
if (!ingestGraphName || !tempSessionId) {
console.log("No graph name or session ID, returning");
return;
}
try {
+ // Extract base name without extension (e.g., "document.pdf" -> "document")
+ const baseName = originalFilename.replace(/\.[^/.]+$/, "");
+ console.log("Base name:", baseName);
+
const creds = localStorage.getItem("creds");
-
- // Call the delete endpoint with the original filename
- // The backend will handle removing all related documents from the JSONL file
- const deleteResponse = await fetch(
- `/ui/${ingestGraphName}/ingestion_temp/delete?session_id=${tempSessionId}&filename=${encodeURIComponent(originalFilename)}`,
- {
- method: "DELETE",
- headers: { Authorization: `Basic ${creds}` },
+
+ // Fetch temp files to find matches
+ const response = await fetch(`/ui/${ingestGraphName}/ingestion_temp/list?session_id=${tempSessionId}`, {
+ headers: { Authorization: `Basic ${creds}` },
+ });
+ const data = await response.json();
+ console.log("Temp files list response:", data);
+
+ if (data.status === "success" && data.sessions.length > 0) {
+ const files = data.sessions[0].files || [];
+ console.log("All temp files:", files.map((f: any) => f.filename));
+
+ // Find temp files matching pattern: doc_{idx}_{baseName}*.json
+ const matchingFiles = files.filter((f: any) => f.filename.includes(`_${baseName}`));
+ console.log("Matching files to delete:", matchingFiles.map((f: any) => f.filename));
+
+ // Delete each matching file
+ for (const file of matchingFiles) {
+ console.log("Deleting temp file:", file.filename);
+ const deleteResponse = await fetch(
+ `/ui/${ingestGraphName}/ingestion_temp/delete?session_id=${tempSessionId}&filename=${encodeURIComponent(file.filename)}`,
+ {
+ method: "DELETE",
+ headers: { Authorization: `Basic ${creds}` },
+ }
+ );
+ const deleteData = await deleteResponse.json();
+ console.log("Delete response:", deleteData);
}
- );
- const deleteData = await deleteResponse.json();
- console.log("Delete temp files response:", deleteData);
-
- if (deleteData.status === "success") {
- console.log(`Successfully deleted processed documents for ${originalFilename}`);
+
+ console.log(`Successfully deleted ${matchingFiles.length} temp file(s)`);
} else {
- console.error("Failed to delete temp files:", deleteData);
+ console.log("No temp files found or empty sessions");
}
} catch (error: any) {
console.error("Error deleting temp files:", error);
@@ -625,9 +624,8 @@ const Setup = () => {
const ingestData = await ingestResponse.json();
console.log("Ingest response:", ingestData);
- const docCount = ingestData.document_count || tempFiles.length;
- setIngestMessage(`✅ Data ingested successfully! Processed ${docCount} document(s).`);
-
+ setIngestMessage(`✅ Data ingested successfully! Processed ${tempFiles.length} documents.`);
+
// Clear temp state
setTempFiles([]);
setShowTempFiles(false);
@@ -648,7 +646,7 @@ const Setup = () => {
return;
}
- const folderPath = sourceType === "uploaded"
+ const folderPath = sourceType === "uploaded"
? `uploads/${ingestGraphName}`
: `downloaded_files_cloud/${ingestGraphName}`;
@@ -687,7 +685,7 @@ const Setup = () => {
// Check if temp files were created (for server data source)
const sessionId = createData.data_source_id?.temp_session_id;
-
+
if (sessionId && !directIngestion) {
// Files are saved to temp storage - show them for review (only if not direct ingestion)
setTempSessionId(sessionId);
@@ -696,37 +694,37 @@ const Setup = () => {
data_source_id: createData.data_source_id,
data_path: createData.data_path || createData.file_path,
});
- setIngestMessage(`✅ Files processed successfully. Review them below before ingesting.`);
+ setIngestMessage(`✅ Processed ${createData.data_source_id.file_count} files. Review them below before ingesting.`);
await fetchTempFiles(sessionId);
setIsIngesting(false);
} else {
// No temp files (e.g., S3 Bedrock) OR direct ingestion enabled - proceed directly to ingest
- setIngestMessage("Step 2/2: Running document ingest...");
+ setIngestMessage("Step 2/2: Running document ingest...");
- const loadingInfo = {
- load_job_id: createData.load_job_id,
- data_source_id: createData.data_source_id,
+ const loadingInfo = {
+ load_job_id: createData.load_job_id,
+ data_source_id: createData.data_source_id,
file_path: createData.data_path || createData.file_path,
- };
+ };
- const ingestResponse = await fetch(`/ui/${ingestGraphName}/ingest`, {
- method: "POST",
- headers: {
- "Content-Type": "application/json",
- Authorization: `Basic ${creds}`,
- },
- body: JSON.stringify(loadingInfo),
- });
+ const ingestResponse = await fetch(`/ui/${ingestGraphName}/ingest`, {
+ method: "POST",
+ headers: {
+ "Content-Type": "application/json",
+ Authorization: `Basic ${creds}`,
+ },
+ body: JSON.stringify(loadingInfo),
+ });
- if (!ingestResponse.ok) {
- const errorData = await ingestResponse.json();
- throw new Error(errorData.detail || `Failed to run ingest: ${ingestResponse.statusText}`);
- }
+ if (!ingestResponse.ok) {
+ const errorData = await ingestResponse.json();
+ throw new Error(errorData.detail || `Failed to run ingest: ${ingestResponse.statusText}`);
+ }
- const ingestData = await ingestResponse.json();
- console.log("Ingest response:", ingestData);
+ const ingestData = await ingestResponse.json();
+ console.log("Ingest response:", ingestData);
- setIngestMessage(`✅ Data ingested successfully! Processed documents from ${folderPath}/`);
+ setIngestMessage(`✅ Data ingested successfully! Processed documents from ${folderPath}/`);
setIsIngesting(false);
}
} catch (error: any) {
@@ -740,16 +738,16 @@ const Setup = () => {
const handleCreateIngestAfterUpload = async (sourceType: "uploaded" | "downloaded" = "uploaded") => {
console.log("handleCreateIngestAfterUpload called with sourceType:", sourceType);
console.log("ingestGraphName:", ingestGraphName);
-
+
if (!ingestGraphName) {
console.log("No graph name, returning early");
return;
}
- const folderPath = sourceType === "uploaded"
+ const folderPath = sourceType === "uploaded"
? `uploads/${ingestGraphName}`
: `downloaded_files_cloud/${ingestGraphName}`;
-
+
console.log("folderPath:", folderPath);
try {
@@ -764,7 +762,7 @@ const Setup = () => {
loader_config: {},
file_format: "multi"
};
-
+
console.log("Calling create_ingest with config:", createIngestConfig);
const createResponse = await fetch(`/ui/${ingestGraphName}/create_ingest`, {
@@ -775,7 +773,7 @@ const Setup = () => {
},
body: JSON.stringify(createIngestConfig),
});
-
+
console.log("create_ingest response status:", createResponse.status);
if (!createResponse.ok) {
@@ -786,10 +784,10 @@ const Setup = () => {
const createData = await createResponse.json();
console.log("create_ingest response data:", createData);
-
+
const sessionId = createData.data_source_id?.temp_session_id;
console.log("Session ID:", sessionId);
-
+
if (sessionId) {
// Save session ID for later ingest
setTempSessionId(sessionId);
@@ -798,16 +796,16 @@ const Setup = () => {
data_source_id: createData.data_source_id,
data_path: createData.data_path || createData.file_path,
});
-
+
console.log("Direct ingestion enabled:", directIngestion);
-
+
if (directIngestion) {
// Direct ingestion - proceed to ingest immediately
setUploadMessage("Running direct ingestion...");
await handleRunIngest();
} else {
// Save for later - files ready for ingestion
- setUploadMessage(`✅ Files processed successfully. Ready for ingestion.`);
+ setUploadMessage(`✅ Successfully processed ${createData.data_source_id.file_count} files. Ready for ingestion.`);
}
} else {
console.warn("No session ID returned from create_ingest");
@@ -969,9 +967,9 @@ const Setup = () => {
const statusData = await statusResponse.json();
const wasRunning = isRebuildRunning;
const isCurrentlyRunning = statusData.is_running || false;
-
+
setIsRebuildRunning(isCurrentlyRunning);
-
+
if (isCurrentlyRunning) {
const startTime = statusData.started_at ? new Date(statusData.started_at * 1000).toLocaleString() : "unknown time";
setRefreshMessage(`⚠️ A rebuild is already in progress for "${graphName}" (started at ${startTime}). Please wait for it to complete.`);
@@ -1027,7 +1025,7 @@ const Setup = () => {
try {
const creds = localStorage.getItem("creds");
-
+
const response = await fetch(`/ui/${refreshGraphName}/rebuild_graph`, {
method: "POST",
headers: {
@@ -1058,12 +1056,12 @@ const Setup = () => {
if (refreshOpen && refreshGraphName) {
// Check status immediately when dialog opens
checkRebuildStatus(refreshGraphName, true);
-
+
// Set up polling to check status every 5 seconds while dialog is open
const intervalId = setInterval(() => {
checkRebuildStatus(refreshGraphName, false);
}, 5000);
-
+
return () => clearInterval(intervalId);
}
}, [refreshOpen, refreshGraphName]);
@@ -1163,10 +1161,10 @@ const Setup = () => {
setIsInitializing(false);
return;
}
-
+
setStatusMessage(`✅ Graph "${graphName}" created and initialized successfully! You can now close this dialog.`);
setStatusType("success");
-
+
// Add the new graph to the available graphs list
const newGraph = graphName;
setAvailableGraphs(prev => {
@@ -1180,7 +1178,7 @@ const Setup = () => {
}
return prev;
});
-
+
// Set the newly created graph as selected for ingestion
setIngestGraphName(graphName);
setRefreshGraphName(graphName);
@@ -1216,7 +1214,7 @@ const Setup = () => {
{/* Three cards displayed horizontally */}
-
+
{/* Section 1: Initialize Knowledge Graph */}
@@ -1231,7 +1229,7 @@ const Setup = () => {
- setInitializeGraphOpen(true)}
>
@@ -1255,7 +1253,7 @@ const Setup = () => {
- setIngestOpen(true)}
>
@@ -1279,7 +1277,7 @@ const Setup = () => {
- setRefreshOpen(true)}
>
@@ -1292,7 +1290,7 @@ const Setup = () => {
{/* Initialize Graph Dialog */}
-