CogStack · vladd-bit · Dec 24, 2025 · Dec 24, 2025 · Dec 24, 2025 · Dec 24, 2025
diff --git a/ocr_service/processor/converter.py b/ocr_service/processor/converter.py
@@ -1,11 +1,12 @@
 from __future__ import annotations
 
+import atexit
+import multiprocessing
 import os
 import time
 import traceback
 import uuid
 from io import BytesIO
-from multiprocessing.dummy import Pool
 from subprocess import PIPE, Popen
 from threading import Timer
 from typing import Any, cast
@@ -66,36 +67,99 @@ def _extract_text_fallback(self, stream: bytes, *, is_html: bool, is_xml: bool,
 
         return text.strip()
 
+    @staticmethod
+    def initialize_pdf_worker(stream) -> None:
+        """Open the PDF once per pool worker and keep it in a module global.
+
+        Intended as the ``initializer`` of the process pool created in
+        ``_pdf_to_img``; ``render_page`` reads the document stored here.
+
+        Args:
+            stream: PDF content passed as-is to ``pdfium.PdfDocument``.
+        """
+        # we are making this a global so that we can use it in the process pool
+        # since Pypdfium2 PdfDocument objects are not thread-safe
+        global CURRENT_PDF_FILE
+        CURRENT_PDF_FILE = pdfium.PdfDocument(stream)
+
+        def _close_pdf():
+            global CURRENT_PDF_FILE
+            if CURRENT_PDF_FILE is not None:
+                CURRENT_PDF_FILE.close()
+                # drop the reference so the closed document is never reused
+                CURRENT_PDF_FILE = None
+
+        # best-effort cleanup: atexit handlers only run on a normal interpreter exit
+        atexit.register(_close_pdf)
+
+    @staticmethod
+    def render_page(page_num) -> Image.Image:
+        """Render a single page of the worker's PDF to a PIL image.
+
+        Expects ``initialize_pdf_worker`` to have run in the current process.
+
+        Args:
+            page_num: page index passed to ``PdfDocument.get_page``.
+
+        Returns:
+            The rendered page converted with ``to_pil()``.
+        """
+        # NOTE: int() truncates the DPI / 72 ratio to a whole scale factor
+        scale = int(settings.OCR_SERVICE_IMAGE_DPI / 72)
+        page = CURRENT_PDF_FILE.get_page(page_num)
+        try:
+            return page.render(
+                scale=scale,
+                may_draw_forms=False,
+                no_smoothtext=True,
+                no_smoothimage=True,
+                no_smoothpath=True,
+                rotation=0,
+                crop=(0, 0, 0, 0),
+                grayscale=settings.OCR_CONVERT_GRAYSCALE_IMAGES,
+            ).to_pil()
+        finally:
+            # release the page handle even if rendering raises
+            page.close()
+
     def _pdf_to_img(self, stream: bytes) -> tuple[list[Image.Image], dict]:
+        """Render every page of a PDF to an image using a pool of worker processes.
+
+        Args:
+            stream: raw PDF bytes.
+
+        Returns:
+            The page images in document order and a metadata dict holding
+            the page count under ``"pages"``.
+        """
         pdf_image_pages = []
         doc_metadata: dict[str, Any] = {}
 
         pdf = pdfium.PdfDocument(stream)
+        page_count = len(pdf)
+        pdf.close()
+
+        doc_metadata["pages"] = page_count
+
+        if page_count == 0:
+            # nothing to render, and Pool() rejects processes=0
+            return pdf_image_pages, doc_metadata
 
         pdf_conversion_start_time = time.time()
-        scale = int(settings.OCR_SERVICE_IMAGE_DPI / 72)
 
-        def render_page(index: int) -> Image.Image:
-            page = pdf[index]
-            return page.render(
-                scale=scale,
-                may_draw_forms=False,
-                no_smoothtext=True,
-                no_smoothimage=True,
-                no_smoothpath=True,
-                rotation=0,
-                crop=(0, 0, 0, 0),
-                grayscale=settings.OCR_CONVERT_GRAYSCALE_IMAGES
-            ).to_pil()
-
-        with Pool(settings.CONVERTER_THREAD_NUM) as pool:
-            pdf_image_pages = pool.map(render_page, range(len(pdf)))
+        ctx = multiprocessing.get_context("spawn")
+        worker_count = max(1, min(settings.CONVERTER_THREAD_NUM, page_count))
+
+        # map() returns results in page order; chunksize=1 hands out one page at a time
+        with ctx.Pool(processes=worker_count,
+                      initializer=DocumentConverter.initialize_pdf_worker,
+                      initargs=(stream,)) as pool:
+            pdf_image_pages = pool.map(DocumentConverter.render_page, range(page_count), chunksize=1)
 
         pdf_conversion_end_time = time.time()
 
         self.log.info("PDF conversion to image(s) finished | Elapsed : " +
                       str(pdf_conversion_end_time - pdf_conversion_start_time) + " seconds")
-
         return pdf_image_pages, doc_metadata
 
     def _pdf_to_text(self, stream: bytes) -> tuple[str, dict]:

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -1,4 +1,4 @@
-ruff==0.12.12
+ruff==0.14.10
 mypy==1.17.0
 mypy-extensions==1.1.0
 types-aiofiles==24.1.0.20250708

diff --git a/requirements.txt b/requirements.txt
@@ -1,30 +1,23 @@
 setuptools==80.9.0
 wheel==0.45.0
-pkgconfig==1.5.0
-cython==3.1.2
-virtualenv==20.31.2
 psutil==6.1.1
 filetype==1.2.0
-Pillow==11.3.0
+Pillow==12.0.0
 html2image==2.0.7
-MarkupSafe==3.0.2
-python-multipart==0.0.20
-tesserocr==2.9.1
+tesserocr==2.9.2
 gunicorn==23.0.0
 pypdfium2==5.2.0
-uharfbuzz==0.50.2
-pyxml2pdf==0.3.4
-matplotlib==3.10.3
 opencv-python-headless==4.12.0.88
-atomicwrites==1.4.1
+pyxml2pdf==0.3.4
 fastapi==0.116.1
 orjson==3.11.2
 a2wsgi==1.10.10
 pydantic==2.12.5
 pydantic-settings==2.12.0
 httpx==0.28.1
 beautifulsoup4==4.12.3
-striprtf==0.0.26
+striprtf==0.0.29
+python-multipart==0.0.21
 
 # Pillow package dependencies
 defusedxml==0.7.1