CosmosShadow · Embracex1998 · Sep 20, 2024 · Sep 20, 2024 · Sep 26, 2024
diff --git a/gptpdf/parse.py b/gptpdf/parse.py
@@ -130,8 +130,7 @@ def _parse_rects(page: fitz.Page) -> List[Tuple[float, float, float, float]]:
 
     return [rect.bounds for rect in merged_rects]
 
-
-def _parse_pdf_to_images(pdf_path: str, output_dir: str = './') -> List[Tuple[str, List[str]]]:
+def _parse_pdf_to_images(pdf_path: str,dpi:int,output_dir: str = './') -> List[Tuple[str, List[str]]]:
     """
     Parse PDF to images and save to output_dir.
     """
@@ -143,11 +142,15 @@ def _parse_pdf_to_images(pdf_path: str, output_dir: str = './') -> List[Tuple[st
         logging.info(f'parse page: {page_index}')
         rect_images = []
         rects = _parse_rects(page)
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
         for index, rect in enumerate(rects):
             fitz_rect = fitz.Rect(rect)
             # 保存页面为图片
-            pix = page.get_pixmap(clip=fitz_rect, matrix=fitz.Matrix(4, 4))
+            pix = page.get_pixmap(clip=fitz_rect, matrix=fitz.Matrix(dpi/72, dpi/72))
             name = f'{page_index}_{index}.png'
+
+
             pix.save(os.path.join(output_dir, name))
             rect_images.append(name)
             # # 在页面上绘制红色矩形
@@ -164,15 +167,14 @@ def _parse_pdf_to_images(pdf_path: str, output_dir: str = './') -> List[Tuple[st
             page.draw_rect(text_rect, color=(1, 1, 1), fill=(1, 1, 1))
             # 插入带有白色背景的文字
             page.insert_text((text_x, text_y), name, fontsize=10, color=(1, 0, 0))
-        page_image_with_rects = page.get_pixmap(matrix=fitz.Matrix(3, 3))
+        page_image_with_rects = page.get_pixmap(matrix=fitz.Matrix(dpi/72, dpi/72))
         page_image = os.path.join(output_dir, f'{page_index}.png')
         page_image_with_rects.save(page_image)
         image_infos.append((page_image, rect_images))
 
     pdf_document.close()
     return image_infos
 
-
 def _gpt_parse_images(
         image_infos: List[Tuple[str, List[str]]],
         prompt_dict: Optional[Dict] = None,
@@ -239,7 +241,7 @@ def _process_page(index: int, image_info: Tuple[str, List[str]]) -> Tuple[int, s
 
     return '\n\n'.join(contents)
 
-
+# Added a dpi parameter for PDF image rendering (default 200); set it to 300-1000 when sharper output is required.
 def parse_pdf(
         pdf_path: str,
         output_dir: str = './',
@@ -249,6 +251,7 @@ def parse_pdf(
         model: str = 'gpt-4o',
         verbose: bool = False,
         gpt_worker: int = 1,
+        dpi:int = 200,
         **args
 ) -> Tuple[str, List[str]]:
     """
@@ -257,7 +260,7 @@ def parse_pdf(
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
 
-    image_infos = _parse_pdf_to_images(pdf_path, output_dir=output_dir)
+    image_infos = _parse_pdf_to_images(pdf_path, output_dir=output_dir,dpi=dpi)
     content = _gpt_parse_images(
         image_infos=image_infos,
         output_dir=output_dir,

diff --git a/test/test.py b/test/test.py
@@ -19,7 +19,7 @@ def test_use_api_key():
     api_key = os.getenv('OPENAI_API_KEY')
     base_url = os.getenv('OPENAI_API_BASE')
     # Manually provide OPENAI_API_KEY and OPEN_API_BASE
-    content, image_paths = parse_pdf(pdf_path, output_dir=output_dir, api_key=api_key, base_url=base_url, model='gpt-4o', gpt_worker=6)
+    content, image_paths = parse_pdf(pdf_path, output_dir=output_dir, api_key=api_key, base_url=base_url, model='gpt-4o', gpt_worker=6,dpi=1000)
     print(content)
     print(image_paths)
     # also output_dir/output.md is generated
@@ -50,7 +50,8 @@ def test_qwen_vl_max():
     base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
     # Refer to: https://help.aliyun.com/zh/dashscope/developer-reference/compatibility-of-openai-with-dashscope
     model  = 'qwen-vl-max'
-    content, image_paths = parse_pdf(pdf_path, output_dir=output_dir, api_key=api_key, base_url=base_url, model=model, verbose=True, temperature=0.5, max_tokens=1000, top_p=0.9, frequency_penalty=1)
+    # parse_pdf now accepts a dpi parameter (default 200); this call overrides it.
+    content, image_paths = parse_pdf(pdf_path, output_dir=output_dir, api_key=api_key, base_url=base_url, model=model, verbose=True, temperature=0.5, max_tokens=1000, top_p=0.9, frequency_penalty=1,dpi=1000)
     print(content)
     print(image_paths)