From 5e8fb5bd7c725b63cdc31040deb29fb2b8a2d772 Mon Sep 17 00:00:00 2001 From: Embracex1998 <1048024020@qq.com> Date: Fri, 20 Sep 2024 15:35:48 +0800 Subject: [PATCH 1/3] change the pdf_images dpi --- gptpdf/parse.py | 14 +++++++++----- test/test.py | 4 ++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/gptpdf/parse.py b/gptpdf/parse.py index c948c8f..e2fa134 100644 --- a/gptpdf/parse.py +++ b/gptpdf/parse.py @@ -130,8 +130,8 @@ def _parse_rects(page: fitz.Page) -> List[Tuple[float, float, float, float]]: return [rect.bounds for rect in merged_rects] - -def _parse_pdf_to_images(pdf_path: str, output_dir: str = './') -> List[Tuple[str, List[str]]]: +#增加了PDF图片解析的dpi参数,默认值设为300,根据需求设置300-1000可以满足清晰度需求 +def _parse_pdf_to_images(pdf_path: str,dpi:int,output_dir: str = './') -> List[Tuple[str, List[str]]]: """ Parse PDF to images and save to output_dir. """ @@ -143,11 +143,15 @@ def _parse_pdf_to_images(pdf_path: str, output_dir: str = './') -> List[Tuple[st logging.info(f'parse page: {page_index}') rect_images = [] rects = _parse_rects(page) + if not os.path.exists(output_dir): + os.makedirs(output_dir) for index, rect in enumerate(rects): fitz_rect = fitz.Rect(rect) # 保存页面为图片 - pix = page.get_pixmap(clip=fitz_rect, matrix=fitz.Matrix(4, 4)) + pix = page.get_pixmap(clip=fitz_rect, matrix=fitz.Matrix(dpi/72, dpi/72)) name = f'{page_index}_{index}.png' + + pix.save(os.path.join(output_dir, name)) rect_images.append(name) # # 在页面上绘制红色矩形 @@ -172,7 +176,6 @@ def _parse_pdf_to_images(pdf_path: str, output_dir: str = './') -> List[Tuple[st pdf_document.close() return image_infos - def _gpt_parse_images( image_infos: List[Tuple[str, List[str]]], prompt_dict: Optional[Dict] = None, @@ -249,6 +252,7 @@ def parse_pdf( model: str = 'gpt-4o', verbose: bool = False, gpt_worker: int = 1, + dpi:int = 300, **args ) -> Tuple[str, List[str]]: """ @@ -257,7 +261,7 @@ def parse_pdf( if not os.path.exists(output_dir): os.makedirs(output_dir) - image_infos = _parse_pdf_to_images(pdf_path, output_dir=output_dir) + image_infos = _parse_pdf_to_images(pdf_path, output_dir=output_dir,dpi=dpi) content = _gpt_parse_images( image_infos=image_infos, output_dir=output_dir, diff --git a/test/test.py b/test/test.py index d79cb98..8713641 100644 --- a/test/test.py +++ b/test/test.py @@ -19,7 +19,7 @@ def test_use_api_key(): api_key = os.getenv('OPENAI_API_KEY') base_url = os.getenv('OPENAI_API_BASE') # Manually provide OPENAI_API_KEY and OPEN_API_BASE - content, image_paths = parse_pdf(pdf_path, output_dir=output_dir, api_key=api_key, base_url=base_url, model='gpt-4o', gpt_worker=6) + content, image_paths = parse_pdf(pdf_path, output_dir=output_dir, api_key=api_key, base_url=base_url, model='gpt-4o', gpt_worker=6,dpi=1000) print(content) print(image_paths) # also output_dir/output.md is generated @@ -50,7 +50,7 @@ def test_qwen_vl_max(): base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1" # Refer to: https://help.aliyun.com/zh/dashscope/developer-reference/compatibility-of-openai-with-dashscope model = 'qwen-vl-max' - content, image_paths = parse_pdf(pdf_path, output_dir=output_dir, api_key=api_key, base_url=base_url, model=model, verbose=True, temperature=0.5, max_tokens=1000, top_p=0.9, frequency_penalty=1) + content, image_paths = parse_pdf(pdf_path, output_dir=output_dir, api_key=api_key, base_url=base_url, model=model, verbose=True, temperature=0.5, max_tokens=1000, top_p=0.9, frequency_penalty=1,dpi=1000) print(content) print(image_paths) From bf51dae0e034eee7717840f6a3d8f3e3e9dab041 Mon Sep 17 00:00:00 2001 From: Embracex1998 <1048024020@qq.com> Date: Fri, 20 Sep 2024 18:23:26 +0800 Subject: [PATCH 2/3] change the pdf_images dpi --- gptpdf/parse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gptpdf/parse.py b/gptpdf/parse.py index e2fa134..46ab2ca 100644 --- a/gptpdf/parse.py +++ b/gptpdf/parse.py @@ -168,7 +168,7 @@ def _parse_pdf_to_images(pdf_path: str,dpi:int,output_dir: str = './') -> List[T page.draw_rect(text_rect, color=(1, 1, 1), fill=(1, 1, 1)) # 插入带有白色背景的文字 page.insert_text((text_x, text_y), name, fontsize=10, color=(1, 0, 0)) - page_image_with_rects = page.get_pixmap(matrix=fitz.Matrix(3, 3)) + page_image_with_rects = page.get_pixmap(matrix=fitz.Matrix(dpi/72, dpi/72)) page_image = os.path.join(output_dir, f'{page_index}.png') page_image_with_rects.save(page_image) image_infos.append((page_image, rect_images)) From b71bb7d922e09e0751e8286d6517326854cd7ac2 Mon Sep 17 00:00:00 2001 From: Embracex1998 <1048024020@qq.com> Date: Thu, 26 Sep 2024 21:53:49 +0800 Subject: [PATCH 3/3] change the pdf_images dpi --- gptpdf/parse.py | 5 ++--- test/test.py | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gptpdf/parse.py b/gptpdf/parse.py index 46ab2ca..99758e3 100644 --- a/gptpdf/parse.py +++ b/gptpdf/parse.py @@ -130,7 +130,6 @@ def _parse_rects(page: fitz.Page) -> List[Tuple[float, float, float, float]]: return [rect.bounds for rect in merged_rects] -#增加了PDF图片解析的dpi参数,默认值设为300,根据需求设置300-1000可以满足清晰度需求 def _parse_pdf_to_images(pdf_path: str,dpi:int,output_dir: str = './') -> List[Tuple[str, List[str]]]: """ Parse PDF to images and save to output_dir. @@ -242,7 +241,7 @@ def _process_page(index: int, image_info: Tuple[str, List[str]]) -> Tuple[int, s return '\n\n'.join(contents) - +#增加了PDF图片解析的dpi参数,默认值设为200,根据需求设置300-1000可以满足清晰度需求 def parse_pdf( pdf_path: str, output_dir: str = './', @@ -252,7 +251,7 @@ def parse_pdf( model: str = 'gpt-4o', verbose: bool = False, gpt_worker: int = 1, - dpi:int = 300, + dpi:int = 200, **args ) -> Tuple[str, List[str]]: """ diff --git a/test/test.py b/test/test.py index 8713641..8d90bb1 100644 --- a/test/test.py +++ b/test/test.py @@ -50,6 +50,7 @@ def test_qwen_vl_max(): base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1" # Refer to: https://help.aliyun.com/zh/dashscope/developer-reference/compatibility-of-openai-with-dashscope model = 'qwen-vl-max' + #新增了dpi参数,默认值为200 content, image_paths = parse_pdf(pdf_path, output_dir=output_dir, api_key=api_key, base_url=base_url, model=model, verbose=True, temperature=0.5, max_tokens=1000, top_p=0.9, frequency_penalty=1,dpi=1000) print(content) print(image_paths)