diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
index 63162d523..e268d6fe3 100644
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -1,22 +1,29 @@
 import sys
 import io
 
 from typing import BinaryIO, Any
 
 
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 
 
 # Try loading optional (but in this case, required) dependencies
 # Save reporting of any exceptions for later
 _dependency_exc_info = None
 try:
     import pdfminer
     import pdfminer.high_level
 except ImportError:
     # Preserve the error and stack trace for later
     _dependency_exc_info = sys.exc_info()
+
+# pdfplumber is optional: it adds table extraction on top of pdfminer. When it
+# is not installed the converter still works, using plain pdfminer text output.
+try:
+    import pdfplumber
+except ImportError:
+    pdfplumber = None  # type: ignore[assignment]
 
 
@@ -28,16 +35,104 @@
 ACCEPTED_FILE_EXTENSIONS = [".pdf"]
 
 
+def _to_markdown_table(table: list[list[Any]]) -> str:
+    """
+    Render a 2D list of cells (first row is the header) as an aligned Markdown
+    table.
+
+    Cells may be None (rendered as empty), contain line breaks (collapsed to
+    single spaces) or pipes (escaped), and rows may be ragged (short rows are
+    padded). Returns "" when the table has no rows or no non-empty cell.
+    """
+    if not table:
+        return ""
+
+    def clean(cell: Any) -> str:
+        # A raw line break or "|" inside a cell would break the Markdown row.
+        if cell is None:
+            return ""
+        return " ".join(str(cell).split()).replace("|", "\\|")
+
+    rows = [[clean(cell) for cell in row] for row in table]
+    if not any(cell for row in rows for cell in row):
+        return ""
+
+    # Pad ragged rows so that zip() below cannot silently drop trailing columns.
+    column_count = max(len(row) for row in rows)
+    rows = [row + [""] * (column_count - len(row)) for row in rows]
+
+    # Delimiter cells must contain dashes, so never go narrower than "---".
+    col_widths = [max(3, *map(len, column)) for column in zip(*rows)]
+
+    def fmt_row(row: list[str]) -> str:
+        cells = (cell.ljust(width) for cell, width in zip(row, col_widths))
+        return "| " + " | ".join(cells) + " |"
+
+    header, *body = rows
+    md = [fmt_row(header), fmt_row(["-" * width for width in col_widths])]
+    md.extend(fmt_row(row) for row in body)
+    return "\n".join(md)
+
+
+def _strip_table_rows(text: str, tables: list[list[list[Any]]]) -> str:
+    """
+    Drop the lines of `text` that merely repeat a row of one of `tables`.
+
+    Only whole lines are removed and each table row removes at most one line,
+    so a word that also occurs in a table is never cut out of ordinary prose.
+    Blank lines are dropped and the remaining lines are stripped.
+    """
+    pending: dict[str, int] = {}
+    for table in tables:
+        for row in table:
+            # Empty cells are None/""; compare with whitespace normalized.
+            key = " ".join(" ".join(str(c) for c in row if c).split())
+            if key:
+                pending[key] = pending.get(key, 0) + 1
+
+    kept = []
+    for line in text.splitlines():
+        key = " ".join(line.split())
+        if not key:
+            continue
+        if pending.get(key, 0) > 0:
+            pending[key] -= 1
+            continue
+        kept.append(line.strip())
+    return "\n".join(kept)
+
+
+def _extract_with_pdfplumber(file_stream: BinaryIO) -> str:
+    """
+    Extract the text and tables of every page with pdfplumber.
+
+    Each page contributes its text (minus lines duplicated by a table row),
+    followed by its tables as Markdown; chunks are joined by blank lines.
+    """
+    chunks: list[str] = []
+    with pdfplumber.open(file_stream) as pdf:
+        for page in pdf.pages:
+            tables = [table for table in page.extract_tables() if table]
+            text = _strip_table_rows(page.extract_text() or "", tables)
+            if text:
+                chunks.append(text)
+            chunks.extend(md for md in map(_to_markdown_table, tables) if md)
+    return "\n\n".join(chunks).strip()
+
+
 class PdfConverter(DocumentConverter):
     """
-    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
+    Converts PDFs to Markdown.
+    When pdfplumber is installed, tables are extracted and rendered as aligned
+    Markdown tables; otherwise, or if that extraction fails or yields nothing,
+    the plain-text output of pdfminer is used.
     """
 
     def accepts(
         self,
         file_stream: BinaryIO,
         stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
+        **kwargs: Any,
     ) -> bool:
         mimetype = (stream_info.mimetype or "").lower()
         extension = (stream_info.extension or "").lower()
@@ -55,9 +150,8 @@ def convert(
         self,
         file_stream: BinaryIO,
         stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
+        **kwargs: Any,
     ) -> DocumentConverterResult:
-        # Check the dependencies
         if _dependency_exc_info is not None:
             raise MissingDependencyException(
                 MISSING_DEPENDENCY_MESSAGE.format(
@@ -65,13 +159,30 @@ def convert(
                     extension=".pdf",
                     feature="pdf",
                 )
             ) from _dependency_exc_info[
                 1
             ].with_traceback(  # type: ignore[union-attr]
                 _dependency_exc_info[2]
             )
 
         assert isinstance(file_stream, io.IOBase)  # for mypy
-        return DocumentConverterResult(
-            markdown=pdfminer.high_level.extract_text(file_stream),
-        )
+
+        # Every extraction attempt must read the PDF from the same offset, no
+        # matter how far a previous attempt advanced the stream.
+        start_position = file_stream.tell()
+
+        markdown = ""
+        if pdfplumber is not None:
+            try:
+                markdown = _extract_with_pdfplumber(file_stream)
+            except Exception:
+                # Table-aware extraction is best-effort: whatever went wrong,
+                # plain pdfminer extraction below still gets its chance.
+                markdown = ""
+
+        # Fall back to pdfminer when pdfplumber is missing, failed or found nothing
+        if not markdown:
+            file_stream.seek(start_position)
+            markdown = pdfminer.high_level.extract_text(file_stream)
+
+        return DocumentConverterResult(markdown=markdown)