diff --git a/.env.example b/.env.example index aae259a..bc8879f 100644 --- a/.env.example +++ b/.env.example @@ -27,5 +27,10 @@ LANGUAGES=mul,eng,nob # and optionally, to restrict the detection to a subset of languages # GIELLADETECT_LANGS=nno,nob,eng,swe,fin +# To use a LLM API service, specify at least the base URL of an OpenAI-style API +# LLM_API_URL=http://localhost:8080/ +# LLM_API_KEY= +# LLM_MODEL= + # To have Meteor run on a different path (only for stage and prod environments), set # CUSTOM_PATH=/meteor-custom-path diff --git a/main.py b/main.py index d78ee97..4e7e390 100644 --- a/main.py +++ b/main.py @@ -64,7 +64,8 @@ async def get_front_page_html(request: Request) -> Response: "index.html", { "request": request, - "root_path": Utils.get_environment_prefix() + "root_path": Utils.get_environment_prefix(), + "backends": Utils.get_available_backends() } ) @@ -78,7 +79,8 @@ def display_error_message_in_template(request: Request, exc: StarletteHTTPExcept "results": { 'error': str(exc.detail) }, - "root_path": Utils.get_environment_prefix() + "root_path": Utils.get_environment_prefix(), + "backends": Utils.get_available_backends() }, status_code=exc.status_code ) diff --git a/metadata_extract/candidate.py b/metadata_extract/candidate.py index 7d7949b..efa7370 100644 --- a/metadata_extract/candidate.py +++ b/metadata_extract/candidate.py @@ -15,6 +15,7 @@ class Origin(Enum): COPYRIGHT = 5 RAPPORT_PREFIX = 6 LANGUAGE_MODEL = 7 + LLM = 8 class OriginType(TypedDict): diff --git a/metadata_extract/llm_extractor.py b/metadata_extract/llm_extractor.py new file mode 100644 index 0000000..1a15fe2 --- /dev/null +++ b/metadata_extract/llm_extractor.py @@ -0,0 +1,127 @@ +"""The LLM extractor module extracts metadata using an external LLM API service.""" + +from typing import TypedDict, Optional +import json +import requests +from .candidate import AuthorType, Candidate, Origin +from .metadata import Metadata +from .meteor_document import MeteorDocument +from .registry 
import PublisherRegistry + + +class LLMConfig(TypedDict): + """Configuration for LLM API service""" + api_url: str + api_key: str + model: str + + +class LLMExtractor: + """A LLMExtractor object loads a MeteorDocument and fills a Metadata object + by performing a call to an external LLM API service.""" + + SYSTEM_PROMPT = "You are a skilled librarian specialized in meticulous " + \ + "cataloguing of digital documents." + INSTRUCTION = "Extract metadata from this document. Return as JSON." + MAX_TOKENS = 1024 + TEMPERATURE = 0.0 + TIMEOUT = 120 + + def __init__(self, doc: MeteorDocument, + registry: Optional[PublisherRegistry], + llm_config: LLMConfig): + self._doc = doc + self._registry = registry + self._config = llm_config + self.metadata = Metadata() + + def extract_metadata(self) -> None: + doc_json = self._doc.extract_text_as_json() + response = self._llm_request(doc_json) + self._parse_response_to_doc(response) + + def _llm_request(self, doc_json: str) -> str: + message = f"{self.INSTRUCTION}\n\n{doc_json}" + + if self._config['api_url'].endswith("/"): + url = self._config['api_url'] + "chat/completions" + else: + url = self._config['api_url'] + "/chat/completions" + + headers = { + "Content-Type": "application/json" + } + + if self._config['api_key']: + headers['Authorization'] = f'Bearer {self._config["api_key"]}' + + data = { + "model": self._config['model'], + "messages": [ + {"role": "system", "content": self.SYSTEM_PROMPT}, + {"role": "user", "content": message}, + ], + "temperature": self.TEMPERATURE, + "max_tokens": self.MAX_TOKENS + } + + api_response = requests.post(url, + headers=headers, + json=data, + timeout=self.TIMEOUT) + + api_response.raise_for_status() + return str(api_response.json()['choices'][0]['message']['content']) + + def _parse_response_to_doc(self, response: str) -> None: + metadata = json.loads(response) + + # language + if 'language' in metadata: + self.metadata.add_candidate('language', Candidate(metadata['language'], 
Origin.LLM)) + + # title + if 'title' in metadata: + self.metadata.add_candidate('title', Candidate(metadata['title'], Origin.LLM)) + + # creator + if 'creator' in metadata: + for creator in metadata['creator']: + if ', ' in creator: + lastname, firstname = creator.split(', ', maxsplit=1) + else: + lastname = creator + firstname = "" + author_dict: AuthorType = {"firstname": firstname, "lastname": lastname} + self.metadata.add_candidate('author', Candidate(author_dict, Origin.LLM)) + + # year + if 'year' in metadata: + self.metadata.add_candidate('year', Candidate(int(metadata['year']), Origin.LLM)) + + # publisher + if 'publisher' in metadata: + for publisher in metadata['publisher']: + publisher_candidate = Candidate(publisher, Origin.LLM) + if self._registry: + publisher_candidate.reg_entries = self._registry.search(publisher) + self.metadata.add_candidate('publisher', publisher_candidate) + + # doi - not supported by Meteor + + # e-isbn + if 'e-isbn' in metadata: + # This is pretty poor, we just pass the found e-ISBNs (almost never more than one) + # to Meteor directly and let it pick one essentially at random + for e_isbn in metadata['e-isbn']: + self.metadata.add_candidate('ISBN', Candidate(e_isbn, Origin.LLM)) + + # p-isbn - Meteor isn't interested in printed ISBNs + + # e-issn + if 'e-issn' in metadata: + self.metadata.add_candidate('ISSN', Candidate(metadata['e-issn'], Origin.LLM)) + + # p-issn - Meteor isn't interested in printed ISSNs + + # type_coar - not supported by Meteor diff --git a/metadata_extract/metadata.py b/metadata_extract/metadata.py index f1a855b..d541dab 100644 --- a/metadata_extract/metadata.py +++ b/metadata_extract/metadata.py @@ -112,6 +112,11 @@ def choose_title(self) -> Optional[CandidateType]: and not text.has_no_letters(c.value)] if page_title: return page_title[0].to_dict() + llm_title = [c for c in self.candidates['title'] if + c.origin == Origin.LLM and isinstance(c.value, str) + and not text.has_no_letters(c.value)] + if 
llm_title: + return llm_title[0].to_dict() return None def choose_publishers(self) -> Optional[CandidateType]: diff --git a/metadata_extract/meteor.py b/metadata_extract/meteor.py index 2fffa5a..309cebf 100644 --- a/metadata_extract/meteor.py +++ b/metadata_extract/meteor.py @@ -9,6 +9,7 @@ from .meteor_document import MeteorDocument from .metadata import Results from .finder import Finder +from .llm_extractor import LLMConfig, LLMExtractor class Meteor: @@ -24,6 +25,7 @@ def __init__(self, languages: Optional[list[str]] = None) -> None: self.registry: Optional[PublisherRegistry] = None ResourceLoader.load(languages) self.detect_language: Callable[[str], Optional[str]] = Meteor.__default_detect + self.llm_config: Optional[LLMConfig] = None @staticmethod def __default_detect(text: str) -> Optional[str]: @@ -41,9 +43,16 @@ def set_registry(self, registry: PublisherRegistry) -> None: def set_language_detection_method(self, detect_language: Callable[[str], str]) -> None: self.detect_language = detect_language - def run(self, file_path: str) -> Results: + def set_llm_config(self, llm_config: LLMConfig) -> None: + self.llm_config = llm_config + + def run(self, file_path: str, backend: Optional[str] = None) -> Results: with MeteorDocument(file_path) as doc: - finder = Finder(doc, self.registry, self.detect_language) - finder.extract_metadata() - finder.metadata.choose_best() - return finder.metadata.results + extractor: Optional[LLMExtractor | Finder] = None + if backend and backend.lower() == 'llmextractor' and self.llm_config: + extractor = LLMExtractor(doc, self.registry, self.llm_config) + else: + extractor = Finder(doc, self.registry, self.detect_language) + extractor.extract_metadata() + extractor.metadata.choose_best() + return extractor.metadata.results diff --git a/metadata_extract/meteor_document.py b/metadata_extract/meteor_document.py index 5bcbb62..48cf657 100644 --- a/metadata_extract/meteor_document.py +++ b/metadata_extract/meteor_document.py @@ -4,10 
+4,13 @@ """ +import json from pathlib import Path +import re from types import TracebackType from typing import Optional, Self, Type import fitz +import regex from .page import Page from .alto_utils import AltoFile @@ -19,6 +22,13 @@ class MeteorDocument: content. MeteorDocuments are context managers, so they can be used in `with` statements. """ + # text extraction settings for LLM + PAGES = [0, 1, 2, 3, 4, 5, 6, 7, -2, -1] # pages to analyze: first 8 pages + last 2 pages + THRESHOLD = 100 # paragraphs shorter than this will always be kept + LONG_PARA_PAGES = [0, 1] # on first two pages, some long paragraphs are accepted + LONG_PARA_MAX = 2 # how many long paragraphs to keep on the first two pages + PDF_METADATA_SKIP = {'format', 'creator', 'producer'} # PDF metadata fields not to include + def __init__(self, file_path: str, start: int = 5, end: int = 5): @@ -92,3 +102,47 @@ def get_page_object(self, page_number: int) -> Page: raise ValueError('No PDF file to load page from') self.page_objects[page_number] = Page(pdf_page=self.pdfdoc.load_page(page_number - 1)) return self.page_objects[page_number] + + def extract_text_as_json(self) -> str: + """Extract text and metadata as a JSON string suitable for a LLM""" + + if not self.pdfdoc: + raise ValueError('No PDF document set') + + pdfinfo = {} + pages = [] + + for key in self.pdfdoc.metadata.keys(): + if key not in self.PDF_METADATA_SKIP and self.pdfdoc.metadata.get(key): + pdfinfo[key] = self.pdfdoc.metadata.get(key) + + for page in self.PAGES: + if page > len(self.pdfdoc) - 2: + continue + + texts = [] + text = self.pdfdoc[page].get_text(sort=True) + # Use regular expression to split text into paragraphs + # Delimiter: newline(s) followed by an upper case character + paragraphs = regex.split(r'\n+(?=\p{Lu})', text, flags=re.UNICODE) + long_paragraph_count = 0 + + for paragraph in paragraphs: + paragraph = " ".join(paragraph.strip().split()) + + if '.....' in paragraph or '. . . . .' 
in paragraph: + # looks like a ToC entry, skip it + continue + if len(paragraph) < self.THRESHOLD: # short paragraph, keep it + texts.append(paragraph) + elif page in self.LONG_PARA_PAGES and long_paragraph_count < self.LONG_PARA_MAX: + # allow some long paragraphs on the first two pages + long_paragraph_count += 1 + texts.append(paragraph) + else: # must be a long paragraph, skip it + pass + text = '\n'.join(texts) + if text: + pages.append({"page": self.pdfdoc[page].number, "text": text}) + + return json.dumps({"pdfinfo": pdfinfo, "pages": pages}) diff --git a/src/routes/extract.py b/src/routes/extract.py index c89650f..f147b54 100644 --- a/src/routes/extract.py +++ b/src/routes/extract.py @@ -5,7 +5,7 @@ from typing import Annotated, Optional -from fastapi import APIRouter, Depends, HTTPException +from fastapi import APIRouter, Depends, HTTPException, Query from starlette.datastructures import UploadFile from starlette.requests import Request from starlette.responses import HTMLResponse, JSONResponse, Response @@ -31,17 +31,18 @@ async def post_pdf_html( form = await request.form() file_input = form.get('fileInput') file_url = form.get('fileUrl') + backend = str(form.get('backend')) if file_url != "" and isinstance(file_url, str): utils.verify_url(file_url) filename: Optional[str] = file_url filepath = utils.download_file(file_url) - results = utils.process_and_remove(filename, filepath) + results = utils.process_and_remove(filename, filepath, backend=backend) elif file_input is not None and isinstance(file_input, UploadFile): utils.verify_file(file_input) filename = file_input.filename filepath = utils.save_file(file_input) - results = utils.process_and_remove(filename, filepath) + results = utils.process_and_remove(filename, filepath, backend=backend) else: raise HTTPException(400) return templates.TemplateResponse( @@ -53,7 +54,8 @@ async def post_pdf_html( "filepath": filepath, "filename": filename, "results": results, - "root_path": 
utils.get_environment_prefix() + "root_path": utils.get_environment_prefix(), + "backends": utils.get_available_backends() } ) @@ -68,15 +70,18 @@ async def post_pdf_json( form = await request.form() file_input = form.get('fileInput') file_url = form.get('fileUrl') + backend = str(form.get('backend')) if file_url != "" and isinstance(file_url, str): utils.verify_url(file_url) filepath = utils.download_file(file_url) - results = utils.process_and_remove(file_url, filepath, delete_immediately=True) + results = utils.process_and_remove( + file_url, filepath, backend=backend, delete_immediately=True) elif file_input is not None and isinstance(file_input, UploadFile): utils.verify_file(file_input) filepath = utils.save_file(file_input) - results = utils.process_and_remove(file_input.filename, filepath, delete_immediately=True) + results = utils.process_and_remove( + file_input.filename, filepath, backend=backend, delete_immediately=True) else: raise HTTPException(400) return JSONResponse(results) @@ -85,13 +90,14 @@ async def post_pdf_json( @router.get("/file/{file_name}", response_class=JSONResponse, status_code=200) async def get_metadata_from_file_on_disk( file_name: str, - conf: Annotated[Settings, Depends(get_settings)] + conf: Annotated[Settings, Depends(get_settings)], + backend: Optional[str] = Query(None) # Define the optional query parameter ) -> JSONResponse: """ Extract metadata from a file on disk and return it as JSON """ try: - results = utils.meteor.run(conf.MOUNT_FOLDER + '/' + file_name) + results = utils.meteor.run(conf.MOUNT_FOLDER + '/' + file_name, backend=backend) except Exception: return JSONResponse({"error": f"Error while processing {file_name}"}) return JSONResponse(results) diff --git a/src/settings.py b/src/settings.py index c3ebafc..24fbf6b 100644 --- a/src/settings.py +++ b/src/settings.py @@ -25,6 +25,9 @@ class Settings(BaseSettings): USE_GIELLADETECT: bool = False GIELLADETECT_LANGS: str = "" CUSTOM_PATH: str = "" + LLM_API_URL: str = 
"" + LLM_API_KEY: str = "" + LLM_MODEL: str = "" settings = Settings() diff --git a/src/util.py b/src/util.py index c9e7aca..2896aba 100644 --- a/src/util.py +++ b/src/util.py @@ -47,6 +47,14 @@ def __init__(self) -> None: self.meteor.set_language_detection_method( lambda t: gielladetect.detect(t, langs=langs) ) + if get_settings().LLM_API_URL: + self.meteor.set_llm_config( + llm_config={ + 'api_url': get_settings().LLM_API_URL, + 'api_key': get_settings().LLM_API_KEY, + 'model': get_settings().LLM_MODEL + } + ) @staticmethod def get_languages() -> Optional[list[str]]: @@ -60,6 +68,13 @@ def get_environment_prefix() -> str: return "" return get_settings().CUSTOM_PATH or "/meteor" + @staticmethod + def get_available_backends() -> list[str]: + backends = ['Finder'] + if get_settings().LLM_API_URL: + backends.append('LLMExtractor') + return backends + @staticmethod def verify_file(file: UploadFile) -> None: size_limit = int(get_settings().MAX_FILE_SIZE_MB) @@ -104,10 +119,11 @@ def process_and_remove( self, filename: Optional[str], filepath: str, + backend: Optional[str] = None, delete_immediately: bool = False ) -> Union[Error, Results]: try: - results = self.meteor.run(filepath) + results = self.meteor.run(filepath, backend) return results except Exception as exc: print(traceback.format_exc()) diff --git a/templates/index.html b/templates/index.html index 4eb8232..35446d4 100644 --- a/templates/index.html +++ b/templates/index.html @@ -13,6 +13,13 @@
or drop a PDF file
diff --git a/test/resources/pdf_report_llm_response.json b/test/resources/pdf_report_llm_response.json new file mode 100644 index 0000000..8ef3708 --- /dev/null +++ b/test/resources/pdf_report_llm_response.json @@ -0,0 +1 @@ +{"choices": [{"finish_reason": "stop", "index": 0, "message": {"content": "{\"language\": \"no\", \"title\": \"Metadataekstrahering \\u2013 Muligheter og innsikt\", \"creator\": [\"Bj\\u00f8rnson, Bj\\u00f8rnstjerne M.\", \"Camilla-Collett, Jacobine\", \"Ibsen, Henrik J.\", \"McArthur, Raymond\", \"O\\u2019Toole, John\"], \"year\": \"2023\", \"publisher\": [\"Nasjonalbiblioteket\"], \"e-isbn\": [\"9788217022985\"], \"e-issn\": \"2464-1162\", \"type_coar\": \"research report\"}", "role": "assistant"}}], "created": 1724051181, "model": "", "object": "chat.completion", "usage": {"completion_tokens": 154, "prompt_tokens": 457, "total_tokens": 611}, "id": "chatcmpl-AWIMAJoFRqEr8w8vStW9LC3hqOCWdDXw"} diff --git a/test/test_pdf_llm.py b/test/test_pdf_llm.py new file mode 100644 index 0000000..c292227 --- /dev/null +++ b/test/test_pdf_llm.py @@ -0,0 +1,133 @@ +# pylint: disable=R0801 + +"""Test the output from Meteor.run on a sample PDF file using LLM extraction""" + + +import json +import unittest.mock +from metadata_extract.meteor import Meteor + + +meteor = Meteor() +meteor.set_llm_config({ + 'api_url': 'http://localhost:8080/', + 'api_key': '', + 'model': '' +}) + +# set up mock LLM response that will be used instead of a real LLM +with open('test/resources/pdf_report_llm_response.json', encoding='utf-8') as response_file: + mock_llm_response = json.load(response_file) + +with unittest.mock.patch("requests.post") as mock_request: + # create a mock response whose .json() method returns the list that we define here + mock_response = unittest.mock.Mock() + mock_response.json.return_value = mock_llm_response + mock_request.return_value = mock_response + + # run Meteor, triggering the call to the LLM + results = meteor.run('test/resources/report.pdf', 
backend='LLMExtractor') + + +def test_year(): + assert results['year'] == { + "value": 2023, + "origin": { + "type": "LLM" + } + } + + +def test_language(): + assert results['language'] == { + "value": "no", + "origin": { + "type": "LLM" + } + } + + +def test_title(): + assert results['title'] == { + "value": "Metadataekstrahering – Muligheter og innsikt", + "origin": { + "type": "LLM" + } + } + + +def test_publisher(): + assert results['publisher'] == { + "value": "Nasjonalbiblioteket", + "origin": { + "type": "LLM" + } + } + + +def test_authors(): + expected_dict = [ + { + "firstname": "Bjørnstjerne M.", + "lastname": "Bjørnson", + "origin": { + "type": "LLM" + } + }, + { + "firstname": "Jacobine", + "lastname": "Camilla-Collett", + "origin": { + "type": "LLM" + } + }, + { + "firstname": "Henrik J.", + "lastname": "Ibsen", + "origin": { + "type": "LLM" + } + }, + { + "firstname": "Raymond", + "lastname": "McArthur", + "origin": { + "type": "LLM" + } + }, + { + "firstname": "John", + "lastname": "O’Toole", + "origin": { + "type": "LLM" + } + } + ] + + all_expected_authors_found = True + for author in results['authors']: + if author in expected_dict: + expected_dict.remove(author) + else: + all_expected_authors_found = False + break + + assert len(expected_dict) == 0 and all_expected_authors_found + + +def test_isbn(): + assert results['isbn'] == { + "value": "9788217022985", + "origin": { + "type": "LLM" + } + } + + +def test_issn(): + assert results['issn'] == { + "value": "2464-1162", + "origin": { + "type": "LLM" + } + } diff --git a/test/test_settings.py b/test/test_settings.py index 706fc76..6233019 100644 --- a/test/test_settings.py +++ b/test/test_settings.py @@ -17,4 +17,7 @@ def test_config(): assert settings.REGISTRY_PASSWORD == "" assert settings.USE_GIELLADETECT is False assert settings.GIELLADETECT_LANGS == "" + assert settings.LLM_API_URL == "" + assert settings.LLM_API_KEY == "" + assert settings.LLM_MODEL == "" assert settings.CUSTOM_PATH == ""