From 3617d995be89c9596e2defe3f0e816932c610204 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Wed, 31 Jul 2024 16:52:41 +0300
Subject: [PATCH 01/11] initial rough implementation of LLM integration

---
 metadata_extract/candidate.py       |   1 +
 metadata_extract/llm_extractor.py   | 110 ++++++++++++++++++++++++++++
 metadata_extract/meteor.py          |  13 +++-
 metadata_extract/meteor_document.py |  54 ++++++++++++++
 4 files changed, 174 insertions(+), 4 deletions(-)
 create mode 100644 metadata_extract/llm_extractor.py

diff --git a/metadata_extract/candidate.py b/metadata_extract/candidate.py
index 7d7949b..efa7370 100644
--- a/metadata_extract/candidate.py
+++ b/metadata_extract/candidate.py
@@ -15,6 +15,7 @@ class Origin(Enum):
     COPYRIGHT = 5
     RAPPORT_PREFIX = 6
     LANGUAGE_MODEL = 7
+    LLM = 8
 
 
 class OriginType(TypedDict):
diff --git a/metadata_extract/llm_extractor.py b/metadata_extract/llm_extractor.py
new file mode 100644
index 0000000..cebdd80
--- /dev/null
+++ b/metadata_extract/llm_extractor.py
@@ -0,0 +1,110 @@
+"""The LLM extractor module extracts metadata using an external LLM API service."""
+
+import json
+import os
+import requests
+from .candidate import AuthorType, Candidate, Origin
+from .metadata import Metadata
+from .meteor_document import MeteorDocument
+
+
+class LLMExtractor:
+    """A LLMExtractor object loads a MeteorDocument and fills a Metadata object
+    by performing a call to an external LLM API service."""
+
+    MODEL_NAME = "xxx"  # doesn't matter if running llama.cpp server
+    SYSTEM_PROMPT = "You are a skilled librarian specialized in meticulous " + \
+                    "cataloguing of digital documents."
+    INSTRUCTION = "Extract metadata from this document. Return as JSON."
+    MAX_TOKENS = 1024
+    TEMPERATURE = 0.0
+    TIMEOUT = 30
+
+    def __init__(self, doc: MeteorDocument):
+        self._doc = doc
+        self.metadata = Metadata()
+
+    @classmethod
+    def is_available(cls) -> bool:
+        return cls._api_url() is not None
+
+    def extract_metadata(self) -> None:
+        doc_json = self._doc.extract_text_as_json()
+        response = self._llm_request(doc_json)
+        self._parse_response_to_doc(response)
+
+    def _llm_request(self, doc_json: str) -> str:
+        message = f"{self.INSTRUCTION}\n\n{doc_json}"
+
+        headers = {
+            "Content-Type": "application/json",
+        }
+
+        data = {
+            "model": self.MODEL_NAME,
+            "messages": [
+                {"role": "system", "content": self.SYSTEM_PROMPT},
+                {"role": "user", "content": message},
+            ],
+            "temperature": self.TEMPERATURE,
+            "max_tokens": self.MAX_TOKENS
+        }
+
+        api_response = requests.post(str(self._api_url()),
+                                     headers=headers,
+                                     json=data,
+                                     timeout=self.TIMEOUT)
+
+        api_response.raise_for_status()
+        return str(api_response.json()['choices'][0]['message']['content'])
+
+    def _parse_response_to_doc(self, response: str) -> None:
+        metadata = json.loads(response)
+
+        # language
+        if 'language' in metadata:
+            self.metadata.add_candidate('language', Candidate(metadata['language'], Origin.LLM))
+
+        # title
+        if 'title' in metadata:
+            self.metadata.add_candidate('title', Candidate(metadata['title'], Origin.LLM))
+
+        # creator
+        if 'creator' in metadata:
+            for creator in metadata['creator']:
+                lastname, firstname = creator.split(', ', maxsplit=1)
+                author_dict: AuthorType = {"firstname": firstname, "lastname": lastname}
+                self.metadata.add_candidate('author', Candidate(author_dict, Origin.LLM))
+
+        # year
+        if 'year' in metadata:
+            self.metadata.add_candidate('year', Candidate(metadata['year'], Origin.LLM))
+
+        # publisher
+        if 'publisher' in metadata:
+            for publisher in metadata['publisher']:
+                # FIXME should we look up publisher in registry like Finder does?
+                self.metadata.add_candidate('publisher', Candidate(publisher, Origin.LLM))
+
+        # doi - not supported by Meteor
+
+        # e-isbn
+        if 'e-isbn' in metadata:
+            # This is pretty poor, we just pass the found e-ISBNs (almost never more than one)
+            # to Meteor directly and let it pick one essentially at random
+            for e_isbn in metadata['e-isbn']:
+                self.metadata.add_candidate('ISBN', Candidate(e_isbn, Origin.LLM))
+
+        # p-isbn - Meteor isn't interested in printed ISBNs
+
+        # e-issn
+        if 'e-issn' in metadata:
+            self.metadata.add_candidate('ISSN', Candidate(metadata['e-issn'], Origin.LLM))
+
+        # p-issn - Meteor isn't interested in printed ISSNs
+
+        # type_coar - not supported by Meteor
+
+    @classmethod
+    def _api_url(cls) -> str | None:
+        return os.environ.get('LLM_API_URL')
diff --git a/metadata_extract/meteor.py b/metadata_extract/meteor.py
index 2fffa5a..c58d657 100644
--- a/metadata_extract/meteor.py
+++ b/metadata_extract/meteor.py
@@ -9,6 +9,7 @@
 from .meteor_document import MeteorDocument
 from .metadata import Results
 from .finder import Finder
+from .llm_extractor import LLMExtractor
 
 
 class Meteor:
@@ -43,7 +44,11 @@ def set_language_detection_method(self, detect_language: Callable[[str], str]) -
 
     def run(self, file_path: str) -> Results:
         with MeteorDocument(file_path) as doc:
-            finder = Finder(doc, self.registry, self.detect_language)
-            finder.extract_metadata()
-            finder.metadata.choose_best()
-            return finder.metadata.results
+            extractor: Optional[LLMExtractor | Finder] = None
+            if LLMExtractor.is_available():
+                extractor = LLMExtractor(doc)
+            else:
+                extractor = Finder(doc, self.registry, self.detect_language)
+            extractor.extract_metadata()
+            extractor.metadata.choose_best()
+            return extractor.metadata.results
diff --git a/metadata_extract/meteor_document.py b/metadata_extract/meteor_document.py
index 5bcbb62..48cf657 100644
--- a/metadata_extract/meteor_document.py
+++ b/metadata_extract/meteor_document.py
@@ -4,10 +4,13 @@
 """
 
+import json
 from pathlib import Path
+import re
 from types import TracebackType
 from typing import Optional, Self, Type
 import fitz
+import regex
 from .page import Page
 from .alto_utils import AltoFile
 
@@ -19,6 +22,13 @@ class MeteorDocument:
     content. MeteorDocuments are context managers, so they can be used in `with` statements.
     """
 
+    # text extraction settings for LLM
+    PAGES = [0, 1, 2, 3, 4, 5, 6, 7, -2, -1]  # pages to analyze: first 8 pages + last 2 pages
+    THRESHOLD = 100  # paragraphs shorter than this will always be kept
+    LONG_PARA_PAGES = [0, 1]  # on first two pages, some long paragraphs are accepted
+    LONG_PARA_MAX = 2  # how many long paragraphs to keep on the first two pages
+    PDF_METADATA_SKIP = {'format', 'creator', 'producer'}  # PDF metadata fields not to include
+
     def __init__(self, file_path: str,
                  start: int = 5,
                  end: int = 5):
@@ -92,3 +102,47 @@ def get_page_object(self, page_number: int) -> Page:
             raise ValueError('No PDF file to load page from')
         self.page_objects[page_number] = Page(pdf_page=self.pdfdoc.load_page(page_number - 1))
         return self.page_objects[page_number]
+
+    def extract_text_as_json(self) -> str:
+        """Extract text and metadata as a JSON string suitable for a LLM"""
+
+        if not self.pdfdoc:
+            raise ValueError('No PDF document set')
+
+        pdfinfo = {}
+        pages = []
+
+        for key in self.pdfdoc.metadata.keys():
+            if key not in self.PDF_METADATA_SKIP and self.pdfdoc.metadata.get(key):
+                pdfinfo[key] = self.pdfdoc.metadata.get(key)
+
+        for page in self.PAGES:
+            if page > len(self.pdfdoc) - 2:
+                continue
+
+            texts = []
+            text = self.pdfdoc[page].get_text(sort=True)
+            # Use regular expression to split text into paragraphs
+            # Delimiter: newline(s) followed by an upper case character
+            paragraphs = regex.split(r'\n+(?=\p{Lu})', text, flags=re.UNICODE)
+            long_paragraph_count = 0
+
+            for paragraph in paragraphs:
+                paragraph = " ".join(paragraph.strip().split())
+
+                if '.....' in paragraph or '. . . . .' in paragraph:
+                    # looks like a ToC entry, skip it
+                    continue
+                if len(paragraph) < self.THRESHOLD:  # short paragraph, keep it
+                    texts.append(paragraph)
+                elif page in self.LONG_PARA_PAGES and long_paragraph_count < self.LONG_PARA_MAX:
+                    # allow some long paragraphs on the first two pages
+                    long_paragraph_count += 1
+                    texts.append(paragraph)
+                else:  # must be a long paragraph, skip it
+                    pass
+            text = '\n'.join(texts)
+            if text:
+                pages.append({"page": self.pdfdoc[page].number, "text": text})
+
+        return json.dumps({"pdfinfo": pdfinfo, "pages": pages})

From d9cf0cd6c596320ec17ad48bf8c4db6428a1bd65 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Wed, 31 Jul 2024 17:24:32 +0300
Subject: [PATCH 02/11] increase timeout to avoid issues with long documents

---
 metadata_extract/llm_extractor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/metadata_extract/llm_extractor.py b/metadata_extract/llm_extractor.py
index cebdd80..9912412 100644
--- a/metadata_extract/llm_extractor.py
+++ b/metadata_extract/llm_extractor.py
@@ -18,7 +18,7 @@ class LLMExtractor:
     INSTRUCTION = "Extract metadata from this document. Return as JSON."
     MAX_TOKENS = 1024
     TEMPERATURE = 0.0
-    TIMEOUT = 30
+    TIMEOUT = 120
 
     def __init__(self, doc: MeteorDocument):
         self._doc = doc

From 4ccf5ce8c58177e0df0154428c95429948da46ba Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Wed, 31 Jul 2024 17:39:28 +0300
Subject: [PATCH 03/11] let LLM-generated titles be selected as well

---
 metadata_extract/metadata.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/metadata_extract/metadata.py b/metadata_extract/metadata.py
index f1a855b..d541dab 100644
--- a/metadata_extract/metadata.py
+++ b/metadata_extract/metadata.py
@@ -112,6 +112,11 @@ def choose_title(self) -> Optional[CandidateType]:
                       and not text.has_no_letters(c.value)]
         if page_title:
             return page_title[0].to_dict()
+        llm_title = [c for c in self.candidates['title'] if
+                     c.origin == Origin.LLM and isinstance(c.value, str)
+                     and not text.has_no_letters(c.value)]
+        if llm_title:
+            return llm_title[0].to_dict()
         return None
 
     def choose_publishers(self) -> Optional[CandidateType]:

From e55d3da81cb54283ca52a2a5678c713293b72a70 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Fri, 16 Aug 2024 16:02:48 +0300
Subject: [PATCH 04/11] configure LLM API service using src/settings module

---
 .env.example                      |  5 +++++
 metadata_extract/llm_extractor.py | 35 ++++++++++++++++++-------------
 metadata_extract/meteor.py        | 10 ++++++---
 src/settings.py                   |  3 +++
 src/util.py                       |  8 +++++++
 5 files changed, 44 insertions(+), 17 deletions(-)

diff --git a/.env.example b/.env.example
index aae259a..bc8879f 100644
--- a/.env.example
+++ b/.env.example
@@ -27,5 +27,10 @@ LANGUAGES=mul,eng,nob
 # and optionally, to restrict the detection to a subset of languages
 # GIELLADETECT_LANGS=nno,nob,eng,swe,fin
 
+# To use a LLM API service, specify at least the base URL of an OpenAI-style API
+# LLM_API_URL=http://localhost:8080/
+# LLM_API_KEY=
+# LLM_MODEL=
+
 # To have Meteor run on a different path (only for stage and prod environments), set
 # CUSTOM_PATH=/meteor-custom-path
diff --git a/metadata_extract/llm_extractor.py b/metadata_extract/llm_extractor.py
index 9912412..fb8fed6 100644
--- a/metadata_extract/llm_extractor.py
+++ b/metadata_extract/llm_extractor.py
@@ -1,18 +1,24 @@
 """The LLM extractor module extracts metadata using an external LLM API service."""
 
+from typing import TypedDict
 import json
-import os
 import requests
 from .candidate import AuthorType, Candidate, Origin
 from .metadata import Metadata
 from .meteor_document import MeteorDocument
 
 
+class LLMConfig(TypedDict):
+    """Configuration for LLM API service"""
+    api_url: str
+    api_key: str
+    model: str
+
+
 class LLMExtractor:
     """A LLMExtractor object loads a MeteorDocument and fills a Metadata object
     by performing a call to an external LLM API service."""
 
-    MODEL_NAME = "xxx"  # doesn't matter if running llama.cpp server
     SYSTEM_PROMPT = "You are a skilled librarian specialized in meticulous " + \
                     "cataloguing of digital documents."
     INSTRUCTION = "Extract metadata from this document. Return as JSON."
@@ -20,14 +26,11 @@ class LLMExtractor:
     TEMPERATURE = 0.0
     TIMEOUT = 120
 
-    def __init__(self, doc: MeteorDocument):
+    def __init__(self, doc: MeteorDocument, llm_config: LLMConfig):
         self._doc = doc
+        self._config = llm_config
         self.metadata = Metadata()
 
-    @classmethod
-    def is_available(cls) -> bool:
-        return cls._api_url() is not None
-
     def extract_metadata(self) -> None:
         doc_json = self._doc.extract_text_as_json()
         response = self._llm_request(doc_json)
@@ -36,12 +39,20 @@ def extract_metadata(self) -> None:
     def _llm_request(self, doc_json: str) -> str:
         message = f"{self.INSTRUCTION}\n\n{doc_json}"
 
+        if self._config['api_url'].endswith("/"):
+            url = self._config['api_url'] + "chat/completions"
+        else:
+            url = self._config['api_url'] + "/chat/completions"
+
         headers = {
-            "Content-Type": "application/json",
+            "Content-Type": "application/json"
         }
 
+        if self._config['api_key']:
+            headers['Authorization'] = f'Bearer {self._config["api_key"]}'
+
         data = {
-            "model": self.MODEL_NAME,
+            "model": self._config['model'],
             "messages": [
                 {"role": "system", "content": self.SYSTEM_PROMPT},
                 {"role": "user", "content": message},
@@ -50,7 +61,7 @@ def _llm_request(self, doc_json: str) -> str:
             "max_tokens": self.MAX_TOKENS
         }
 
-        api_response = requests.post(str(self._api_url()),
+        api_response = requests.post(url,
                                      headers=headers,
                                      json=data,
                                      timeout=self.TIMEOUT)
@@ -104,7 +115,3 @@ def _parse_response_to_doc(self, response: str) -> None:
         # p-issn - Meteor isn't interested in printed ISSNs
 
         # type_coar - not supported by Meteor
-
-    @classmethod
-    def _api_url(cls) -> str | None:
-        return os.environ.get('LLM_API_URL')
diff --git a/metadata_extract/meteor.py b/metadata_extract/meteor.py
index c58d657..4cf4514 100644
--- a/metadata_extract/meteor.py
+++ b/metadata_extract/meteor.py
@@ -9,7 +9,7 @@
 from .meteor_document import MeteorDocument
 from .metadata import Results
 from .finder import Finder
-from .llm_extractor import LLMExtractor
+from .llm_extractor import LLMConfig, LLMExtractor
 
 
 class Meteor:
@@ -25,6 +25,7 @@ def __init__(self, languages: Optional[list[str]] = None) -> None:
         self.registry: Optional[PublisherRegistry] = None
         ResourceLoader.load(languages)
         self.detect_language: Callable[[str], Optional[str]] = Meteor.__default_detect
+        self.llm_config: Optional[LLMConfig] = None
 
     @staticmethod
     def __default_detect(text: str) -> Optional[str]:
@@ -42,11 +43,14 @@ def set_registry(self, registry: PublisherRegistry) -> None:
     def set_language_detection_method(self, detect_language: Callable[[str], str]) -> None:
         self.detect_language = detect_language
 
+    def set_llm_config(self, llm_config: LLMConfig) -> None:
+        self.llm_config = llm_config
+
     def run(self, file_path: str) -> Results:
         with MeteorDocument(file_path) as doc:
             extractor: Optional[LLMExtractor | Finder] = None
-            if LLMExtractor.is_available():
-                extractor = LLMExtractor(doc)
+            if self.llm_config:
+                extractor = LLMExtractor(doc, self.llm_config)
             else:
                 extractor = Finder(doc, self.registry, self.detect_language)
             extractor.extract_metadata()
diff --git a/src/settings.py b/src/settings.py
index c3ebafc..24fbf6b 100644
--- a/src/settings.py
+++ b/src/settings.py
@@ -25,6 +25,9 @@ class Settings(BaseSettings):
     USE_GIELLADETECT: bool = False
     GIELLADETECT_LANGS: str = ""
     CUSTOM_PATH: str = ""
+    LLM_API_URL: str = ""
+    LLM_API_KEY: str = ""
+    LLM_MODEL: str = ""
 
 
 settings = Settings()
diff --git a/src/util.py b/src/util.py
index c9e7aca..7d59c25 100644
--- a/src/util.py
+++ b/src/util.py
@@ -47,6 +47,14 @@ def __init__(self) -> None:
             self.meteor.set_language_detection_method(
                 lambda t: gielladetect.detect(t, langs=langs)
             )
+        if get_settings().LLM_API_URL:
+            self.meteor.set_llm_config(
+                llm_config={
+                    'api_url': get_settings().LLM_API_URL,
+                    'api_key': get_settings().LLM_API_KEY,
+                    'model': get_settings().LLM_MODEL
+                }
+            )
 
     @staticmethod
     def get_languages() -> Optional[list[str]]:

From 52b3de62b7c8eee54e0d374f787be7fe9bff94f8 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Fri, 16 Aug 2024 16:53:23 +0300
Subject: [PATCH 05/11] allow selecting between Finder and LLMExtractor in API
 methods and UI form

---
 main.py                    |  6 ++++--
 metadata_extract/meteor.py |  4 ++--
 src/routes/extract.py      | 20 ++++++++++++--------
 src/util.py                | 10 +++++++++-
 templates/index.html       |  7 +++++++
 5 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/main.py b/main.py
index d78ee97..4e7e390 100644
--- a/main.py
+++ b/main.py
@@ -64,7 +64,8 @@ async def get_front_page_html(request: Request) -> Response:
         "index.html",
         {
             "request": request,
-            "root_path": Utils.get_environment_prefix()
+            "root_path": Utils.get_environment_prefix(),
+            "backends": Utils.get_available_backends()
         }
     )
 
@@ -78,7 +79,8 @@ def display_error_message_in_template(request: Request, exc: StarletteHTTPExcept
             "results": {
                 'error': str(exc.detail)
             },
-            "root_path": Utils.get_environment_prefix()
+            "root_path": Utils.get_environment_prefix(),
+            "backends": Utils.get_available_backends()
         },
         status_code=exc.status_code
     )
diff --git a/metadata_extract/meteor.py b/metadata_extract/meteor.py
index 4cf4514..cba433e 100644
--- a/metadata_extract/meteor.py
+++ b/metadata_extract/meteor.py
@@ -46,10 +46,10 @@ def set_language_detection_method(self, detect_language: Callable[[str], str]) -
     def set_llm_config(self, llm_config: LLMConfig) -> None:
         self.llm_config = llm_config
 
-    def run(self, file_path: str) -> Results:
+    def run(self, file_path: str, backend: Optional[str]) -> Results:
         with MeteorDocument(file_path) as doc:
             extractor: Optional[LLMExtractor | Finder] = None
-            if self.llm_config:
+            if backend and backend.lower() == 'llmextractor':
                 extractor = LLMExtractor(doc, self.llm_config)
             else:
                 extractor = Finder(doc, self.registry, self.detect_language)
diff --git a/src/routes/extract.py b/src/routes/extract.py
index c89650f..4d27eb2 100644
--- a/src/routes/extract.py
+++ b/src/routes/extract.py
@@ -5,7 +5,7 @@
 
 from typing import Annotated, Optional
 
-from fastapi import APIRouter, Depends, HTTPException
+from fastapi import APIRouter, Depends, HTTPException, Query
 from starlette.datastructures import UploadFile
 from starlette.requests import Request
 from starlette.responses import HTMLResponse, JSONResponse, Response
@@ -31,17 +31,18 @@ async def post_pdf_html(
     form = await request.form()
     file_input = form.get('fileInput')
     file_url = form.get('fileUrl')
+    backend = form.get('backend')
 
     if file_url != "" and isinstance(file_url, str):
         utils.verify_url(file_url)
         filename: Optional[str] = file_url
         filepath = utils.download_file(file_url)
-        results = utils.process_and_remove(filename, filepath)
+        results = utils.process_and_remove(filename, filepath, backend=backend)
     elif file_input is not None and isinstance(file_input, UploadFile):
         utils.verify_file(file_input)
         filename = file_input.filename
         filepath = utils.save_file(file_input)
-        results = utils.process_and_remove(filename, filepath)
+        results = utils.process_and_remove(filename, filepath, backend=backend)
     else:
         raise HTTPException(400)
     return templates.TemplateResponse(
@@ -53,7 +54,8 @@ async def post_pdf_html(
             "filepath": filepath,
             "filename": filename,
             "results": results,
-            "root_path": utils.get_environment_prefix()
+            "root_path": utils.get_environment_prefix(),
+            "backends": utils.get_available_backends()
         }
     )
 
@@ -68,15 +70,16 @@ async def post_pdf_json(
     form = await request.form()
     file_input = form.get('fileInput')
     file_url = form.get('fileUrl')
+    backend = form.get('backend')
 
     if file_url != "" and isinstance(file_url, str):
         utils.verify_url(file_url)
         filepath = utils.download_file(file_url)
-        results = utils.process_and_remove(file_url, filepath, delete_immediately=True)
+        results = utils.process_and_remove(file_url, filepath, backend=backend, delete_immediately=True)
     elif file_input is not None and isinstance(file_input, UploadFile):
         utils.verify_file(file_input)
         filepath = utils.save_file(file_input)
-        results = utils.process_and_remove(file_input.filename, filepath, delete_immediately=True)
+        results = utils.process_and_remove(file_input.filename, filepath, backend=backend, delete_immediately=True)
     else:
         raise HTTPException(400)
     return JSONResponse(results)
@@ -85,13 +88,14 @@ async def post_pdf_json(
 @router.get("/file/{file_name}", response_class=JSONResponse, status_code=200)
 async def get_metadata_from_file_on_disk(
         file_name: str,
-        conf: Annotated[Settings, Depends(get_settings)]
+        conf: Annotated[Settings, Depends(get_settings)],
+        backend: Optional[str] = Query(None)  # Define the optional query parameter
 ) -> JSONResponse:
     """
     Extract metadata from a file on disk and return it as JSON
     """
     try:
-        results = utils.meteor.run(conf.MOUNT_FOLDER + '/' + file_name)
+        results = utils.meteor.run(conf.MOUNT_FOLDER + '/' + file_name, backend=backend)
     except Exception:
         return JSONResponse({"error": f"Error while processing {file_name}"})
     return JSONResponse(results)
diff --git a/src/util.py b/src/util.py
index 7d59c25..db044fa 100644
--- a/src/util.py
+++ b/src/util.py
@@ -68,6 +68,13 @@ def get_environment_prefix() -> str:
             return ""
         return get_settings().CUSTOM_PATH or "/meteor"
 
+    @staticmethod
+    def get_available_backends() -> list[str]:
+        backends = ['Finder']
+        if get_settings().LLM_API_URL:
+            backends.append('LLMExtractor')
+        return backends
+
     @staticmethod
     def verify_file(file: UploadFile) -> None:
         size_limit = int(get_settings().MAX_FILE_SIZE_MB)
@@ -112,10 +119,11 @@ def process_and_remove(
             self,
             filename: Optional[str],
             filepath: str,
+            backend: str,
             delete_immediately: bool = False
     ) -> Union[Error, Results]:
         try:
-            results = self.meteor.run(filepath)
+            results = self.meteor.run(filepath, backend)
             return results
         except Exception as exc:
             print(traceback.format_exc())
diff --git a/templates/index.html b/templates/index.html
index 4eb8232..35446d4 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -13,6 +13,13 @@

METEOR +
+ +

or drop a PDF file

From 9f28c281340f9e46a0c71e8dd425e7dd9b632c Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Fri, 16 Aug 2024 17:04:30 +0300
Subject: [PATCH 06/11] fix pylint warnings (wrap long lines)

---
 src/routes/extract.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/routes/extract.py b/src/routes/extract.py
index 4d27eb2..9f8e754 100644
--- a/src/routes/extract.py
+++ b/src/routes/extract.py
@@ -75,11 +75,13 @@ async def post_pdf_json(
     if file_url != "" and isinstance(file_url, str):
         utils.verify_url(file_url)
         filepath = utils.download_file(file_url)
-        results = utils.process_and_remove(file_url, filepath, backend=backend, delete_immediately=True)
+        results = utils.process_and_remove(
+            file_url, filepath, backend=backend, delete_immediately=True)
     elif file_input is not None and isinstance(file_input, UploadFile):
         utils.verify_file(file_input)
         filepath = utils.save_file(file_input)
-        results = utils.process_and_remove(file_input.filename, filepath, backend=backend, delete_immediately=True)
+        results = utils.process_and_remove(
+            file_input.filename, filepath, backend=backend, delete_immediately=True)
     else:
         raise HTTPException(400)
     return JSONResponse(results)

From 49ea7a99cd805408da73baca850c2731602bab44 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Fri, 16 Aug 2024 17:06:24 +0300
Subject: [PATCH 07/11] set default value of backend to None

---
 metadata_extract/meteor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/metadata_extract/meteor.py b/metadata_extract/meteor.py
index cba433e..c739502 100644
--- a/metadata_extract/meteor.py
+++ b/metadata_extract/meteor.py
@@ -46,7 +46,7 @@ def set_language_detection_method(self, detect_language: Callable[[str], str]) -
     def set_llm_config(self, llm_config: LLMConfig) -> None:
         self.llm_config = llm_config
 
-    def run(self, file_path: str, backend: Optional[str]) -> Results:
+    def run(self, file_path: str, backend: Optional[str] = None) -> Results:
         with MeteorDocument(file_path) as doc:
             extractor: Optional[LLMExtractor | Finder] = None
             if backend and backend.lower() == 'llmextractor':

From 3fb20a1c71a40e55043f6fdbe2af109909426a6f Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Fri, 16 Aug 2024 17:15:26 +0300
Subject: [PATCH 08/11] fix mypy warnings

---
 metadata_extract/meteor.py | 2 +-
 src/routes/extract.py      | 4 ++--
 src/util.py                | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/metadata_extract/meteor.py b/metadata_extract/meteor.py
index c739502..98af19a 100644
--- a/metadata_extract/meteor.py
+++ b/metadata_extract/meteor.py
@@ -49,7 +49,7 @@ def set_llm_config(self, llm_config: LLMConfig) -> None:
     def run(self, file_path: str, backend: Optional[str] = None) -> Results:
         with MeteorDocument(file_path) as doc:
             extractor: Optional[LLMExtractor | Finder] = None
-            if backend and backend.lower() == 'llmextractor':
+            if backend and backend.lower() == 'llmextractor' and self.llm_config:
                 extractor = LLMExtractor(doc, self.llm_config)
             else:
                 extractor = Finder(doc, self.registry, self.detect_language)
diff --git a/src/routes/extract.py b/src/routes/extract.py
index 9f8e754..f147b54 100644
--- a/src/routes/extract.py
+++ b/src/routes/extract.py
@@ -31,7 +31,7 @@ async def post_pdf_html(
     form = await request.form()
     file_input = form.get('fileInput')
     file_url = form.get('fileUrl')
-    backend = form.get('backend')
+    backend = str(form.get('backend'))
 
     if file_url != "" and isinstance(file_url, str):
         utils.verify_url(file_url)
@@ -70,7 +70,7 @@ async def post_pdf_json(
     form = await request.form()
     file_input = form.get('fileInput')
     file_url = form.get('fileUrl')
-    backend = form.get('backend')
+    backend = str(form.get('backend'))
 
     if file_url != "" and isinstance(file_url, str):
         utils.verify_url(file_url)
diff --git a/src/util.py b/src/util.py
index db044fa..2896aba 100644
--- a/src/util.py
+++ b/src/util.py
@@ -119,7 +119,7 @@ def process_and_remove(
             self,
             filename: Optional[str],
             filepath: str,
-            backend: str,
+            backend: Optional[str] = None,
             delete_immediately: bool = False
     ) -> Union[Error, Results]:
         try:

From d5e3caf9a345f4d6294a9b9e745c85c9679b2f24 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Mon, 19 Aug 2024 11:23:47 +0300
Subject: [PATCH 09/11] add unit tests for LLM extraction; make year an integer

---
 metadata_extract/llm_extractor.py           |   2 +-
 test/resources/pdf_report_llm_response.json |   1 +
 test/test_pdf_llm.py                        | 133 ++++++++++++++++++++
 test/test_settings.py                       |   3 +
 4 files changed, 138 insertions(+), 1 deletion(-)
 create mode 100644 test/resources/pdf_report_llm_response.json
 create mode 100644 test/test_pdf_llm.py

diff --git a/metadata_extract/llm_extractor.py b/metadata_extract/llm_extractor.py
index fb8fed6..0b7f19a 100644
--- a/metadata_extract/llm_extractor.py
+++ b/metadata_extract/llm_extractor.py
@@ -89,7 +89,7 @@ def _parse_response_to_doc(self, response: str) -> None:
 
         # year
         if 'year' in metadata:
-            self.metadata.add_candidate('year', Candidate(metadata['year'], Origin.LLM))
+            self.metadata.add_candidate('year', Candidate(int(metadata['year']), Origin.LLM))
 
         # publisher
         if 'publisher' in metadata:
diff --git a/test/resources/pdf_report_llm_response.json b/test/resources/pdf_report_llm_response.json
new file mode 100644
index 0000000..8ef3708
--- /dev/null
+++ b/test/resources/pdf_report_llm_response.json
@@ -0,0 +1 @@
+{"choices": [{"finish_reason": "stop", "index": 0, "message": {"content": "{\"language\": \"no\", \"title\": \"Metadataekstrahering \\u2013 Muligheter og innsikt\", \"creator\": [\"Bj\\u00f8rnson, Bj\\u00f8rnstjerne M.\", \"Camilla-Collett, Jacobine\", \"Ibsen, Henrik J.\", \"McArthur, Raymond\", \"O\\u2019Toole, John\"], \"year\": \"2023\", \"publisher\": [\"Nasjonalbiblioteket\"], \"e-isbn\": [\"9788217022985\"], \"e-issn\": \"2464-1162\", \"type_coar\": \"research report\"}", "role": "assistant"}}], "created": 1724051181, "model": "", "object": "chat.completion", "usage": {"completion_tokens": 154, "prompt_tokens": 457, "total_tokens": 611}, "id": "chatcmpl-AWIMAJoFRqEr8w8vStW9LC3hqOCWdDXw"}
diff --git a/test/test_pdf_llm.py b/test/test_pdf_llm.py
new file mode 100644
index 0000000..c292227
--- /dev/null
+++ b/test/test_pdf_llm.py
@@ -0,0 +1,133 @@
+# pylint: disable=R0801
+
+"""Test the output from Meteor.run on a sample PDF file using LLM extraction"""
+
+
+import json
+import unittest.mock
+from metadata_extract.meteor import Meteor
+
+
+meteor = Meteor()
+meteor.set_llm_config({
+    'api_url': 'http://localhost:8080/',
+    'api_key': '',
+    'model': ''
+})
+
+# set up mock LLM response that will be used instead of a real LLM
+with open('test/resources/pdf_report_llm_response.json', encoding='utf-8') as response_file:
+    mock_llm_response = json.load(response_file)
+
+with unittest.mock.patch("requests.post") as mock_request:
+    # create a mock response whose .json() method returns the list that we define here
+    mock_response = unittest.mock.Mock()
+    mock_response.json.return_value = mock_llm_response
+    mock_request.return_value = mock_response
+
+    # run Meteor, triggering the call to the LLM
+    results = meteor.run('test/resources/report.pdf', backend='LLMExtractor')
+
+
+def test_year():
+    assert results['year'] == {
+        "value": 2023,
+        "origin": {
+            "type": "LLM"
+        }
+    }
+
+
+def test_language():
+    assert results['language'] == {
+        "value": "no",
+        "origin": {
+            "type": "LLM"
+        }
+    }
+
+
+def test_title():
+    assert results['title'] == {
+        "value": "Metadataekstrahering – Muligheter og innsikt",
+        "origin": {
+            "type": "LLM"
+        }
+    }
+
+
+def test_publisher():
+    assert results['publisher'] == {
+        "value": "Nasjonalbiblioteket",
+        "origin": {
+            "type": "LLM"
+        }
+    }
+
+
+def test_authors():
+    expected_dict = [
+        {
+            "firstname": "Bjørnstjerne M.",
+            "lastname": "Bjørnson",
+            "origin": {
+                "type": "LLM"
+            }
+        },
+        {
+            "firstname": "Jacobine",
+            "lastname": "Camilla-Collett",
+            "origin": {
+                "type": "LLM"
+            }
+        },
+        {
+            "firstname": "Henrik J.",
+            "lastname": "Ibsen",
+            "origin": {
+                "type": "LLM"
+            }
+        },
+        {
+            "firstname": "Raymond",
+            "lastname": "McArthur",
+            "origin": {
+                "type": "LLM"
+            }
+        },
+        {
+            "firstname": "John",
+            "lastname": "O’Toole",
+            "origin": {
+                "type": "LLM"
+            }
+        }
+    ]
+
+    all_expected_authors_found = True
+    for author in results['authors']:
+        if author in expected_dict:
+            expected_dict.remove(author)
+        else:
+            all_expected_authors_found = False
+            break
+
+    assert len(expected_dict) == 0 and all_expected_authors_found
+
+
+def test_isbn():
+    assert results['isbn'] == {
+        "value": "9788217022985",
+        "origin": {
+            "type": "LLM"
+        }
+    }
+
+
+def test_issn():
+    assert results['issn'] == {
+        "value": "2464-1162",
+        "origin": {
+            "type": "LLM"
+        }
+    }
diff --git a/test/test_settings.py b/test/test_settings.py
index 706fc76..6233019 100644
--- a/test/test_settings.py
+++ b/test/test_settings.py
@@ -17,4 +17,7 @@ def test_config():
     assert settings.REGISTRY_PASSWORD == ""
     assert settings.USE_GIELLADETECT is False
     assert settings.GIELLADETECT_LANGS == ""
+    assert settings.LLM_API_URL == ""
+    assert settings.LLM_API_KEY == ""
+    assert settings.LLM_MODEL == ""
     assert settings.CUSTOM_PATH == ""

From 70b2dae567159f6917f0f87d793d7b61dd2e7fe6 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Mon, 19 Aug 2024 13:40:38 +0300
Subject: [PATCH 10/11] implement registry lookups for LLMExtractor

---
 metadata_extract/llm_extractor.py | 14 ++++++++++----
 metadata_extract/meteor.py        |  2 +-
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/metadata_extract/llm_extractor.py b/metadata_extract/llm_extractor.py
index 0b7f19a..abce339 100644
--- a/metadata_extract/llm_extractor.py
+++ b/metadata_extract/llm_extractor.py
@@ -1,11 +1,12 @@
 """The LLM extractor module extracts metadata using an external LLM API service."""
 
-from typing import TypedDict
+from typing import TypedDict, Optional
 import json
 import requests
 from .candidate import AuthorType, Candidate, Origin
 from .metadata import Metadata
 from .meteor_document import MeteorDocument
+from .registry import PublisherRegistry
 
 
 class LLMConfig(TypedDict):
@@ -26,8 +27,11 @@ class LLMExtractor:
     TEMPERATURE = 0.0
     TIMEOUT = 120
 
-    def __init__(self, doc: MeteorDocument, llm_config: LLMConfig):
+    def __init__(self, doc: MeteorDocument,
+                 registry: Optional[PublisherRegistry],
+                 llm_config: LLMConfig):
         self._doc = doc
+        self._registry = registry
         self._config = llm_config
         self.metadata = Metadata()
 
@@ -94,8 +98,10 @@ def _parse_response_to_doc(self, response: str) -> None:
         # publisher
         if 'publisher' in metadata:
             for publisher in metadata['publisher']:
-                # FIXME should we look up publisher in registry like Finder does?
-                self.metadata.add_candidate('publisher', Candidate(publisher, Origin.LLM))
+                publisher_candidate = Candidate(publisher, Origin.LLM)
+                if self._registry:
+                    publisher_candidate.reg_entries = self._registry.search(publisher)
+                self.metadata.add_candidate('publisher', publisher_candidate)
 
         # doi - not supported by Meteor
 
diff --git a/metadata_extract/meteor.py b/metadata_extract/meteor.py
index 98af19a..309cebf 100644
--- a/metadata_extract/meteor.py
+++ b/metadata_extract/meteor.py
@@ -50,7 +50,7 @@ def run(self, file_path: str, backend: Optional[str] = None) -> Results:
         with MeteorDocument(file_path) as doc:
             extractor: Optional[LLMExtractor | Finder] = None
             if backend and backend.lower() == 'llmextractor' and self.llm_config:
-                extractor = LLMExtractor(doc, self.llm_config)
+                extractor = LLMExtractor(doc, self.registry, self.llm_config)
             else:
                 extractor = Finder(doc, self.registry, self.detect_language)
             extractor.extract_metadata()

From d600d52189c9deb3d819c8d3d971057ffdd1ad6b Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Mon, 19 Aug 2024 16:50:16 +0300
Subject: [PATCH 11/11] robustness fix for mononyms

---
 metadata_extract/llm_extractor.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/metadata_extract/llm_extractor.py b/metadata_extract/llm_extractor.py
index abce339..1a15fe2 100644
--- a/metadata_extract/llm_extractor.py
+++ b/metadata_extract/llm_extractor.py
@@ -87,7 +87,11 @@ def _parse_response_to_doc(self, response: str) -> None:
         # creator
         if 'creator' in metadata:
             for creator in metadata['creator']:
-                lastname, firstname = creator.split(', ', maxsplit=1)
+                if ', ' in creator:
+                    lastname, firstname = creator.split(', ', maxsplit=1)
+                else:
+                    lastname = creator
+                    firstname = ""
                 author_dict: AuthorType = {"firstname": firstname, "lastname": lastname}
                 self.metadata.add_candidate('author', Candidate(author_dict, Origin.LLM))