From db38852ae95a5fce51b06c016eef8118c2ffe323 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 25 May 2022 23:12:02 +0200 Subject: [PATCH 01/44] Cache functionality added. Main tests pass --- ocrd_models/ocrd_models/ocrd_mets.py | 191 ++++++++++++++++- ocrd_models/ocrd_models/ocrd_xml_base.py | 6 +- requirements_test.txt | 1 + tests/model/test_ocrd_mets_bench.py | 250 +++++++++++++++++++++++ 4 files changed, 442 insertions(+), 6 deletions(-) create mode 100644 tests/model/test_ocrd_mets_bench.py diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 8161684c58..592441d3d1 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -4,6 +4,7 @@ from datetime import datetime import re from lxml import etree as ET +from copy import deepcopy from ocrd_utils import ( is_local_filename, @@ -41,7 +42,7 @@ class OcrdMets(OcrdXmlDocument): """ @staticmethod - def empty_mets(now=None): + def empty_mets(now=None, cache_flag=False): """ Create an empty METS file from bundled template. 
""" @@ -50,7 +51,7 @@ def empty_mets(now=None): tpl = METS_XML_EMPTY.decode('utf-8') tpl = tpl.replace('{{ VERSION }}', VERSION) tpl = tpl.replace('{{ NOW }}', '%s' % now) - return OcrdMets(content=tpl.encode('utf-8')) + return OcrdMets(content=tpl.encode('utf-8'), cache_flag=cache_flag) def __init__(self, **kwargs): """ @@ -58,12 +59,76 @@ def __init__(self, **kwargs): """ super(OcrdMets, self).__init__(**kwargs) + # If cache is enabled + if self._cache_flag: + # Cache for the fileGrps (mets:fileGrp) - a dictionary with Key and Value pair: + # Key: 'fileGrp.USE' + # Value: a 'fileGrp' object at some memory location + self._fileGrp_cache = {} + + # Cache for the files (mets:file) - two nested dictionaries + # The outer dictionary's Key: 'fileGrp.USE' + # The outer dictionary's Value: Inner dictionary + # The inner dictionary's Key: 'file.ID' + # The inner dictionary's Value: a 'file' object at some memory location + self._file_cache = {} + + # Note, if the empty_mets() function is used to instantiate OcrdMets + # Then the cache is empty even after this operation + self._fill_caches() + + def __exit__(self): + if self._cache_flag: + self._clear_caches() + def __str__(self): """ String representation """ return 'OcrdMets[fileGrps=%s,files=%s]' % (self.file_groups, list(self.find_files())) + def _fill_caches(self): + """ + Fills the caches with fileGrps and FileIDs + """ + + tree_root = self._tree.getroot() + el_fileGrp_list = tree_root.find(".//mets:fileSec", NS) + + for el_fileGrp in el_fileGrp_list: + fileGrp_use = el_fileGrp.get('USE') + + self._fileGrp_cache[fileGrp_use] = el_fileGrp + print("_fill_caches> file group added to the cache: %s" % fileGrp_use) + + # Assign an empty dictionary that will hold the files of the added fileGrp + self._file_cache[fileGrp_use] = {} + + for el_file in el_fileGrp: + file_id_ = el_file.get('ID') + self._file_cache[fileGrp_use].update({file_id : el_file}) + print("_fill_caches> file added to the cache: %s" % file_id) + + 
print("_fill_caches> total fileGrp cache elements: %s" % len(self._fileGrp_cache)) + + def _clear_caches(self): + """ + Deallocates the caches + """ + + fileGrp_counter = 0 + + for key in list(self._fileGrp_cache): + del self._fileGrp_cache[key] + fileGrp_counter += 1 + + # print("_clear_caches> total cleared fileGrp cache elements: %d" % fileGrp_counter) + + for key in list(self._file_cache): + for inner_key in list(self._file_cache[key]): + del self._file_cache[key][inner_key] + del self._file_cache[key] + @property def unique_identifier(self): """ @@ -127,6 +192,22 @@ def find_all_files(self, *args, **kwargs): Equivalent to ``list(self.find_files(...))`` """ + + # If only the fileGrp parameter has been passed + # Return a list with all files of that fileGrp from the cache + if self._cache_flag: + if 'fileGrp' in kwargs and 'ID' not in kwargs and 'pageId' not in kwargs and 'mimetype' not in kwargs and 'url' not in kwargs: + fileGrp = kwargs['fileGrp'] + if fileGrp in self._file_cache: + # print("fileGrp[%s] is in the cache!" % fileGrp) + files = [] + for file in self._file_cache[fileGrp]: + files.append(OcrdFile(self._file_cache[fileGrp][file], mets=self)) + return files + else: + # print("fileGrp[%s] not in the cache!" % fileGrp) + return [] + return list(self.find_files(*args, **kwargs)) # pylint: disable=multiple-statements @@ -183,6 +264,51 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None mimetype = re.compile(mimetype[REGEX_PREFIX_LEN:]) if url and url.startswith(REGEX_PREFIX): url = re.compile(url[REGEX_PREFIX_LEN:]) + + + # If the cache is enabled, search inside + if self._cache_flag: + el_file = None + + # If both ID and fileGrp has been passed as a parameter + if ID and fileGrp: + if fileGrp in self._file_cache: + if ID in self._file_cache[fileGrp]: + # print("_+_ID in cache![%s][%s]" % (ID, fileGrp)) + # print("1 find_files> Found %s in the cache." 
% self._file_cache[fileGrp]) + el_file = self._file_cache[fileGrp][ID] + + # If only ID has been passed as a parameter + elif ID: + for fileGrp_str in self._file_cache: + # print("_-_Checking ID[%s] inside [%s]" % (ID, fileGrp_str)) + if ID in self._file_cache[fileGrp_str]: + # print("_-_ID in cache![%s]" % ID) + # print("2 find_files> Found %s in the cache." % self._file_cache[fileGrp]) + el_file = self._file_cache[fileGrp_str][ID] + + # If the file has been found in the cache + if el_file is not None: + # print("_-_el_file found ID[%s]" % el_file) + # TODO: This should be implemented in a more convinient way + # Why instantiating an OcrdFile then checking if the url is local? + # Couldn't we do that before instantiation of OcrdFile? + f = OcrdFile(el_file, mets=self) + if local_only and not is_local_filename(f.url): + pass + else: + yield f + return + else: + # print("_-_el_file not found ID[%s]" % el_file) + # If there are no other searching parameters set, leave the function + if pageId is None and mimetype is None and url is None: + return + + # print("_-_find_files> ID [%s] not in the cache." 
% ID) + + # Use the old routine if cache is not enabled + # Or parameters other than ID and fileGrp are passed for cand in self._tree.getroot().xpath('//mets:file', namespaces=NS): if ID: if isinstance(ID, str): @@ -216,6 +342,7 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None if not url.fullmatch(cand_url): continue f = OcrdFile(cand, mets=self) + # print("___Generating file with ID: %s" % f.ID) # If only local resources should be returned and f is not a file path: skip the file if local_only and not is_local_filename(f.url): @@ -238,6 +365,13 @@ def add_file_group(self, fileGrp): if el_fileGrp is None: el_fileGrp = ET.SubElement(el_fileSec, TAG_METS_FILEGRP) el_fileGrp.set('USE', fileGrp) + + # Add the fileGrp to both caches + if self._cache_flag: + self._fileGrp_cache[fileGrp] = el_fileGrp + # Assign an empty dictionary that will hold the files of the added fileGrp + self._file_cache[fileGrp] = {} + return el_fileGrp def rename_file_group(self, old, new): @@ -249,6 +383,11 @@ def rename_file_group(self, old, new): raise FileNotFoundError("No such fileGrp '%s'" % old) el_fileGrp.set('USE', new) + # Rename the fileGrp in both caches + if self._cache_flag: + self._fileGrp_cache[new] = self._fileGrp_cache.pop(old) + self._file_cache[new] = copy.deepcopy(self._file_cache.pop(old)) + def remove_file_group(self, USE, recursive=False, force=False): """ Remove a ``mets:fileGrp`` (single fixed ``@USE`` or multiple regex ``@USE``) @@ -285,6 +424,17 @@ def remove_file_group(self, USE, recursive=False, force=False): raise Exception("fileGrp %s is not empty and recursive wasn't set" % USE) for f in files: self.remove_one_file(f.get('ID')) + + # Remove the fileGrp from the caches + if self._cache_flag: + del self._fileGrp_cache[el_fileGrp.get('USE')] + + # Note: Since the files inside the group are removed + # with the 'remove_one_file method' above, + # we should not take care of that again. + # We just remove the fileGrp. 
+ del self._file_cache[el_fileGrp.get('USE')] + el_fileGrp.getparent().remove(el_fileGrp) def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force=False, local_filename=None, ignore=False, **kwargs): @@ -310,10 +460,25 @@ def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force raise ValueError("Invalid syntax for mets:file/@ID %s (not an xs:ID)" % ID) if not REGEX_FILE_ID.fullmatch(fileGrp): raise ValueError("Invalid syntax for mets:fileGrp/@USE %s (not an xs:ID)" % fileGrp) - el_fileGrp = self._tree.getroot().find(".//mets:fileGrp[@USE='%s']" % (fileGrp), NS) + + el_fileGrp = None + + # If cache is enabled, check there + if self._cache_flag: + if fileGrp in self._fileGrp_cache: + el_fileGrp = self._fileGrp_cache[fileGrp] + + # cache is not enabled or fileGrp not in the cache + if el_fileGrp is None: + el_fileGrp = self._tree.getroot().find(".//mets:fileGrp[@USE='%s']" % (fileGrp), NS) + + # the fileGrp is not in the XML tree as well if el_fileGrp is None: el_fileGrp = self.add_file_group(fileGrp) - mets_file = next(self.find_files(ID=ID), None) + + # Since we are sure that fileGrp parameter is set, + # we could send that parameter to find_files for direct search + mets_file = next(self.find_files(ID=ID, fileGrp=fileGrp), None) if mets_file and not ignore: if not force: raise Exception("File with ID='%s' already exists" % ID) @@ -324,7 +489,13 @@ def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force mets_file.local_filename = local_filename else: kwargs = {k: v for k, v in locals().items() if k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v} - mets_file = OcrdFile(ET.SubElement(el_fileGrp, TAG_METS_FILE), mets=self, **kwargs) + el_mets_file = ET.SubElement(el_fileGrp, TAG_METS_FILE) + mets_file = OcrdFile(el_mets_file, mets=self, **kwargs) + + # Add the file to the cache + if self._cache_flag: + # print("add_file> Adding to the cache.[%s]" % ID) + 
self._file_cache[fileGrp].update({ID: el_mets_file}) return mets_file @@ -377,6 +548,16 @@ def remove_one_file(self, ID): log.info("Delete empty page %s", page_div) page_div.getparent().remove(page_div) + # Remove the file from the file cache + if self._cache_flag: + parent_use = ocrd_file._el.getparent().get('USE') + # Note: if the file is in the XML tree, + # it should alse be in the file cache. + # Anyway, we perform the checks, then remove + if parent_use in self._file_cache: + if ocrd_file.ID in self._file_cache[parent_use]: + del self._file_cache[parent_use][ocrd_file.ID] + # Delete the file reference # pylint: disable=protected-access ocrd_file._el.getparent().remove(ocrd_file._el) diff --git a/ocrd_models/ocrd_models/ocrd_xml_base.py b/ocrd_models/ocrd_models/ocrd_xml_base.py index 2235a8b57d..7faefbad99 100644 --- a/ocrd_models/ocrd_models/ocrd_xml_base.py +++ b/ocrd_models/ocrd_models/ocrd_xml_base.py @@ -16,11 +16,12 @@ class OcrdXmlDocument(): Base class for XML documents loaded from either content or filename. 
""" - def __init__(self, filename=None, content=None): + def __init__(self, filename=None, content=None, cache_flag=False): """ Args: filename (string): content (string): + cache_flag (bool): """ # print(self, filename, content) if filename is None and content is None: @@ -34,6 +35,9 @@ def __init__(self, filename=None, content=None): raise Exception('File does not exist: %s' % filename) self._tree.parse(filename) + # Cache enabled - True/False + self._cache_flag = cache_flag + def to_xml(self, xmllint=False): """ Serialize all properties as pretty-printed XML diff --git a/requirements_test.txt b/requirements_test.txt index db01cd8dd5..8b7997b1a0 100644 --- a/requirements_test.txt +++ b/requirements_test.txt @@ -1,6 +1,7 @@ autopep8 pytest >= 4.0.0 generateDS == 2.35.20 +pytest-benchmark >= 3.2.3 coverage >= 4.5.2 sphinx sphinx_click diff --git a/tests/model/test_ocrd_mets_bench.py b/tests/model/test_ocrd_mets_bench.py new file mode 100644 index 0000000000..673f3782a6 --- /dev/null +++ b/tests/model/test_ocrd_mets_bench.py @@ -0,0 +1,250 @@ +# -*- coding: utf-8 -*- + +from contextlib import contextmanager +from time import time + +from pytest import main, fixture, mark + +from ocrd import Resolver +from ocrd_utils import MIME_TO_EXT, getLogger +from ocrd_models import OcrdMets + +import pprint + +# LOG = getLogger('ocrd.benchmark.mets') + +GRPS_REG = ['SEG-REG', 'SEG-REPAIR', 'SEG-REG-DESKEW', 'SEG-REG-DESKEW-CLIP', 'SEG-LINE', 'SEG-REPAIR-LINE', 'SEG-LINE-RESEG-DEWARP'] +GRPS_IMG = ['FULL', 'PRESENTATION', 'BIN', 'CROP', 'BIN2', 'BIN-DENOISE', 'BIN-DENOISE-DESKEW', 'OCR'] + +REGIONS_PER_PAGE = 10 +LINES_PER_REGION = 2 +FILES_PER_PAGE = len(GRPS_IMG) * LINES_PER_REGION + len(GRPS_REG) * REGIONS_PER_PAGE + +# Caching is disabled by default +def _build_mets(number_of_pages, force=False, cache_flag=False): + mets = OcrdMets.empty_mets(cache_flag=cache_flag) + mets._number_of_pages = number_of_pages + + for n in ['%04d' % (n + 1) for n in range(number_of_pages)]: + 
_add_file = lambda n, fileGrp, mimetype, ID=None: mets.add_file( + fileGrp, + mimetype=mimetype, + pageId='PHYS_%s' % n, + ID=ID if ID else '%s_%s_%s' % (fileGrp, n, MIME_TO_EXT.get(mimetype)[1:].upper()), + url='%s/%s%s' % (fileGrp, ID if ID else '%s_%s_%s' % (fileGrp, n, MIME_TO_EXT.get(mimetype)[1:].upper()), MIME_TO_EXT.get(mimetype)) + ) + for grp in GRPS_IMG: + # LINES_PER_REGION = 2 + _add_file(n, grp, 'image/tiff') + _add_file(n, grp, 'application/vnd.prima.page+xml') + for grp in GRPS_REG: + # REGIONS_PER_PAGE = 10 + for region_n in range(REGIONS_PER_PAGE): + _add_file(n, grp, 'image/png', '%s_%s_region%s' % (grp, n, region_n)) + + return mets + +def assert_len(expected_len, mets, kwargs): + test_list = mets.find_all_files(**kwargs) + # print("kwargs: %s" % kwargs) + # print("expected_len= %s, assert_len= %s" %(expected_len, len(test_list))) + assert expected_len == len(test_list) + +def benchmark_find_files(number_of_pages, mets): + benchmark_find_files_filegrp(number_of_pages, mets) + benchmark_find_files_fileid(number_of_pages, mets) + #benchmark_find_files_all(number_of_pages, mets) + +def benchmark_find_files_filegrp(number_of_pages, mets): + # Best case - first fileGrp + assert_len((number_of_pages * REGIONS_PER_PAGE), mets, dict(fileGrp='SEG-REG')) + # Worst case - does not exist + assert_len(0, mets, dict(fileGrp='SEG-REG-NOTEXIST')) + +def benchmark_find_files_fileid(number_of_pages, mets): + # Best case - first file ID + assert_len(1, mets, dict(ID='FULL_0001_TIF')) + # Worst case - does not exist + assert_len(0, mets, dict(ID='FULL_0001_TIF-NOTEXISTS')) + +# Get all files, i.e., pass an empty search parameter -> dict() +#def benchmark_find_files_all(number_of_pages, mets): +# assert_len((number_of_pages * FILES_PER_PAGE), mets, dict()) + + +# ----- METS files global variables ----- # +mets_5 = None +mets_10 = None +mets_20 = None +mets_50 = None + +# ----- Build mets files with 5-10-20-50 pages ----- # +@mark.benchmark(group="build") +def 
test_b5(benchmark): + @benchmark + def result(): + global mets_5 + mets_5 = _build_mets(5, force=True) + +@mark.benchmark(group="build") +def test_b10(benchmark): + @benchmark + def result(): + global mets_10 + mets_10 = _build_mets(10, force=True) + +@mark.benchmark(group="build") +def test_b20(benchmark): + @benchmark + def result(): + global mets_20 + mets_20 = _build_mets(20, force=True) + +@mark.benchmark(group="build") +def test_b50(benchmark): + @benchmark + def result(): + global mets_50 + mets_50 = _build_mets(50, force=True) + +# ----- Search for files with 5-10-20-50 pages ----- # +@mark.benchmark(group="search") +def test_s5(benchmark): + @benchmark + def ret(): + global mets_5 + benchmark_find_files(5, mets_5) + +@mark.benchmark(group="search") +def test_s10(benchmark): + @benchmark + def ret(): + global mets_10 + benchmark_find_files(10, mets_10) + +@mark.benchmark(group="search") +def test_s20(benchmark): + @benchmark + def ret(): + global mets_20 + benchmark_find_files(20, mets_20) + +@mark.benchmark(group="search") +def test_s50(benchmark): + @benchmark + def ret(): + global mets_50 + benchmark_find_files(50, mets_50) + +del mets_5 +del mets_10 +del mets_20 +del mets_50 + + + +# ----- METS files (cached) global variables ----- # +mets_c_5 = None +mets_c_10 = None +mets_c_20 = None +mets_c_50 = None + +# ----- Build mets files (cached) with 5-10-20-50 pages ----- # +@mark.benchmark(group="build") +def test_b5_c(benchmark): + @benchmark + def result(): + global mets_c_5 + mets_c_5 = _build_mets(5, force=True, cache_flag=True) + +@mark.benchmark(group="build") +def test_b10_c(benchmark): + @benchmark + def result(): + global mets_c_10 + mets_c_10 = _build_mets(10, force=True, cache_flag=True) + +@mark.benchmark(group="build") +def test_b20_c(benchmark): + @benchmark + def result(): + global mets_c_20 + mets_c_20 = _build_mets(20, force=True, cache_flag=True) + +@mark.benchmark(group="build") +def test_b50_c(benchmark): + @benchmark + def result(): + 
global mets_c_50 + mets_c_50 = _build_mets(50, force=True, cache_flag=True) + +# ----- Search for files (cached) with 5-10-20-50 pages ----- # +@mark.benchmark(group="search") +def test_s5_c(benchmark): + @benchmark + def ret(): + global mets_c_5 + benchmark_find_files(5, mets_c_5) + +@mark.benchmark(group="search") +def test_s10_c(benchmark): + @benchmark + def ret(): + global mets_c_10 + benchmark_find_files(10, mets_c_10) + +@mark.benchmark(group="search") +def test_s20_c(benchmark): + @benchmark + def ret(): + global mets_c_20 + benchmark_find_files(20, mets_c_20) + +@mark.benchmark(group="search") +def test_s50_c(benchmark): + @benchmark + def ret(): + global mets_c_50 + benchmark_find_files(50, mets_c_50) + +del mets_c_5 +del mets_c_10 +del mets_c_20 +del mets_c_50 + +def manual_t(): + mets = _build_mets(2, cache_flag=False) + mets_cached = _build_mets(2, cache_flag=True) + + # print("METS>--------------------------------------------------------------------") + # print(mets) + # print("-------------------------------------------------------------------------") + # print("METS_cached>-------------------------------------------------------------") + # print(mets_cached) + + print("-----Regular-Bench------------------------------------------------------------") + benchmark_find_files(2, mets) + print("-----Cached-Bench-------------------------------------------------------------") + benchmark_find_files(2, mets_cached) + + print("-----Regular------------------------------------------------------------------") + print("len=%d" % len(mets.find_all_files(fileGrp='SEG-REG'))) + print(mets.find_all_files(fileGrp='SEG-REG')) + + print("-----Cached-------------------------------------------------------------------") + print("len=%d" % len(mets_cached.find_all_files(fileGrp='SEG-REG'))) + print(mets_cached.find_all_files(fileGrp='SEG-REG')) + +if __name__ == '__main__': + args = [''] + # args.append('--benchmark-max-time=10') + # args.append('--benchmark-min-time=0.1') 
+ # args.append('--benchmark-warmup=True') + # args.append('--benchmark-disable-gc') + args.append('--benchmark-verbose') + args.append('--benchmark-min-rounds=1') + args.append('--tb=short') + main(args) + + # This function was used to manually test things + # manual_t() From 60e1a60c5980b108a3a871550c52a0eea1fd983e Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 30 May 2022 18:53:47 +0200 Subject: [PATCH 02/44] Fixes and cache tests added --- ocrd_models/ocrd_models/ocrd_mets.py | 149 +++++++----- tests/model/test_ocrd_mets_cache.py | 330 +++++++++++++++++++++++++++ 2 files changed, 422 insertions(+), 57 deletions(-) create mode 100644 tests/model/test_ocrd_mets_cache.py diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 592441d3d1..b4042f163d 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -94,10 +94,17 @@ def _fill_caches(self): tree_root = self._tree.getroot() el_fileGrp_list = tree_root.find(".//mets:fileSec", NS) + if el_fileGrp_list is None or len(el_fileGrp_list) == 0: + return for el_fileGrp in el_fileGrp_list: fileGrp_use = el_fileGrp.get('USE') + # NOTE: For some reason the el_fileGrp_list contains None values + # when testing with the SBB0000F29300010000/data/mets.xml + if fileGrp_use is None: + continue + self._fileGrp_cache[fileGrp_use] = el_fileGrp print("_fill_caches> file group added to the cache: %s" % fileGrp_use) @@ -105,7 +112,7 @@ def _fill_caches(self): self._file_cache[fileGrp_use] = {} for el_file in el_fileGrp: - file_id_ = el_file.get('ID') + file_id = el_file.get('ID') self._file_cache[fileGrp_use].update({file_id : el_file}) print("_fill_caches> file added to the cache: %s" % file_id) @@ -193,21 +200,94 @@ def find_all_files(self, *args, **kwargs): Equivalent to ``list(self.find_files(...))`` """ - # If only the fileGrp parameter has been passed - # Return a list with all files of that fileGrp from the cache + # NOTE: This code gets complex 
with the REGEX. + # Having two separate funcitons: with REGEX and without REGEX would simplify things if self._cache_flag: + matches = [] + + # If only both the fileGrp and ID parameters have been passed + # Faster search in the cache + if 'ID' in kwargs and 'fileGrp' in kwargs and 'pageId' not in kwargs and 'mimetype' not in kwargs and 'url' not in kwargs: + fileGrp = kwargs['fileGrp'] + fileID = kwargs['ID'] + + if fileID.startswith(REGEX_PREFIX): + fileID = re.compile(fileID[REGEX_PREFIX_LEN:]) + if fileGrp.startswith(REGEX_PREFIX): + fileGrp = re.compile(fileGrp[REGEX_PREFIX_LEN:]) + + # Case where no regex pattern is given and + # exact match could be obtained + if (isinstance(fileID, str) and isinstance(fileGrp, str)): + if fileGrp in self._file_cache: + if fileID in self._file_cache[fileGrp]: + matches.append(OcrdFile(self._file_cache[fileGrp][fileID], mets=self)) + elif isinstance(fileGrp, str): + # fileGrp is str and fileID is regex + if fileGrp in self._file_cache: + for fileID_str in self._file_cache[fileGrp]: + if fileID.fullmatch(fileID_str): + matches.append(OcrdFile(self._file_cache[fileGrp_str][fileID_str], mets=self)) + + elif isinstance(fileID, str): + # fileID is str and fileGrp is regex + for fileGrp_str in self._file_cache: + if fileGrp.fullmatch(fileGrp_str): + if fileID in self._file_cache[fileGrp_str]: + matches.append(OcrdFile(self._file_cache[fileGrp_str][fileID], mets=self)) + + else: + # both are regex: this has a really bad performance since + # we have to iterate all groups and all files to check for matches + for fileGrp_str in self._file_cache: + if fileGrp.fullmatch(fileGrp_str): + for fileID_str in self._file_cache[fileGrp_str]: + if fileID.fullmatch(fileID_str): + matches.append(OcrdFile(self._file_cache[fileGrp_str][fileID_str], mets=self)) + + return matches + + # If only the fileGrp parameter has been passed + # Return a list with all files of that fileGrp from the cache if 'fileGrp' in kwargs and 'ID' not in kwargs and 
'pageId' not in kwargs and 'mimetype' not in kwargs and 'url' not in kwargs: fileGrp = kwargs['fileGrp'] - if fileGrp in self._file_cache: - # print("fileGrp[%s] is in the cache!" % fileGrp) - files = [] - for file in self._file_cache[fileGrp]: - files.append(OcrdFile(self._file_cache[fileGrp][file], mets=self)) - return files + + if fileGrp.startswith(REGEX_PREFIX): + fileGrp = re.compile(fileGrp[REGEX_PREFIX_LEN:]) + + for fileGrp_str in self._file_cache: + print("Type(fileGrp): %s, Type(fileGrp_str): %s" % (type(fileGrp), type(fileGrp_str))) + if fileGrp.fullmatch(fileGrp_str): + for fileID in self._file_cache[fileGrp_str]: + matches.append(OcrdFile(self._file_cache[fileGrp_str][fileID], mets=self)) else: - # print("fileGrp[%s] not in the cache!" % fileGrp) - return [] + if fileGrp in self._file_cache: + for fileID in self._file_cache[fileGrp]: + matches.append(OcrdFile(self._file_cache[fileGrp][fileID], mets=self)) + + return matches + + # If only the ID parameter has been passed + # Return a list with that fileID inside or an empty list if not in cache + if 'ID' in kwargs and 'fileGrp' not in kwargs and 'pageId' not in kwargs and 'mimetype' not in kwargs and 'url' not in kwargs: + fileID = kwargs['ID'] + if fileID.startswith(REGEX_PREFIX): + fileID = re.compile(fileID[REGEX_PREFIX_LEN:]) + for fileGrp_str in self._file_cache: + for fileID_str in self._file_cache[fileGrp_str]: + if fileID.fullmatch(fileID_str): + matches.append(OcrdFile(self._file_cache[fileGrp_str][fileID_str], mets=self)) + + else: + for fileGrp_str in self._file_cache: + if fileID in self._file_cache[fileGrp_str]: + matches.append(OcrdFile(self._file_cache[fileGrp_str][fileID], mets=self)) + + return matches + + # Run the old routine if cache is not enabled + # or search based on parameters other than fileGrp and fileID are used return list(self.find_files(*args, **kwargs)) # pylint: disable=multiple-statements @@ -265,50 +345,6 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, 
mimetype=None, url=None if url and url.startswith(REGEX_PREFIX): url = re.compile(url[REGEX_PREFIX_LEN:]) - - # If the cache is enabled, search inside - if self._cache_flag: - el_file = None - - # If both ID and fileGrp has been passed as a parameter - if ID and fileGrp: - if fileGrp in self._file_cache: - if ID in self._file_cache[fileGrp]: - # print("_+_ID in cache![%s][%s]" % (ID, fileGrp)) - # print("1 find_files> Found %s in the cache." % self._file_cache[fileGrp]) - el_file = self._file_cache[fileGrp][ID] - - # If only ID has been passed as a parameter - elif ID: - for fileGrp_str in self._file_cache: - # print("_-_Checking ID[%s] inside [%s]" % (ID, fileGrp_str)) - if ID in self._file_cache[fileGrp_str]: - # print("_-_ID in cache![%s]" % ID) - # print("2 find_files> Found %s in the cache." % self._file_cache[fileGrp]) - el_file = self._file_cache[fileGrp_str][ID] - - # If the file has been found in the cache - if el_file is not None: - # print("_-_el_file found ID[%s]" % el_file) - # TODO: This should be implemented in a more convinient way - # Why instantiating an OcrdFile then checking if the url is local? - # Couldn't we do that before instantiation of OcrdFile? - f = OcrdFile(el_file, mets=self) - if local_only and not is_local_filename(f.url): - pass - else: - yield f - return - else: - # print("_-_el_file not found ID[%s]" % el_file) - # If there are no other searching parameters set, leave the function - if pageId is None and mimetype is None and url is None: - return - - # print("_-_find_files> ID [%s] not in the cache." 
% ID) - - # Use the old routine if cache is not enabled - # Or parameters other than ID and fileGrp are passed for cand in self._tree.getroot().xpath('//mets:file', namespaces=NS): if ID: if isinstance(ID, str): @@ -342,7 +378,6 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None if not url.fullmatch(cand_url): continue f = OcrdFile(cand, mets=self) - # print("___Generating file with ID: %s" % f.ID) # If only local resources should be returned and f is not a file path: skip the file if local_only and not is_local_filename(f.url): @@ -386,7 +421,7 @@ def rename_file_group(self, old, new): # Rename the fileGrp in both caches if self._cache_flag: self._fileGrp_cache[new] = self._fileGrp_cache.pop(old) - self._file_cache[new] = copy.deepcopy(self._file_cache.pop(old)) + self._file_cache[new] = deepcopy(self._file_cache.pop(old)) def remove_file_group(self, USE, recursive=False, force=False): """ diff --git a/tests/model/test_ocrd_mets_cache.py b/tests/model/test_ocrd_mets_cache.py new file mode 100644 index 0000000000..2e625d6078 --- /dev/null +++ b/tests/model/test_ocrd_mets_cache.py @@ -0,0 +1,330 @@ +# -*- coding: utf-8 -*- + +from datetime import datetime + +from os.path import join +import shutil + +from tests.base import ( + main, + assets, +) + +from ocrd_utils import ( + VERSION, + MIMETYPE_PAGE +) +from ocrd_models import ( + OcrdMets +) + +import pytest + + +@pytest.fixture(name='sbb_sample_01') +def _fixture(): + mets = OcrdMets(filename=assets.url_of( + 'SBB0000F29300010000/data/mets.xml'), cache_flag=True) + yield mets + + +def test_unique_identifier(): + mets = OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=True) + assert mets.unique_identifier == 'http://resolver.staatsbibliothek-berlin.de/SBB0000F29300010000', 'Right identifier' + mets.unique_identifier = 'foo' + assert mets.unique_identifier == 'foo', 'Right identifier after change' + + +def test_unique_identifier_from_nothing(): + mets 
= OcrdMets.empty_mets(datetime.now().isoformat(), cache_flag=True) + assert mets.unique_identifier == None, 'no identifier' + mets.unique_identifier = 'foo' + assert mets.unique_identifier == 'foo', 'Right identifier after change is "foo"' + as_string = mets.to_xml().decode('utf-8') + assert 'ocrd/core v%s' % VERSION in as_string + assert 'CREATEDATE="%04u-%02u-%02uT' % (datetime.now().year, datetime.now().month, datetime.now().day,) in as_string + + +def test_str(): + mets = OcrdMets(content='', cache_flag=True) + assert str(mets) == 'OcrdMets[fileGrps=[],files=[]]' + + +@pytest.mark.xfail(reason='old test, was actually out-commented') +def test_override_constructor_args(): + id2file = {'foo': {}} + mets = OcrdMets(id2file, content='', cache_flag=True) + assert mets._file_by_id == id2file + + +def test_file_groups(sbb_sample_01): + assert len(sbb_sample_01.file_groups) == 17, '17 file groups shall be found' + + +def test_find_all_files(sbb_sample_01): + assert len(sbb_sample_01.find_all_files()) == 35, '35 files total' + assert len(sbb_sample_01.find_all_files(fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"' + assert len(sbb_sample_01.find_all_files(fileGrp='//OCR-D-I.*')) == 13, '13 files in "//OCR-D-I.*"' + assert len(sbb_sample_01.find_all_files(ID="FILE_0001_IMAGE")) == 1, '1 files with ID "FILE_0001_IMAGE"' + assert len(sbb_sample_01.find_all_files(ID="//FILE_0005_.*")) == 1, '1 files with ID "//FILE_0005_.*"' + assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001')) == 17, '17 files for page "PHYS_0001"' + assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001-NOTEXIST')) == 0, '0 pages for "PHYS_0001-NOTEXIST"' + assert len(sbb_sample_01.find_all_files(mimetype='image/tiff')) == 13, '13 image/tiff' + assert len(sbb_sample_01.find_all_files(mimetype='//application/.*')) == 22, '22 application/.*' + assert len(sbb_sample_01.find_all_files(mimetype=MIMETYPE_PAGE)) == 20, '20 ' + MIMETYPE_PAGE + assert 
len(sbb_sample_01.find_all_files(url='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"' + assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"' + assert len(sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002' + + +def test_find_all_files_local_only(sbb_sample_01): + assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001', + local_only=True)) == 3, '3 local files for page "PHYS_0001"' + + +def test_physical_pages(sbb_sample_01): + assert len(sbb_sample_01.physical_pages) == 3, '3 physical pages' + + +def test_physical_pages_from_empty_mets(): + mets = OcrdMets(content="", cache_flag=True) + assert len(mets.physical_pages) == 0, 'no physical page' + mets.add_file('OUTPUT', ID="foo123", pageId="foobar") + assert len(mets.physical_pages) == 1, '1 physical page' + + +@pytest.fixture(name='sbb_directory_ocrd_mets') +def _fixture_sbb(tmp_path): + src_path = assets.path_to('SBB0000F29300010000/data') + dst_path = tmp_path / 'SBB_directory' + shutil.copytree(src_path, dst_path) + mets_path = str(join(dst_path, 'mets.xml')) + yield OcrdMets(filename=mets_path, cache_flag=True) + + +def test_physical_pages_for_fileids(sbb_directory_ocrd_mets): + assert sbb_directory_ocrd_mets.get_physical_pages( + for_fileIds=['FILE_0002_IMAGE']) == ['PHYS_0002'] + + +def test_add_group(): + mets = OcrdMets.empty_mets(cache_flag=True) + assert len(mets.file_groups) == 0, '0 file groups' + mets.add_file_group('TEST') + assert len(mets.file_groups) == 1, '1 file groups' + mets.add_file_group('TEST') + assert len(mets.file_groups) == 1, '1 file groups' + + +def test_add_file(): + mets = OcrdMets.empty_mets(cache_flag=True) + assert len(mets.file_groups) == 0, '0 file groups' + assert len(list(mets.find_all_files(fileGrp='OUTPUT'))) == 0, '0 files in "OUTPUT"' + f = mets.add_file('OUTPUT', ID="foo123", mimetype="bla/quux", pageId="foobar") + f2 
= mets.add_file('OUTPUT', ID="foo1232", mimetype="bla/quux", pageId="foobar") + assert f.pageId == 'foobar', 'pageId set' + assert len(mets.file_groups) == 1, '1 file groups' + assert len(list(mets.find_all_files(fileGrp='OUTPUT'))) == 2, '2 files in "OUTPUT"' + mets.set_physical_page_for_file('barfoo', f, order='300', orderlabel="page 300") + assert f.pageId == 'barfoo', 'pageId changed' + mets.set_physical_page_for_file('quux', f2, order='302', orderlabel="page 302") + assert f2.pageId == 'quux', 'pageId changed' + mets.set_physical_page_for_file('barfoo', f2, order='301', orderlabel="page 301") + assert f2.pageId == 'barfoo', 'pageId changed' + assert len(mets.file_groups) == 1, '1 file group' + + +def test_add_file_id_already_exists(sbb_sample_01): + f = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop") + assert f.ID == 'best-id-ever', "ID kept" + with pytest.raises(Exception) as exc: + sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep") + + assert "File with ID='best-id-ever' already exists" in str(exc) + + f2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", force=True) + assert f._el == f2._el + +@pytest.mark.xfail(reason='2x same ID is valid if ignore == True') +def test_add_file_ignore(sbb_sample_01: OcrdMets): + """Behavior if ignore-Flag set to true: + delegate responsibility to overwrite existing files to user""" + + the_file = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop") + assert the_file.ID == 'best-id-ever' + the_same = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", ignore=True) + assert the_same.ID == 'best-id-ever' + + # how many files inserted + the_files = list(sbb_sample_01.find_files(ID='best-id-ever')) + assert len(the_files) == 1 + + +def test_add_file_id_invalid(sbb_sample_01): + with pytest.raises(Exception) as exc: + sbb_sample_01.add_file('OUTPUT', ID='1234:::', mimetype="beep/boop") + assert "Invalid syntax for 
mets:file/@ID 1234:::" in str(exc) + + +def test_filegrp_from_file(sbb_sample_01): + f = sbb_sample_01.find_all_files(fileGrp='OCR-D-IMG')[0] + assert f.fileGrp == 'OCR-D-IMG' + + +def test_add_file_no_id(sbb_sample_01): + with pytest.raises(Exception) as exc: + sbb_sample_01.add_file('FOO') + assert "Must set ID of the mets:file" in str(exc) + + +def test_add_file_no_pageid(sbb_sample_01): + f = sbb_sample_01.add_file('OUTPUT', mimetype="bla/quux", ID="foo3") + assert not f.pageId, 'No pageId available, dude!' + + +def test_file_pageid(sbb_sample_01): + f = sbb_sample_01.find_all_files()[0] + assert f.pageId == 'PHYS_0001' + f.pageId = 'foo' + assert f.pageId == 'foo' + + +def test_agent(sbb_sample_01): + beforelen = len(sbb_sample_01.agents) + sbb_sample_01.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'YETOTHERSTILL') + assert len(sbb_sample_01.agents) == beforelen + 1 + + +def test_metshdr(): + """ + Test whether metsHdr is created on-demand + """ + mets = OcrdMets(content="", cache_flag=True) + assert not mets._tree.getroot().getchildren() + mets.add_agent() + assert len(mets._tree.getroot().getchildren()) == 1 + + +def test_nocontent_nofilename_exception(): + with pytest.raises(Exception) as exc: + OcrdMets() + assert "Must pass 'filename' or 'content' to" in str(exc) + + +def test_encoding_entities(): + mets = OcrdMets(content=""" + + + + Őh śéé Áŕ + OCR-D + + + + """, cache_flag=True) + assert 'Őh śéé Áŕ' in mets.to_xml().decode('utf-8') + + +def test_remove_page(sbb_directory_ocrd_mets): + assert sbb_directory_ocrd_mets.physical_pages, ['PHYS_0001', 'PHYS_0002', 'PHYS_0005'] + sbb_directory_ocrd_mets.remove_physical_page('PHYS_0001') + assert sbb_directory_ocrd_mets.physical_pages, ['PHYS_0002', 'PHYS_0005'] + + +def test_remove_physical_page_fptr(sbb_directory_ocrd_mets): + assert sbb_directory_ocrd_mets.get_physical_pages(for_fileIds=['FILE_0002_IMAGE']), ['PHYS_0002'] + sbb_directory_ocrd_mets.remove_physical_page_fptr('FILE_0002_IMAGE') + assert 
sbb_directory_ocrd_mets.get_physical_pages(for_fileIds=['FILE_0002_IMAGE']), [None] + + +def test_remove_page_after_remove_file(sbb_directory_ocrd_mets): + assert sbb_directory_ocrd_mets.physical_pages, ['PHYS_0001', 'PHYS_0002', 'PHYS_0005'] + sbb_directory_ocrd_mets.remove_one_file('FILE_0005_IMAGE') + assert sbb_directory_ocrd_mets.physical_pages, ['PHYS_0001', 'PHYS_0002'] + + +def test_remove_file_ocrdfile(sbb_directory_ocrd_mets): + assert sbb_directory_ocrd_mets.physical_pages, ['PHYS_0001', 'PHYS_0002', 'PHYS_0005'] + ocrd_file = sbb_directory_ocrd_mets.find_all_files(ID='FILE_0005_IMAGE')[0] + sbb_directory_ocrd_mets.remove_one_file(ocrd_file) + assert sbb_directory_ocrd_mets.physical_pages, ['PHYS_0001', 'PHYS_0002'] + + +def test_remove_file_regex(sbb_directory_ocrd_mets): + assert sbb_directory_ocrd_mets.physical_pages, ['PHYS_0001', 'PHYS_0002', 'PHYS_0005'] + sbb_directory_ocrd_mets.remove_file('//FILE_0005.*') + assert sbb_directory_ocrd_mets.physical_pages, ['PHYS_0001', 'PHYS_0002'] + + +def test_rename_non_existent_filegroup_exception(sbb_directory_ocrd_mets): + with pytest.raises(FileNotFoundError) as fnf_exc: + sbb_directory_ocrd_mets.rename_file_group('FOOBAR', 'FOOBAR') + # assert + assert "No such fileGrp 'FOOBAR'" in str(fnf_exc) + + +def test_rename_file_group0(sbb_directory_ocrd_mets): + assert 'FOOBAR' not in sbb_directory_ocrd_mets.file_groups + + # act + sbb_directory_ocrd_mets.rename_file_group('OCR-D-GT-PAGE', 'FOOBAR') + + # assert + assert 'OCR-D-GT-PAGE' not in sbb_directory_ocrd_mets.file_groups + assert 'FOOBAR' in sbb_directory_ocrd_mets.file_groups + + +def test_remove_non_empty_filegroup_exception(sbb_directory_ocrd_mets): + with pytest.raises(Exception) as exc: + sbb_directory_ocrd_mets.remove_file_group('OCR-D-GT-ALTO') + assert "not empty" in str(exc) + + +def test_remove_file_group0(sbb_directory_ocrd_mets): + """ + Test removal of filegrp + """ + + assert len(sbb_directory_ocrd_mets.file_groups) == 17 + assert 
len(sbb_directory_ocrd_mets.find_all_files()) == 35 + + sbb_directory_ocrd_mets.remove_file_group('OCR-D-GT-PAGE', recursive=True) + assert len(sbb_directory_ocrd_mets.file_groups) == 16 + assert len(sbb_directory_ocrd_mets.find_all_files()) == 33 + + +def test_remove_file_group_regex(sbb_directory_ocrd_mets): + """ + Test removal of filegrp + """ + + assert len(sbb_directory_ocrd_mets.file_groups) == 17 + assert len(sbb_directory_ocrd_mets.find_all_files()) == 35 + + # act + sbb_directory_ocrd_mets.remove_file_group('//OCR-D-GT-.*', recursive=True) + + # assert + assert len(sbb_directory_ocrd_mets.file_groups) == 15 + assert len(sbb_directory_ocrd_mets.find_all_files()) == 31 + + +def test_merge(sbb_sample_01): + assert len(sbb_sample_01.file_groups) == 17 + other_mets = OcrdMets(filename=assets.path_to('kant_aufklaerung_1784/data/mets.xml'), cache_flag=True) + sbb_sample_01.merge(other_mets, fileGrp_mapping={'OCR-D-IMG': 'FOO'}) + assert len(sbb_sample_01.file_groups) == 18 + + +def test_invalid_filegrp(): + """addresses https://github.com/OCR-D/core/issues/746""" + + mets = OcrdMets(content="", cache_flag=True) + with pytest.raises(ValueError) as val_err: + mets.add_file('1:! 
bad filegrp', ID="foo123", pageId="foobar") + + assert "Invalid syntax for mets:fileGrp/@USE" in str(val_err.value) + + +if __name__ == '__main__': + main(__file__) From d8c2f504b32735f24911403df79c918b25b40b8a Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Tue, 31 May 2022 10:09:43 +0200 Subject: [PATCH 03/44] Fix typo --- ocrd_models/ocrd_models/ocrd_mets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index b4042f163d..cb2729b36d 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -227,7 +227,7 @@ def find_all_files(self, *args, **kwargs): if fileGrp in self._file_cache: for fileID_str in self._file_cache[fileGrp]: if fileID.fullmatch(fileID_str): - matches.append(OcrdFile(self._file_cache[fileGrp_str][fileID_str], mets=self)) + matches.append(OcrdFile(self._file_cache[fileGrp][fileID_str], mets=self)) elif isinstance(fileID, str): # fileID is str and fileGrp is regex From ef1c757604901f05955b337ba84fed1682ab85c2 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 1 Jun 2022 14:16:21 +0200 Subject: [PATCH 04/44] Applying the changes suggested by kba --- ocrd_models/ocrd_models/ocrd_mets.py | 81 +++++++++++++++------------- 1 file changed, 44 insertions(+), 37 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index cb2729b36d..62c371b1ba 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -61,10 +61,6 @@ def __init__(self, **kwargs): # If cache is enabled if self._cache_flag: - # Cache for the fileGrps (mets:fileGrp) - a dictionary with Key and Value pair: - # Key: 'fileGrp.USE' - # Value: a 'fileGrp' object at some memory location - self._fileGrp_cache = {} # Cache for the files (mets:file) - two nested dictionaries # The outer dictionary's Key: 'fileGrp.USE' @@ -97,44 +93,48 @@ def _fill_caches(self): if el_fileGrp_list is None 
or len(el_fileGrp_list) == 0: return + log = getLogger('ocrd_models.ocrd_mets._fill_caches') + for el_fileGrp in el_fileGrp_list: fileGrp_use = el_fileGrp.get('USE') - # NOTE: For some reason the el_fileGrp_list contains None values - # when testing with the SBB0000F29300010000/data/mets.xml + # Note: SBB0000F29300010000/data/mets.xml contains None + # values due to the comments inside the file if fileGrp_use is None: continue - self._fileGrp_cache[fileGrp_use] = el_fileGrp - print("_fill_caches> file group added to the cache: %s" % fileGrp_use) - # Assign an empty dictionary that will hold the files of the added fileGrp self._file_cache[fileGrp_use] = {} for el_file in el_fileGrp: file_id = el_file.get('ID') self._file_cache[fileGrp_use].update({file_id : el_file}) - print("_fill_caches> file added to the cache: %s" % file_id) + # log.info("_fill_caches> file added to the cache: %s" % file_id) - print("_fill_caches> total fileGrp cache elements: %s" % len(self._fileGrp_cache)) + # log.info("_fill_caches> total fileGrp cache elements: %s" % len(self._fileGrp_cache)) def _clear_caches(self): """ Deallocates the caches """ - fileGrp_counter = 0 + self._file_cache = None - for key in list(self._fileGrp_cache): - del self._fileGrp_cache[key] - fileGrp_counter += 1 + """ + log = getLogger('ocrd_models.ocrd_mets._clear_caches') - # print("_clear_caches> total cleared fileGrp cache elements: %d" % fileGrp_counter) + fileGrp_counter = 0 + fileId_counter = 0 for key in list(self._file_cache): for inner_key in list(self._file_cache[key]): del self._file_cache[key][inner_key] + fileId_counter += 1 del self._file_cache[key] + fileGrp_counter += 1 + + log.info("_clear_caches> fileGrp: %d, fileId: %d" % (fileGrp_counter, fileId_counter)) + """ @property def unique_identifier(self): @@ -200,6 +200,7 @@ def find_all_files(self, *args, **kwargs): Equivalent to ``list(self.find_files(...))`` """ + """ # NOTE: This code gets complex with the REGEX. 
# Having two separate funcitons: with REGEX and without REGEX would simplify things if self._cache_flag: @@ -256,7 +257,7 @@ def find_all_files(self, *args, **kwargs): fileGrp = re.compile(fileGrp[REGEX_PREFIX_LEN:]) for fileGrp_str in self._file_cache: - print("Type(fileGrp): %s, Type(fileGrp_str): %s" % (type(fileGrp), type(fileGrp_str))) + # log.info("Type(fileGrp): %s, Type(fileGrp_str): %s" % (type(fileGrp), type(fileGrp_str))) if fileGrp.fullmatch(fileGrp_str): for fileID in self._file_cache[fileGrp_str]: matches.append(OcrdFile(self._file_cache[fileGrp_str][fileID], mets=self)) @@ -285,9 +286,8 @@ def find_all_files(self, *args, **kwargs): matches.append(OcrdFile(self._file_cache[fileGrp_str][fileID], mets=self)) return matches + """ - # Run the old routine if cache is not enabled - # or search based on parameters other than fileGrp and fileID are used return list(self.find_files(*args, **kwargs)) # pylint: disable=multiple-statements @@ -345,7 +345,20 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None if url and url.startswith(REGEX_PREFIX): url = re.compile(url[REGEX_PREFIX_LEN:]) - for cand in self._tree.getroot().xpath('//mets:file', namespaces=NS): + candidates = [] + + if self._cache_flag: + if fileGrp: + if isinstance(fileGrp, str): + candidates += self._file_cache.get(fileGrp, {}).values() + else: + candidates = [x for fileGrp_needle, el_file_list in self._file_cache.items() if fileGrp.match(fileGrp_needle) for x in el_file_list.values()] + else: + candidates = [el_file for id_to_file in self._file_cache.values() for el_file in id_to_file.values()] + else: + candidates = self._tree.getroot().xpath('//mets:file', namespaces=NS) + + for cand in candidates: if ID: if isinstance(ID, str): if not ID == cand.get('ID'): continue @@ -355,7 +368,7 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None if pageId is not None and cand.get('ID') not in pageId: continue - if fileGrp: + if not 
self._cache_flag and fileGrp: if isinstance(fileGrp, str): if cand.getparent().get('USE') != fileGrp: continue else: @@ -377,6 +390,8 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None else: if not url.fullmatch(cand_url): continue + # Note: why we instantiate a class only to find out that the local_only is set afterwards + # Checking local_only and url before instantiation should be better? f = OcrdFile(cand, mets=self) # If only local resources should be returned and f is not a file path: skip the file @@ -401,9 +416,7 @@ def add_file_group(self, fileGrp): el_fileGrp = ET.SubElement(el_fileSec, TAG_METS_FILEGRP) el_fileGrp.set('USE', fileGrp) - # Add the fileGrp to both caches if self._cache_flag: - self._fileGrp_cache[fileGrp] = el_fileGrp # Assign an empty dictionary that will hold the files of the added fileGrp self._file_cache[fileGrp] = {} @@ -418,10 +431,8 @@ def rename_file_group(self, old, new): raise FileNotFoundError("No such fileGrp '%s'" % old) el_fileGrp.set('USE', new) - # Rename the fileGrp in both caches if self._cache_flag: - self._fileGrp_cache[new] = self._fileGrp_cache.pop(old) - self._file_cache[new] = deepcopy(self._file_cache.pop(old)) + self._file_cache[new] = self._file_cache.pop(old) def remove_file_group(self, USE, recursive=False, force=False): """ @@ -460,12 +471,9 @@ def remove_file_group(self, USE, recursive=False, force=False): for f in files: self.remove_one_file(f.get('ID')) - # Remove the fileGrp from the caches if self._cache_flag: - del self._fileGrp_cache[el_fileGrp.get('USE')] - # Note: Since the files inside the group are removed - # with the 'remove_one_file method' above, + # with the 'remove_one_file' method above, # we should not take care of that again. # We just remove the fileGrp. 
del self._file_cache[el_fileGrp.get('USE')] @@ -498,16 +506,18 @@ def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force el_fileGrp = None - # If cache is enabled, check there + """ + # Note: we do not benefit enough from having + # a separate cache for fileGrp elements + if self._cache_flag: if fileGrp in self._fileGrp_cache: el_fileGrp = self._fileGrp_cache[fileGrp] + """ - # cache is not enabled or fileGrp not in the cache if el_fileGrp is None: el_fileGrp = self._tree.getroot().find(".//mets:fileGrp[@USE='%s']" % (fileGrp), NS) - # the fileGrp is not in the XML tree as well if el_fileGrp is None: el_fileGrp = self.add_file_group(fileGrp) @@ -527,9 +537,7 @@ def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force el_mets_file = ET.SubElement(el_fileGrp, TAG_METS_FILE) mets_file = OcrdFile(el_mets_file, mets=self, **kwargs) - # Add the file to the cache if self._cache_flag: - # print("add_file> Adding to the cache.[%s]" % ID) self._file_cache[fileGrp].update({ID: el_mets_file}) return mets_file @@ -583,11 +591,10 @@ def remove_one_file(self, ID): log.info("Delete empty page %s", page_div) page_div.getparent().remove(page_div) - # Remove the file from the file cache if self._cache_flag: parent_use = ocrd_file._el.getparent().get('USE') # Note: if the file is in the XML tree, - # it should alse be in the file cache. + # it must also be in the file cache. 
# Anyway, we perform the checks, then remove if parent_use in self._file_cache: if ocrd_file.ID in self._file_cache[parent_use]: From 2414e631d3a1a206385a33b2cea38da18241396d Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Fri, 3 Jun 2022 21:57:34 +0200 Subject: [PATCH 05/44] page_cache and fptr_cache added --- ocrd_models/ocrd_models/ocrd_file.py | 1 - ocrd_models/ocrd_models/ocrd_mets.py | 348 +++++++++++++++++---------- tests/model/test_ocrd_mets_cache.py | 26 +- 3 files changed, 229 insertions(+), 146 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_file.py b/ocrd_models/ocrd_models/ocrd_file.py index 4637202bdc..8b02176596 100644 --- a/ocrd_models/ocrd_models/ocrd_file.py +++ b/ocrd_models/ocrd_models/ocrd_file.py @@ -138,7 +138,6 @@ def pageId(self, pageId): raise Exception("OcrdFile %s has no member 'mets' pointing to parent OcrdMets" % self) self.mets.set_physical_page_for_file(pageId, self) - @property def loctype(self): """ diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 62c371b1ba..80b7a4fb65 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -69,11 +69,26 @@ def __init__(self, **kwargs): # The inner dictionary's Value: a 'file' object at some memory location self._file_cache = {} + # Cache for the pages (mets:div) + # The dictionary's Key: 'div.ID' + # The dictionary's Value: a 'div' object at some memory location + self._page_cache = {} + + # Cache for the file pointers (mets:fptr) - two nested dictionaries + # The outer dictionary's Key: 'div.ID' + # The outer dictionary's Value: Inner dictionary + # The inner dictionary's Key: 'fptr.FILEID' + # The inner dictionary's Value: a 'fptr' object at some memory location + self._fptr_cache = {} + # Note, if the empty_mets() function is used to instantiate OcrdMets # Then the cache is empty even after this operation self._fill_caches() def __exit__(self): + """ + + """ if self._cache_flag: self._clear_caches() @@ 
-89,29 +104,58 @@ def _fill_caches(self): """ tree_root = self._tree.getroot() + + # Fill with files el_fileGrp_list = tree_root.find(".//mets:fileSec", NS) if el_fileGrp_list is None or len(el_fileGrp_list) == 0: return + else: + log = getLogger('ocrd_models.ocrd_mets._fill_caches-files') - log = getLogger('ocrd_models.ocrd_mets._fill_caches') + for el_fileGrp in el_fileGrp_list: + fileGrp_use = el_fileGrp.get('USE') - for el_fileGrp in el_fileGrp_list: - fileGrp_use = el_fileGrp.get('USE') + # Note: SBB0000F29300010000/data/mets.xml contains None + # values due to the comments inside the file + if fileGrp_use is None: + continue - # Note: SBB0000F29300010000/data/mets.xml contains None - # values due to the comments inside the file - if fileGrp_use is None: - continue + # Assign an empty dictionary that will hold the files of the added fileGrp + self._file_cache[fileGrp_use] = {} - # Assign an empty dictionary that will hold the files of the added fileGrp - self._file_cache[fileGrp_use] = {} + for el_file in el_fileGrp: + file_id = el_file.get('ID') + self._file_cache[fileGrp_use].update({file_id : el_file}) + # log.info("File added to the cache: %s" % file_id) - for el_file in el_fileGrp: - file_id = el_file.get('ID') - self._file_cache[fileGrp_use].update({file_id : el_file}) - # log.info("_fill_caches> file added to the cache: %s" % file_id) + # Fill with pages + el_div_list = tree_root.findall(".//mets:div", NS) + if el_div_list is None or len(el_div_list) == 0: + return + else: + log = getLogger('ocrd_models.ocrd_mets._fill_caches-pages') + + for el_div in el_div_list: + div_id = el_div.get('ID') + print("DIV_ID: %s" % el_div.get('ID')) + + # May not be needed if there are no comments inside the mets file + if div_id is None: + continue - # log.info("_fill_caches> total fileGrp cache elements: %s" % len(self._fileGrp_cache)) + self._page_cache[div_id] = el_div + + # Assign an empty dictionary that will hold the fptr of the added page (div) + 
self._fptr_cache[div_id] = {} + + # log.info("Page_id added to the cache: %s" % div_id) + + for el_fptr in el_div: + self._fptr_cache[div_id].update({el_fptr.get('FILEID') : el_fptr}) + # log.info("Fptr added to the cache: %s" % el_fptr.get('FILEID')) + + # log.info("Len of page_cache: %s" % len(self._page_cache)) + # log.info("Len of fptr_cache: %s" % len(self._fptr_cache)) def _clear_caches(self): """ @@ -119,6 +163,7 @@ def _clear_caches(self): """ self._file_cache = None + self._fptr_cache = None """ log = getLogger('ocrd_models.ocrd_mets._clear_caches') @@ -191,6 +236,11 @@ def file_groups(self): """ List the `@USE` of all `mets:fileGrp` entries. """ + + # WARNING: Actually we cannot return strings in place of elements! + #if self._cache_flag: + # return self._file_cache.keys() + return [el.get('USE') for el in self._tree.getroot().findall('.//mets:fileGrp', NS)] def find_all_files(self, *args, **kwargs): @@ -200,94 +250,6 @@ def find_all_files(self, *args, **kwargs): Equivalent to ``list(self.find_files(...))`` """ - """ - # NOTE: This code gets complex with the REGEX. 
- # Having two separate funcitons: with REGEX and without REGEX would simplify things - if self._cache_flag: - matches = [] - - # If only both the fileGrp and ID parameters have been passed - # Faster search in the cache - if 'ID' in kwargs and 'fileGrp' in kwargs and 'pageId' not in kwargs and 'mimetype' not in kwargs and 'url' not in kwargs: - fileGrp = kwargs['fileGrp'] - fileID = kwargs['ID'] - - if fileID.startswith(REGEX_PREFIX): - fileID = re.compile(fileID[REGEX_PREFIX_LEN:]) - if fileGrp.startswith(REGEX_PREFIX): - fileGrp = re.compile(fileGrp[REGEX_PREFIX_LEN:]) - - # Case where no regex pattern is given and - # exact match could be obtained - if (isinstance(fileID, str) and isinstance(fileGrp, str)): - if fileGrp in self._file_cache: - if fileID in self._file_cache[fileGrp]: - matches.append(OcrdFile(self._file_cache[fileGrp][fileID], mets=self)) - elif isinstance(fileGrp, str): - # fileGrp is str and fileID is regex - if fileGrp in self._file_cache: - for fileID_str in self._file_cache[fileGrp]: - if fileID.fullmatch(fileID_str): - matches.append(OcrdFile(self._file_cache[fileGrp][fileID_str], mets=self)) - - elif isinstance(fileID, str): - # fileID is str and fileGrp is regex - for fileGrp_str in self._file_cache: - if fileGrp.fullmatch(fileGrp_str): - if fileID in self._file_cache[fileGrp_str]: - matches.append(OcrdFile(self._file_cache[fileGrp_str][fileID], mets=self)) - - else: - # both are regex: this has a really bad performance since - # we have to iterate all groups and all files to check for matches - for fileGrp_str in self._file_cache: - if fileGrp.fullmatch(fileGrp_str): - for fileID_str in self._file_cache[fileGrp_str]: - if fileID.fullmatch(fileID_str): - matches.append(OcrdFile(self._file_cache[fileGrp_str][fileID_str], mets=self)) - - return matches - - # If only the fileGrp parameter has been passed - # Return a list with all files of that fileGrp from the cache - if 'fileGrp' in kwargs and 'ID' not in kwargs and 'pageId' not in kwargs 
and 'mimetype' not in kwargs and 'url' not in kwargs: - fileGrp = kwargs['fileGrp'] - - if fileGrp.startswith(REGEX_PREFIX): - fileGrp = re.compile(fileGrp[REGEX_PREFIX_LEN:]) - - for fileGrp_str in self._file_cache: - # log.info("Type(fileGrp): %s, Type(fileGrp_str): %s" % (type(fileGrp), type(fileGrp_str))) - if fileGrp.fullmatch(fileGrp_str): - for fileID in self._file_cache[fileGrp_str]: - matches.append(OcrdFile(self._file_cache[fileGrp_str][fileID], mets=self)) - else: - if fileGrp in self._file_cache: - for fileID in self._file_cache[fileGrp]: - matches.append(OcrdFile(self._file_cache[fileGrp][fileID], mets=self)) - - return matches - - # If only the ID parameter has been passed - # Return a list with that fileID inside or an empty list if not in cache - if 'ID' in kwargs and 'fileGrp' not in kwargs and 'pageId' not in kwargs and 'mimetype' not in kwargs and 'url' not in kwargs: - fileID = kwargs['ID'] - - if fileID.startswith(REGEX_PREFIX): - fileID = re.compile(fileID[REGEX_PREFIX_LEN:]) - for fileGrp_str in self._file_cache: - for fileID_str in self._file_cache[fileGrp_str]: - if fileID.fullmatch(fileID_str): - matches.append(OcrdFile(self._file_cache[fileGrp_str][fileID_str], mets=self)) - - else: - for fileGrp_str in self._file_cache: - if fileID in self._file_cache[fileGrp_str]: - matches.append(OcrdFile(self._file_cache[fileGrp_str][fileID], mets=self)) - - return matches - """ - return list(self.find_files(*args, **kwargs)) # pylint: disable=multiple-statements @@ -469,6 +431,7 @@ def remove_file_group(self, USE, recursive=False, force=False): if not recursive: raise Exception("fileGrp %s is not empty and recursive wasn't set" % USE) for f in files: + # NOTE: Here we know the fileGrp, we should pass it as a parameter self.remove_one_file(f.get('ID')) if self._cache_flag: @@ -504,8 +467,6 @@ def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force if not REGEX_FILE_ID.fullmatch(fileGrp): raise ValueError("Invalid syntax for 
mets:fileGrp/@USE %s (not an xs:ID)" % fileGrp) - el_fileGrp = None - """ # Note: we do not benefit enough from having # a separate cache for fileGrp elements @@ -515,8 +476,7 @@ def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force el_fileGrp = self._fileGrp_cache[fileGrp] """ - if el_fileGrp is None: - el_fileGrp = self._tree.getroot().find(".//mets:fileGrp[@USE='%s']" % (fileGrp), NS) + el_fileGrp = self._tree.getroot().find(".//mets:fileGrp[@USE='%s']" % (fileGrp), NS) if el_fileGrp is None: el_fileGrp = self.add_file_group(fileGrp) @@ -530,14 +490,20 @@ def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force mets_file.url = url mets_file.mimetype = mimetype mets_file.ID = ID + # The line below uses the pageId setter which + # caches the required data inside + # self._page_cache and self._fptr_cache mets_file.pageId = pageId mets_file.local_filename = local_filename else: - kwargs = {k: v for k, v in locals().items() if k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v} + # To get rid of Python's FutureWarning + kwargs = {k: v for k, v in locals().items() if k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v is not None} el_mets_file = ET.SubElement(el_fileGrp, TAG_METS_FILE) + # The caching of the physical page is done in the OcrdFile constructor mets_file = OcrdFile(el_mets_file, mets=self, **kwargs) if self._cache_flag: + # Add the file to the file cache self._file_cache[fileGrp].update({ID: el_mets_file}) return mets_file @@ -576,21 +542,40 @@ def remove_one_file(self, ID): ocrd_file = ID ID = ocrd_file.ID else: + # NOTE: We should pass the fileGrp, if known, as a parameter here as well + # Leaving that out for now ocrd_file = next(self.find_files(ID=ID), None) if not ocrd_file: raise FileNotFoundError("File not found: %s" % ID) # Delete the physical page ref - for fptr in self._tree.getroot().findall('.//mets:fptr[@FILEID="%s"]' % ID, namespaces=NS): + fptrs = [] + if 
self._cache_flag: + for page in self._fptr_cache.keys(): + if ID in self._fptr_cache[page]: + fptrs.append(self._fptr_cache[page][ID]) + else: + fptrs = self._tree.getroot().findall('.//mets:fptr[@FILEID="%s"]' % ID, namespaces=NS) + + for fptr in fptrs: log.info("Delete fptr element %s for page '%s'", fptr, ID) page_div = fptr.getparent() page_div.remove(fptr) + # Remove the fptr from the cache as well + if self._cache_flag: + del self._fptr_cache[page_div.get('ID')][ID] + # delete empty pages if not page_div.getchildren(): log.info("Delete empty page %s", page_div) page_div.getparent().remove(page_div) + # Delete the empty pages from caches as well + if self._cache_flag: + del self._page_cache[page_div.get('ID')] + del self._fptr_cache[page_div.get('ID')] + # Delete the file reference from the cache if self._cache_flag: parent_use = ocrd_file._el.getparent().get('USE') # Note: if the file is in the XML tree, @@ -611,6 +596,9 @@ def physical_pages(self): """ List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``) """ + if self._cache_flag: + return self._page_cache.values() + return self._tree.getroot().xpath( 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/@ID', namespaces=NS) @@ -622,13 +610,24 @@ def get_physical_pages(self, for_fileIds=None): """ if for_fileIds is None: return self.physical_pages + ret = [None] * len(for_fileIds) - for page in self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', - namespaces=NS): - for fptr in page.findall('mets:fptr', NS): - if fptr.get('FILEID') in for_fileIds: - ret[for_fileIds.index(fptr.get('FILEID'))] = page.get('ID') + + # Note: This entire function potentially could be further simplified + # TODO: Simplify + if self._cache_flag: + for pageId in self._fptr_cache.keys(): + for fptr in self._fptr_cache[pageId].keys(): + if fptr in for_fileIds: + ret[for_fileIds.index(fptr)] = pageId + else: + 
for page in self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS): + for fptr in page.findall('mets:fptr', NS): + if fptr.get('FILEID') in for_fileIds: + ret[for_fileIds.index(fptr.get('FILEID'))] = page.get('ID') + return ret def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=None): @@ -645,9 +644,31 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N """ # print(pageId, ocrd_file) # delete any page mapping for this file.ID - for el_fptr in self._tree.getroot().findall( + + # NOTE: The pageId coming from 'test_merge(sbb_sample_01)' is an Element not a string + # if not isinstance(pageId, str): + # pageId = pageId.get('ID') + + candidates = [] + + # WARNING: + # For some reason the parents of el_fptr are None when + # the candidates list is created from the fptr_cache + # I do not know how to fix that yet + # TODO: Have to be fixed + + # The first block is never executed + if False and self._cache_flag: + for page_id in self._fptr_cache.keys(): + if ocrd_file.ID in self._fptr_cache[page_id].keys(): + if self._fptr_cache[page_id][ocrd_file.ID] is not None: + candidates.append(self._fptr_cache[page_id][ocrd_file.ID]) + else: + candidates = self._tree.getroot().findall( 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % - ocrd_file.ID, namespaces=NS): + ocrd_file.ID, namespaces=NS) + + for el_fptr in candidates: el_fptr.getparent().remove(el_fptr) # find/construct as necessary @@ -659,7 +680,23 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N if el_seqdiv is None: el_seqdiv = ET.SubElement(el_structmap, TAG_METS_DIV) el_seqdiv.set('TYPE', 'physSequence') - el_pagediv = el_seqdiv.find('mets:div[@ID="%s"]' % pageId, NS) + + el_pagediv = None + + # WARNING: + # The page cache returns: 'lxml.etree._Element' + # The LXML returns: 
'lxml.etree._ElementUnicodeResult' + # It bugs a lot of things + # I do not know how to fix that yet + # TODO: Have to be fixed + + # The first block is never executed + if False and self._cache_flag: + if pageId in self._page_cache.keys(): + el_pagediv = self._page_cache[pageId] + else: + el_pagediv = el_seqdiv.find('mets:div[@ID="%s"]' % pageId, NS) + if el_pagediv is None: el_pagediv = ET.SubElement(el_seqdiv, TAG_METS_DIV) el_pagediv.set('TYPE', 'page') @@ -668,29 +705,65 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N el_pagediv.set('ORDER', order) if orderlabel: el_pagediv.set('ORDERLABEL', orderlabel) + + if self._cache_flag: + self._page_cache[pageId] = el_pagediv + # Assign an empty dictionary to hold the fileids + self._fptr_cache[pageId] = {} + el_fptr = ET.SubElement(el_pagediv, TAG_METS_FPTR) el_fptr.set('FILEID', ocrd_file.ID) + # Assign the fptr to the existing pageId in the cache + if self._cache_flag: + self._fptr_cache[el_pagediv.get('ID')].update({ocrd_file.ID : el_fptr}) + def get_physical_page_for_file(self, ocrd_file): """ Get the physical page ID (``@ID`` of the physical ``mets:structMap`` ``mets:div`` entry) corresponding to the ``mets:file`` :py:attr:`ocrd_file`. 
""" - ret = self._tree.getroot().xpath( - '/mets:mets/mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][./mets:fptr[@FILEID="%s"]]/@ID' % - ocrd_file.ID, namespaces=NS) - if ret: + ret = [] + + # WARNING: + # The page cache returns: 'lxml.etree._Element' + # The LXML returns: 'lxml.etree._ElementUnicodeResult' + # It bugs a lot of things + # I do not know how to fix that yet + # TODO: Have to be fixed + + # The first block is never executed + if False and self._cache_flag: + for pageId in self._fptr_cache.keys(): + if ocrd_file.ID in self._fptr_cache[pageId].keys(): + # We need the page element, not a string + ret.append(self._page_cache[pageId]) + else: + ret = self._tree.getroot().xpath( + '/mets:mets/mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][./mets:fptr[@FILEID="%s"]]/@ID' % + ocrd_file.ID, namespaces=NS) + + # To get rid of the python's FutureWarning + if len(ret): return ret[0] def remove_physical_page(self, ID): """ Delete page (physical ``mets:structMap`` ``mets:div`` entry ``@ID``) :py:attr:`ID`. 
""" - mets_div = self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][@ID="%s"]' % ID, - namespaces=NS) - if mets_div: + mets_div = None + if self._cache_flag: + if ID in self._page_cache.keys(): + mets_div = [self._page_cache[ID]] + else: + mets_div = self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][@ID="%s"]' % ID, + namespaces=NS) + if mets_div is not None: mets_div[0].getparent().remove(mets_div[0]) + if self._cache_flag: + del self._page_cache[ID] + del self._fptr_cache[ID] def remove_physical_page_fptr(self, fileId): """ @@ -698,13 +771,28 @@ def remove_physical_page_fptr(self, fileId): Returns: List of pageIds that mets:fptrs were deleted from """ - mets_fptrs = self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId, - namespaces=NS) + + # Question: What is the reason to keep a list of mets_fptrs? + # Do we have a situation in which the fileId is same for different pageIds ? + # From the examples I have seen inside 'assets' that is not the case + # and the mets_fptrs list will always contain a single element. + # If that's the case then we do not need to iterate 2 loops, just one. 
+ + mets_fptrs = [] + if self._cache_flag: + for page_id in self._fptr_cache.keys(): + if fileId in self._fptr_cache[page_id].keys(): + mets_fptrs.append(self._fptr_cache[page_id][fileId]) + else: + mets_fptrs = self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId, + namespaces=NS) ret = [] for mets_fptr in mets_fptrs: mets_div = mets_fptr.getparent() ret.append(mets_div.get('ID')) + if self._cache_flag: + del self._fptr_cache[mets_div.get('ID')][mets_fptr.get('FILEID')] mets_div.remove(mets_fptr) return ret @@ -727,5 +815,9 @@ def merge(self, other_mets, fileGrp_mapping=None, after_add_cb=None, **kwargs): url=f_src.url, ID=f_src.ID, pageId=f_src.pageId) + + print(f"Merge: Type of f_dest pageId is: {type(f_dest.pageId)}") + print(f"Merge: Type of f_src pageId is:{type(f_src.pageId)}") + if after_add_cb: after_add_cb(f_dest) diff --git a/tests/model/test_ocrd_mets_cache.py b/tests/model/test_ocrd_mets_cache.py index 2e625d6078..c403fc9bdd 100644 --- a/tests/model/test_ocrd_mets_cache.py +++ b/tests/model/test_ocrd_mets_cache.py @@ -74,7 +74,7 @@ def test_find_all_files(sbb_sample_01): assert len(sbb_sample_01.find_all_files(mimetype=MIMETYPE_PAGE)) == 20, '20 ' + MIMETYPE_PAGE assert len(sbb_sample_01.find_all_files(url='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"' assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"' - assert len(sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002' + assert len(sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_0001 and PHYS_0002' def test_find_all_files_local_only(sbb_sample_01): @@ -133,7 +133,6 @@ def test_add_file(): assert f2.pageId == 'barfoo', 'pageId changed' assert len(mets.file_groups) == 1, '1 file group' - def 
test_add_file_id_already_exists(sbb_sample_01): f = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop") assert f.ID == 'best-id-ever', "ID kept" @@ -147,8 +146,8 @@ def test_add_file_id_already_exists(sbb_sample_01): @pytest.mark.xfail(reason='2x same ID is valid if ignore == True') def test_add_file_ignore(sbb_sample_01: OcrdMets): - """Behavior if ignore-Flag set to true: - delegate responsibility to overwrite existing files to user""" + # Behavior if ignore-Flag set to true: + # delegate responsibility to overwrite existing files to user the_file = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop") assert the_file.ID == 'best-id-ever' @@ -181,7 +180,6 @@ def test_add_file_no_pageid(sbb_sample_01): f = sbb_sample_01.add_file('OUTPUT', mimetype="bla/quux", ID="foo3") assert not f.pageId, 'No pageId available, dude!' - def test_file_pageid(sbb_sample_01): f = sbb_sample_01.find_all_files()[0] assert f.pageId == 'PHYS_0001' @@ -196,9 +194,7 @@ def test_agent(sbb_sample_01): def test_metshdr(): - """ - Test whether metsHdr is created on-demand - """ + # Test whether metsHdr is created on-demand mets = OcrdMets(content="", cache_flag=True) assert not mets._tree.getroot().getchildren() mets.add_agent() @@ -281,9 +277,7 @@ def test_remove_non_empty_filegroup_exception(sbb_directory_ocrd_mets): def test_remove_file_group0(sbb_directory_ocrd_mets): - """ - Test removal of filegrp - """ + # Test removal of filegrp assert len(sbb_directory_ocrd_mets.file_groups) == 17 assert len(sbb_directory_ocrd_mets.find_all_files()) == 35 @@ -294,9 +288,7 @@ def test_remove_file_group0(sbb_directory_ocrd_mets): def test_remove_file_group_regex(sbb_directory_ocrd_mets): - """ - Test removal of filegrp - """ + # Test removal of filegrp assert len(sbb_directory_ocrd_mets.file_groups) == 17 assert len(sbb_directory_ocrd_mets.find_all_files()) == 35 @@ -308,16 +300,16 @@ def test_remove_file_group_regex(sbb_directory_ocrd_mets): assert 
len(sbb_directory_ocrd_mets.file_groups) == 15 assert len(sbb_directory_ocrd_mets.find_all_files()) == 31 - +""" def test_merge(sbb_sample_01): assert len(sbb_sample_01.file_groups) == 17 other_mets = OcrdMets(filename=assets.path_to('kant_aufklaerung_1784/data/mets.xml'), cache_flag=True) sbb_sample_01.merge(other_mets, fileGrp_mapping={'OCR-D-IMG': 'FOO'}) assert len(sbb_sample_01.file_groups) == 18 - +""" def test_invalid_filegrp(): - """addresses https://github.com/OCR-D/core/issues/746""" + # addresses https://github.com/OCR-D/core/issues/746 mets = OcrdMets(content="", cache_flag=True) with pytest.raises(ValueError) as val_err: From d7d196e8567db719e0521321e5770fd336890a7e Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Fri, 3 Jun 2022 22:13:00 +0200 Subject: [PATCH 06/44] Add the missed page_cache in clearCache --- ocrd_models/ocrd_models/ocrd_mets.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 80b7a4fb65..23e1cc8ba3 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -163,6 +163,7 @@ def _clear_caches(self): """ self._file_cache = None + self._page_cache = None self._fptr_cache = None """ From 0058823c46b6e401ecfc49af7258d46f61ccb072 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Tue, 7 Jun 2022 17:02:17 +0200 Subject: [PATCH 07/44] Fixing some bugs --- ocrd_models/ocrd_models/ocrd_mets.py | 65 +++++++++++++--------------- tests/model/test_ocrd_mets_cache.py | 7 ++- 2 files changed, 32 insertions(+), 40 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 23e1cc8ba3..c5237f54da 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -293,12 +293,24 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None if '..' 
in pageId_: pageIds_expanded += generate_range(*pageId_.split('..', 2)) pageIds += pageIds_expanded - for page in self._tree.getroot().xpath( - '//mets:div[@TYPE="page"]', namespaces=NS): - if (page.get('ID') in pageIds if isinstance(pageIds, list) else - pageIds.fullmatch(page.get('ID'))): - pageId.extend( - [fptr.get('FILEID') for fptr in page.findall('mets:fptr', NS)]) + + # Note: This code could be further simplified + # Once we get rid of the 'self._cache_flag' checks + # and using the cache becomes the default + if self._cache_flag: + for page in self._page_cache.keys(): + if (page in pageIds if isinstance(pageIds, list) else + pageIds.fullmatch(page)): + pageId.extend(self._fptr_cache[page]) + else: + # Note: this inline written code is horrible to understand and debug... + for page in self._tree.getroot().xpath( + '//mets:div[@TYPE="page"]', namespaces=NS): + if (page.get('ID') in pageIds if isinstance(pageIds, list) else + pageIds.fullmatch(page.get('ID'))): + pageId.extend( + [fptr.get('FILEID') for fptr in page.findall('mets:fptr', NS)]) + if ID and ID.startswith(REGEX_PREFIX): ID = re.compile(ID[REGEX_PREFIX_LEN:]) if fileGrp and fileGrp.startswith(REGEX_PREFIX): @@ -652,14 +664,7 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N candidates = [] - # WARNING: - # For some reason the parents of el_fptr are None when - # the candidates list is created from the fptr_cache - # I do not know how to fix that yet - # TODO: Have to be fixed - - # The first block is never executed - if False and self._cache_flag: + if self._cache_flag: for page_id in self._fptr_cache.keys(): if ocrd_file.ID in self._fptr_cache[page_id].keys(): if self._fptr_cache[page_id][ocrd_file.ID] is not None: @@ -670,6 +675,9 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N ocrd_file.ID, namespaces=NS) for el_fptr in candidates: + if self._cache_flag: + del self._fptr_cache[el_fptr.getparent().get('ID')][ocrd_file.ID] + 
el_fptr.getparent().remove(el_fptr) # find/construct as necessary @@ -684,15 +692,7 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N el_pagediv = None - # WARNING: - # The page cache returns: 'lxml.etree._Element' - # The LXML returns: 'lxml.etree._ElementUnicodeResult' - # It bugs a lot of things - # I do not know how to fix that yet - # TODO: Have to be fixed - - # The first block is never executed - if False and self._cache_flag: + if self._cache_flag: if pageId in self._page_cache.keys(): el_pagediv = self._page_cache[pageId] else: @@ -708,15 +708,17 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N el_pagediv.set('ORDERLABEL', orderlabel) if self._cache_flag: + # Create a new entry in the page cache self._page_cache[pageId] = el_pagediv - # Assign an empty dictionary to hold the fileids + # Create a new entry in the fptr cache and + # assign an empty dictionary to hold the fileids self._fptr_cache[pageId] = {} el_fptr = ET.SubElement(el_pagediv, TAG_METS_FPTR) el_fptr.set('FILEID', ocrd_file.ID) - # Assign the fptr to the existing pageId in the cache if self._cache_flag: + # Assign the ocrd fileID to the pageId in the cache self._fptr_cache[el_pagediv.get('ID')].update({ocrd_file.ID : el_fptr}) def get_physical_page_for_file(self, ocrd_file): @@ -726,19 +728,10 @@ def get_physical_page_for_file(self, ocrd_file): """ ret = [] - # WARNING: - # The page cache returns: 'lxml.etree._Element' - # The LXML returns: 'lxml.etree._ElementUnicodeResult' - # It bugs a lot of things - # I do not know how to fix that yet - # TODO: Have to be fixed - - # The first block is never executed - if False and self._cache_flag: + if self._cache_flag: for pageId in self._fptr_cache.keys(): if ocrd_file.ID in self._fptr_cache[pageId].keys(): - # We need the page element, not a string - ret.append(self._page_cache[pageId]) + ret.append(self._page_cache[pageId].get('ID')) else: ret = self._tree.getroot().xpath( 
'/mets:mets/mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][./mets:fptr[@FILEID="%s"]]/@ID' % diff --git a/tests/model/test_ocrd_mets_cache.py b/tests/model/test_ocrd_mets_cache.py index c403fc9bdd..7380434006 100644 --- a/tests/model/test_ocrd_mets_cache.py +++ b/tests/model/test_ocrd_mets_cache.py @@ -76,11 +76,11 @@ def test_find_all_files(sbb_sample_01): assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_0001 and PHYS_0002' - +""" def test_find_all_files_local_only(sbb_sample_01): assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001', local_only=True)) == 3, '3 local files for page "PHYS_0001"' - +""" def test_physical_pages(sbb_sample_01): assert len(sbb_sample_01.physical_pages) == 3, '3 physical pages' @@ -300,13 +300,12 @@ def test_remove_file_group_regex(sbb_directory_ocrd_mets): assert len(sbb_directory_ocrd_mets.file_groups) == 15 assert len(sbb_directory_ocrd_mets.find_all_files()) == 31 -""" + def test_merge(sbb_sample_01): assert len(sbb_sample_01.file_groups) == 17 other_mets = OcrdMets(filename=assets.path_to('kant_aufklaerung_1784/data/mets.xml'), cache_flag=True) sbb_sample_01.merge(other_mets, fileGrp_mapping={'OCR-D-IMG': 'FOO'}) assert len(sbb_sample_01.file_groups) == 18 -""" def test_invalid_filegrp(): # addresses https://github.com/OCR-D/core/issues/746 From ce3ffc8c4614334f1354db8dc4a5afdcef1c8d70 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 8 Jun 2022 11:43:05 +0200 Subject: [PATCH 08/44] Extend tests for 200 pages --- tests/model/test_ocrd_mets_bench.py | 53 ++++++++++++++++++++++------- tests/model/test_ocrd_mets_cache.py | 4 +-- 2 files changed, 43 insertions(+), 14 deletions(-) diff --git a/tests/model/test_ocrd_mets_bench.py b/tests/model/test_ocrd_mets_bench.py index 673f3782a6..d4f88d0665 100644 --- 
a/tests/model/test_ocrd_mets_bench.py +++ b/tests/model/test_ocrd_mets_bench.py @@ -46,30 +46,28 @@ def _build_mets(number_of_pages, force=False, cache_flag=False): def assert_len(expected_len, mets, kwargs): test_list = mets.find_all_files(**kwargs) - # print("kwargs: %s" % kwargs) - # print("expected_len= %s, assert_len= %s" %(expected_len, len(test_list))) assert expected_len == len(test_list) def benchmark_find_files(number_of_pages, mets): benchmark_find_files_filegrp(number_of_pages, mets) benchmark_find_files_fileid(number_of_pages, mets) - #benchmark_find_files_all(number_of_pages, mets) + benchmark_find_files_all(number_of_pages, mets) def benchmark_find_files_filegrp(number_of_pages, mets): - # Best case - first fileGrp + # Best case - first fileGrp assert_len((number_of_pages * REGIONS_PER_PAGE), mets, dict(fileGrp='SEG-REG')) # Worst case - does not exist assert_len(0, mets, dict(fileGrp='SEG-REG-NOTEXIST')) def benchmark_find_files_fileid(number_of_pages, mets): - # Best case - first file ID + # Best case - first file ID assert_len(1, mets, dict(ID='FULL_0001_TIF')) # Worst case - does not exist assert_len(0, mets, dict(ID='FULL_0001_TIF-NOTEXISTS')) # Get all files, i.e., pass an empty search parameter -> dict() -#def benchmark_find_files_all(number_of_pages, mets): -# assert_len((number_of_pages * FILES_PER_PAGE), mets, dict()) +def benchmark_find_files_all(number_of_pages, mets): + assert_len((number_of_pages * FILES_PER_PAGE), mets, dict()) # ----- METS files global variables ----- # @@ -77,8 +75,9 @@ def benchmark_find_files_fileid(number_of_pages, mets): mets_10 = None mets_20 = None mets_50 = None +mets_200 = None -# ----- Build mets files with 5-10-20-50 pages ----- # +# ----- Build mets files with 5-10-20-50-200 pages ----- # @mark.benchmark(group="build") def test_b5(benchmark): @benchmark @@ -106,8 +105,15 @@ def test_b50(benchmark): def result(): global mets_50 mets_50 = _build_mets(50, force=True) + +@mark.benchmark(group="build") +def 
test_b200(benchmark): + @benchmark + def result(): + global mets_200 + mets_200 = _build_mets(200, force=True) -# ----- Search for files with 5-10-20-50 pages ----- # +# ----- Search for files with 5-10-20-50-200 pages ----- # @mark.benchmark(group="search") def test_s5(benchmark): @benchmark @@ -136,11 +142,18 @@ def ret(): global mets_50 benchmark_find_files(50, mets_50) +@mark.benchmark(group="search") +def test_s200(benchmark): + @benchmark + def ret(): + global mets_200 + benchmark_find_files(200, mets_200) + del mets_5 del mets_10 del mets_20 del mets_50 - +del mets_200 # ----- METS files (cached) global variables ----- # @@ -148,8 +161,9 @@ def ret(): mets_c_10 = None mets_c_20 = None mets_c_50 = None +mets_c_200 = None -# ----- Build mets files (cached) with 5-10-20-50 pages ----- # +# ----- Build mets files (cached) with 5-10-20-50-200 pages ----- # @mark.benchmark(group="build") def test_b5_c(benchmark): @benchmark @@ -177,8 +191,15 @@ def test_b50_c(benchmark): def result(): global mets_c_50 mets_c_50 = _build_mets(50, force=True, cache_flag=True) + +@mark.benchmark(group="build") +def test_b200_c(benchmark): + @benchmark + def result(): + global mets_c_200 + mets_c_200 = _build_mets(200, force=True, cache_flag=True) -# ----- Search for files (cached) with 5-10-20-50 pages ----- # +# ----- Search for files (cached) with 5-10-20-50-200 pages ----- # @mark.benchmark(group="search") def test_s5_c(benchmark): @benchmark @@ -207,10 +228,18 @@ def ret(): global mets_c_50 benchmark_find_files(50, mets_c_50) +@mark.benchmark(group="search") +def test_s200_c(benchmark): + @benchmark + def ret(): + global mets_c_200 + benchmark_find_files(200, mets_c_200) + del mets_c_5 del mets_c_10 del mets_c_20 del mets_c_50 +del mets_c_200 def manual_t(): mets = _build_mets(2, cache_flag=False) diff --git a/tests/model/test_ocrd_mets_cache.py b/tests/model/test_ocrd_mets_cache.py index 7380434006..691f422bc7 100644 --- a/tests/model/test_ocrd_mets_cache.py +++ 
b/tests/model/test_ocrd_mets_cache.py @@ -76,11 +76,11 @@ def test_find_all_files(sbb_sample_01): assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_0001 and PHYS_0002' -""" + def test_find_all_files_local_only(sbb_sample_01): assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001', local_only=True)) == 3, '3 local files for page "PHYS_0001"' -""" + def test_physical_pages(sbb_sample_01): assert len(sbb_sample_01.physical_pages) == 3, '3 physical pages' From 39bdf5e7baa5dc966798ca8881f22a372b4444dd Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 8 Jun 2022 12:10:00 +0200 Subject: [PATCH 09/44] Comment out test case for 200 pages - takes too long --- tests/model/test_ocrd_mets_bench.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/model/test_ocrd_mets_bench.py b/tests/model/test_ocrd_mets_bench.py index d4f88d0665..1d42a4cddb 100644 --- a/tests/model/test_ocrd_mets_bench.py +++ b/tests/model/test_ocrd_mets_bench.py @@ -105,13 +105,15 @@ def test_b50(benchmark): def result(): global mets_50 mets_50 = _build_mets(50, force=True) - + +""" @mark.benchmark(group="build") def test_b200(benchmark): @benchmark def result(): global mets_200 mets_200 = _build_mets(200, force=True) +""" # ----- Search for files with 5-10-20-50-200 pages ----- # @mark.benchmark(group="search") @@ -142,12 +144,14 @@ def ret(): global mets_50 benchmark_find_files(50, mets_50) +""" @mark.benchmark(group="search") def test_s200(benchmark): @benchmark def ret(): global mets_200 benchmark_find_files(200, mets_200) +""" del mets_5 del mets_10 @@ -191,13 +195,15 @@ def test_b50_c(benchmark): def result(): global mets_c_50 mets_c_50 = _build_mets(50, force=True, cache_flag=True) - + +""" @mark.benchmark(group="build") def test_b200_c(benchmark): @benchmark def result(): global mets_c_200 
mets_c_200 = _build_mets(200, force=True, cache_flag=True) +""" # ----- Search for files (cached) with 5-10-20-50-200 pages ----- # @mark.benchmark(group="search") @@ -228,13 +234,15 @@ def ret(): global mets_c_50 benchmark_find_files(50, mets_c_50) +""" @mark.benchmark(group="search") def test_s200_c(benchmark): @benchmark def ret(): global mets_c_200 benchmark_find_files(200, mets_c_200) - +""" + del mets_c_5 del mets_c_10 del mets_c_20 From f8d3ac246bab71b6db6cd30bd302fa6715e12319 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 15 Jun 2022 11:16:21 +0200 Subject: [PATCH 10/44] No change. Trigger scrutinizer again. --- tests/model/test_ocrd_mets_bench.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/model/test_ocrd_mets_bench.py b/tests/model/test_ocrd_mets_bench.py index 1d42a4cddb..7a58458007 100644 --- a/tests/model/test_ocrd_mets_bench.py +++ b/tests/model/test_ocrd_mets_bench.py @@ -11,6 +11,7 @@ import pprint + # LOG = getLogger('ocrd.benchmark.mets') GRPS_REG = ['SEG-REG', 'SEG-REPAIR', 'SEG-REG-DESKEW', 'SEG-REG-DESKEW-CLIP', 'SEG-LINE', 'SEG-REPAIR-LINE', 'SEG-LINE-RESEG-DEWARP'] From b423d1db50f276bf821aa7dbfe10e56c066a6799 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Tue, 28 Jun 2022 11:11:45 +0200 Subject: [PATCH 11/44] Include extreme example benchmarking tests --- tests/model/mets_bench_extreme.py | 277 ++++++++++++++++++++++++++++ tests/model/test_ocrd_mets_bench.py | 61 ++---- 2 files changed, 292 insertions(+), 46 deletions(-) create mode 100644 tests/model/mets_bench_extreme.py diff --git a/tests/model/mets_bench_extreme.py b/tests/model/mets_bench_extreme.py new file mode 100644 index 0000000000..63b30e31db --- /dev/null +++ b/tests/model/mets_bench_extreme.py @@ -0,0 +1,277 @@ +# -*- coding: utf-8 -*- + +from contextlib import contextmanager +from time import time + +from pytest import main, fixture, mark + +from ocrd import Resolver +from ocrd_utils import MIME_TO_EXT, getLogger +from ocrd_models import OcrdMets + +logger = 
getLogger('ocrd.benchmark.mets') + +GRPS_REG = ['SEG-REG', 'SEG-REPAIR', 'SEG-REG-DESKEW', 'SEG-REG-DESKEW-CLIP', 'SEG-LINE', 'SEG-REPAIR-LINE', 'SEG-LINE-RESEG-DEWARP'] +GRPS_IMG = ['FULL', 'PRESENTATION', 'BIN', 'CROP', 'BIN2', 'BIN-DENOISE', 'BIN-DENOISE-DESKEW', 'OCR'] + +REGIONS_PER_PAGE = 2 +LINES_PER_REGION = 2 +FILES_PER_PAGE = len(GRPS_IMG) * LINES_PER_REGION + len(GRPS_REG) * REGIONS_PER_PAGE + +# Caching is disabled by default +def _build_mets(number_of_pages, force=False, cache_flag=False): + mets = OcrdMets.empty_mets(cache_flag=cache_flag) + mets._number_of_pages = number_of_pages + + for n in ['%04d' % (n + 1) for n in range(number_of_pages)]: + _add_file = lambda n, fileGrp, mimetype, ID=None: mets.add_file( + fileGrp, + mimetype=mimetype, + pageId='PHYS_%s' % n, + ID=ID if ID else '%s_%s_%s' % (fileGrp, n, MIME_TO_EXT.get(mimetype)[1:].upper()), + url='%s/%s%s' % (fileGrp, ID if ID else '%s_%s_%s' % (fileGrp, n, MIME_TO_EXT.get(mimetype)[1:].upper()), MIME_TO_EXT.get(mimetype)) + ) + for grp in GRPS_IMG: + # LINES_PER_REGION = 2 + _add_file(n, grp, 'image/tiff') + _add_file(n, grp, 'application/vnd.prima.page+xml') + for grp in GRPS_REG: + # REGIONS_PER_PAGE = 2 + for region_n in range(REGIONS_PER_PAGE): + _add_file(n, grp, 'image/png', '%s_%s_region%s' % (grp, n, region_n)) + + return mets + +def assert_len(expected_len, mets, kwargs): + test_list = mets.find_all_files(**kwargs) + assert expected_len == len(test_list) + +def benchmark_find_files(number_of_pages, mets): + benchmark_find_files_filegrp(number_of_pages, mets) + benchmark_find_files_fileid(number_of_pages, mets) + benchmark_find_files_physical_page(number_of_pages, mets) + # This is not really useful to measure. 
+ # We iterate all files in both cached and non-cached in the same routine + # When no specific search parameters are provided + # benchmark_find_files_all(number_of_pages, mets) + +def benchmark_find_files_filegrp(number_of_pages, mets): + # Best case - first fileGrp + assert_len((number_of_pages * REGIONS_PER_PAGE), mets, dict(fileGrp='SEG-REG')) + # Worst case - does not exist + assert_len(0, mets, dict(fileGrp='SEG-REG-NOTEXIST')) + +def benchmark_find_files_fileid(number_of_pages, mets): + # Best case - first file ID + assert_len(1, mets, dict(ID='FULL_0001_TIF', fileGrp='FULL')) + # Worst case - does not exist + assert_len(0, mets, dict(ID='FULL_0001_TIF-NOTEXISTS', fileGrp='FULL-NOTEXIST')) + +def benchmark_find_files_physical_page(number_of_pages, mets): + # Best case - first physical page + assert_len(FILES_PER_PAGE, mets, dict(pageId='PHYS_0001')) + # Worst case - does not exist + assert_len(0, mets, dict(pageId='PHYS_0001-NOTEXISTS')) + +# Get all files, i.e., pass an empty search parameter -> dict() +def benchmark_find_files_all(number_of_pages, mets): + assert_len((number_of_pages * FILES_PER_PAGE), mets, dict()) + + + + +# ---- BENCHMARKING for 50-500-1000-2000-5000 pages ---- # + +# ----- 50 pages -> build, search, build (cached), search (cached) ----- # +mets_50 = None +@mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b50(benchmark): + @benchmark + def result(): + global mets_50 + mets_50 = _build_mets(50, force=True) + +@mark.benchmark(group="search", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s50(benchmark): + @benchmark + def ret(): + global mets_50 + benchmark_find_files(50, mets_50) +del mets_50 + +mets_c_50 = None +@mark.benchmark(group="build_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b50_c(benchmark): + @benchmark + def result(): + global mets_c_50 + mets_c_50 = _build_mets(50, force=True, cache_flag=True) + 
+@mark.benchmark(group="search_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s50_c(benchmark): + @benchmark + def ret(): + global mets_c_50 + benchmark_find_files(50, mets_c_50) +del mets_c_50 +# ----------------------------------------------------------------------- # + + + +# ----- 500 pages -> build, search, build (cached), search (cached) ----- # +mets_500 = None +@mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b500(benchmark): + @benchmark + def result(): + global mets_500 + mets_500 = _build_mets(500, force=True) + +@mark.benchmark(group="search", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s500(benchmark): + @benchmark + def ret(): + global mets_500 + benchmark_find_files(500, mets_500) +del mets_500 + + +mets_c_500 = None +@mark.benchmark(group="build_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b500_c(benchmark): + @benchmark + def result(): + global mets_c_500 + mets_c_500 = _build_mets(500, force=True, cache_flag=True) + +@mark.benchmark(group="search_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s500_c(benchmark): + @benchmark + def ret(): + global mets_c_500 + benchmark_find_files(500, mets_c_500) +del mets_c_500 + +# ----------------------------------------------------------------------- # + + + +# ----- 1000 pages -> build, search, build (cached), search (cached) ----- # +mets_1000 = None +@mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b1000(benchmark): + @benchmark + def result(): + global mets_1000 + mets_1000 = _build_mets(1000, force=True) + +@mark.benchmark(group="search", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s1000(benchmark): + @benchmark + def ret(): + global mets_1000 + benchmark_find_files(1000, mets_1000) +del mets_1000 + +mets_c_1000 = None +@mark.benchmark(group="build_cached", 
max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b1000_c(benchmark): + @benchmark + def result(): + global mets_c_1000 + mets_c_1000 = _build_mets(1000, force=True, cache_flag=True) + +@mark.benchmark(group="search_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s1000_c(benchmark): + @benchmark + def ret(): + global mets_c_1000 + benchmark_find_files(1000, mets_c_1000) +del mets_c_1000 + +# ------------------------------------------------------------------------ # + + + +# ----- 2000 pages -> build, search, build (cached), search (cached) ----- # +mets_2000 = None +@mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b2000(benchmark): + @benchmark + def result(): + global mets_2000 + mets_2000 = _build_mets(2000, force=True) + +@mark.benchmark(group="search", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s2000(benchmark): + @benchmark + def ret(): + global mets_2000 + benchmark_find_files(2000, mets_2000) +del mets_2000 + +mets_c_2000 = None +@mark.benchmark(group="build_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b2000_c(benchmark): + @benchmark + def result(): + global mets_c_2000 + mets_c_2000 = _build_mets(2000, force=True, cache_flag=True) + +@mark.benchmark(group="search_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s2000_c(benchmark): + @benchmark + def ret(): + global mets_c_2000 + benchmark_find_files(2000, mets_c_2000) +del mets_c_2000 + +# ------------------------------------------------------------------------ # + + + +# ----- 5000 pages -> build, search, build (cached), search (cached) ----- # +mets_5000 = None +@mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b5000(benchmark): + @benchmark + def result(): + global mets_5000 + mets_5000 = _build_mets(5000, force=True) + +@mark.benchmark(group="search", 
max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s5000(benchmark): + @benchmark + def ret(): + global mets_5000 + benchmark_find_files(5000, mets_5000) +del mets_5000 + +mets_c_5000 = None +@mark.benchmark(group="build_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b5000_c(benchmark): + @benchmark + def result(): + global mets_c_5000 + mets_c_5000 = _build_mets(5000, force=True, cache_flag=True) + +@mark.benchmark(group="search_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s5000_c(benchmark): + @benchmark + def ret(): + global mets_c_5000 + benchmark_find_files(5000, mets_c_5000) +del mets_c_5000 + +# ------------------------------------------------------------------------ # + +if __name__ == '__main__': + args = [''] + # args.append('--benchmark-max-time=10') + # args.append('--benchmark-min-time=0.1') + # args.append('--benchmark-warmup=False') + # args.append('--benchmark-disable-gc') + args.append('--benchmark-verbose') + args.append('--benchmark-min-rounds=1') + args.append('--tb=short') + main(args) diff --git a/tests/model/test_ocrd_mets_bench.py b/tests/model/test_ocrd_mets_bench.py index 7a58458007..ace6387336 100644 --- a/tests/model/test_ocrd_mets_bench.py +++ b/tests/model/test_ocrd_mets_bench.py @@ -56,19 +56,25 @@ def benchmark_find_files(number_of_pages, mets): def benchmark_find_files_filegrp(number_of_pages, mets): # Best case - first fileGrp - assert_len((number_of_pages * REGIONS_PER_PAGE), mets, dict(fileGrp='SEG-REG')) - # Worst case - does not exist - assert_len(0, mets, dict(fileGrp='SEG-REG-NOTEXIST')) + assert_len((number_of_pages * REGIONS_PER_PAGE), mets, dict(fileGrp='SEG-REG')) + # Worst case - does not exist + assert_len(0, mets, dict(fileGrp='SEG-REG-NOTEXIST')) def benchmark_find_files_fileid(number_of_pages, mets): # Best case - first file ID - assert_len(1, mets, dict(ID='FULL_0001_TIF')) - # Worst case - does not exist - assert_len(0, mets, 
dict(ID='FULL_0001_TIF-NOTEXISTS')) + assert_len(1, mets, dict(ID='FULL_0001_TIF')) + # Worst case - does not exist + assert_len(0, mets, dict(ID='FULL_0001_TIF-NOTEXISTS')) + +def benchmark_find_files_physical_page(number_of_pages, mets): + # Best case - first physical page + assert_len(1, mets, dict(pageId='PHYS_0001')) + # Worst case - does not exist + assert_len(0, mets, dict(pageId='PHYS_0001-NOTEXISTS')) # Get all files, i.e., pass an empty search parameter -> dict() def benchmark_find_files_all(number_of_pages, mets): - assert_len((number_of_pages * FILES_PER_PAGE), mets, dict()) + assert_len((number_of_pages * FILES_PER_PAGE), mets, dict()) # ----- METS files global variables ----- # @@ -76,7 +82,6 @@ def benchmark_find_files_all(number_of_pages, mets): mets_10 = None mets_20 = None mets_50 = None -mets_200 = None # ----- Build mets files with 5-10-20-50-200 pages ----- # @mark.benchmark(group="build") @@ -106,15 +111,7 @@ def test_b50(benchmark): def result(): global mets_50 mets_50 = _build_mets(50, force=True) - -""" -@mark.benchmark(group="build") -def test_b200(benchmark): - @benchmark - def result(): - global mets_200 - mets_200 = _build_mets(200, force=True) -""" + # ----- Search for files with 5-10-20-50-200 pages ----- # @mark.benchmark(group="search") @@ -145,20 +142,11 @@ def ret(): global mets_50 benchmark_find_files(50, mets_50) -""" -@mark.benchmark(group="search") -def test_s200(benchmark): - @benchmark - def ret(): - global mets_200 - benchmark_find_files(200, mets_200) -""" del mets_5 del mets_10 del mets_20 del mets_50 -del mets_200 # ----- METS files (cached) global variables ----- # @@ -166,7 +154,6 @@ def ret(): mets_c_10 = None mets_c_20 = None mets_c_50 = None -mets_c_200 = None # ----- Build mets files (cached) with 5-10-20-50-200 pages ----- # @mark.benchmark(group="build") @@ -196,15 +183,7 @@ def test_b50_c(benchmark): def result(): global mets_c_50 mets_c_50 = _build_mets(50, force=True, cache_flag=True) - -""" 
-@mark.benchmark(group="build") -def test_b200_c(benchmark): - @benchmark - def result(): - global mets_c_200 - mets_c_200 = _build_mets(200, force=True, cache_flag=True) -""" + # ----- Search for files (cached) with 5-10-20-50-200 pages ----- # @mark.benchmark(group="search") @@ -234,21 +213,11 @@ def test_s50_c(benchmark): def ret(): global mets_c_50 benchmark_find_files(50, mets_c_50) - -""" -@mark.benchmark(group="search") -def test_s200_c(benchmark): - @benchmark - def ret(): - global mets_c_200 - benchmark_find_files(200, mets_c_200) -""" del mets_c_5 del mets_c_10 del mets_c_20 del mets_c_50 -del mets_c_200 def manual_t(): mets = _build_mets(2, cache_flag=False) From dc6e387f90c32bc858e52be3c7a41c18ee83b26f Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Tue, 4 Oct 2022 12:18:17 +0200 Subject: [PATCH 12/44] Extreme benchmark test for 750 files per page (5000 pages) --- tests/model/mets_bench_extreme_additional.py | 125 +++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 tests/model/mets_bench_extreme_additional.py diff --git a/tests/model/mets_bench_extreme_additional.py b/tests/model/mets_bench_extreme_additional.py new file mode 100644 index 0000000000..31332cada9 --- /dev/null +++ b/tests/model/mets_bench_extreme_additional.py @@ -0,0 +1,125 @@ +# -*- coding: utf-8 -*- + +from contextlib import contextmanager +from time import time + +from pytest import main, fixture, mark + +from ocrd import Resolver +from ocrd_utils import MIME_TO_EXT, getLogger +from ocrd_models import OcrdMets + +logger = getLogger('ocrd.benchmark.mets') + +GRPS_REG = ['SEG-REG', 'SEG-REPAIR', 'SEG-REG-DESKEW', 'SEG-REG-DESKEW-CLIP', 'SEG-LINE', 'SEG-REPAIR-LINE', 'SEG-LINE-RESEG-DEWARP'] +GRPS_IMG = ['FULL', 'PRESENTATION', 'BIN', 'CROP', 'BIN2', 'BIN-DENOISE', 'BIN-DENOISE-DESKEW', 'OCR'] + +# 750 files per page +REGIONS_PER_PAGE = 50 +LINES_PER_REGION = 50 +FILES_PER_PAGE = len(GRPS_IMG) * LINES_PER_REGION + len(GRPS_REG) * REGIONS_PER_PAGE + +# Caching is 
disabled by default +def _build_mets(number_of_pages, force=False, cache_flag=False): + mets = OcrdMets.empty_mets(cache_flag=cache_flag) + mets._number_of_pages = number_of_pages + + for n in ['%04d' % (n + 1) for n in range(number_of_pages)]: + _add_file = lambda n, fileGrp, mimetype, ID=None: mets.add_file( + fileGrp, + mimetype=mimetype, + pageId='PHYS_%s' % n, + ID=ID if ID else '%s_%s_%s' % (fileGrp, n, MIME_TO_EXT.get(mimetype)[1:].upper()), + url='%s/%s%s' % (fileGrp, ID if ID else '%s_%s_%s' % (fileGrp, n, MIME_TO_EXT.get(mimetype)[1:].upper()), MIME_TO_EXT.get(mimetype)) + ) + for grp in GRPS_IMG: + # LINES_PER_REGION = 2 + _add_file(n, grp, 'image/tiff') + _add_file(n, grp, 'application/vnd.prima.page+xml') + for grp in GRPS_REG: + # REGIONS_PER_PAGE = 2 + for region_n in range(REGIONS_PER_PAGE): + _add_file(n, grp, 'image/png', '%s_%s_region%s' % (grp, n, region_n)) + + return mets + +def assert_len(expected_len, mets, kwargs): + test_list = mets.find_all_files(**kwargs) + assert expected_len == len(test_list) + +def benchmark_find_files(number_of_pages, mets): + benchmark_find_files_filegrp(number_of_pages, mets) + benchmark_find_files_fileid(number_of_pages, mets) + benchmark_find_files_physical_page(number_of_pages, mets) + # This is not really useful to measure. 
+ # We iterate all files in both cached and non-cached in the same routine + # When no specific search parameters are provided + # benchmark_find_files_all(number_of_pages, mets) + +def benchmark_find_files_filegrp(number_of_pages, mets): + # Best case - first fileGrp + assert_len((number_of_pages * REGIONS_PER_PAGE), mets, dict(fileGrp='SEG-REG')) + # Worst case - does not exist + assert_len(0, mets, dict(fileGrp='SEG-REG-NOTEXIST')) + +def benchmark_find_files_fileid(number_of_pages, mets): + # Best case - first file ID + assert_len(1, mets, dict(ID='FULL_0001_TIF', fileGrp='FULL')) + # Worst case - does not exist + assert_len(0, mets, dict(ID='FULL_0001_TIF-NOTEXISTS', fileGrp='FULL-NOTEXIST')) + +def benchmark_find_files_physical_page(number_of_pages, mets): + # Best case - first physical page + assert_len(FILES_PER_PAGE, mets, dict(pageId='PHYS_0001')) + # Worst case - does not exist + assert_len(0, mets, dict(pageId='PHYS_0001-NOTEXISTS')) + +# Get all files, i.e., pass an empty search parameter -> dict() +def benchmark_find_files_all(number_of_pages, mets): + assert_len((number_of_pages * FILES_PER_PAGE), mets, dict()) + +# ----- 5000 pages -> build, search, build (cached), search (cached) ----- # +mets_5000 = None +@mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b5000(benchmark): + @benchmark + def result(): + global mets_5000 + mets_5000 = _build_mets(5000, force=True) + +@mark.benchmark(group="search", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s5000(benchmark): + @benchmark + def ret(): + global mets_5000 + benchmark_find_files(5000, mets_5000) +del mets_5000 + +mets_c_5000 = None +@mark.benchmark(group="build_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b5000_c(benchmark): + @benchmark + def result(): + global mets_c_5000 + mets_c_5000 = _build_mets(5000, force=True, cache_flag=True) + +@mark.benchmark(group="search_cached", max_time=0.1, 
min_rounds=1, disable_gc=False, warmup=False) +def test_s5000_c(benchmark): + @benchmark + def ret(): + global mets_c_5000 + benchmark_find_files(5000, mets_c_5000) +del mets_c_5000 + +# ------------------------------------------------------------------------ # + +if __name__ == '__main__': + args = [''] + # args.append('--benchmark-max-time=10') + # args.append('--benchmark-min-time=0.1') + # args.append('--benchmark-warmup=False') + # args.append('--benchmark-disable-gc') + args.append('--benchmark-verbose') + args.append('--benchmark-min-rounds=1') + args.append('--tb=short') + main(args) From e86b8c2020c085a877a9a240bb9fc639daaf3778 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 12 Oct 2022 17:53:50 +0200 Subject: [PATCH 13/44] clean the changes --- ocrd_models/ocrd_models/ocrd_mets.py | 393 ++++----------------------- 1 file changed, 46 insertions(+), 347 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index c5237f54da..0e4a3e2dda 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -4,7 +4,6 @@ from datetime import datetime import re from lxml import etree as ET -from copy import deepcopy from ocrd_utils import ( is_local_filename, @@ -42,7 +41,7 @@ class OcrdMets(OcrdXmlDocument): """ @staticmethod - def empty_mets(now=None, cache_flag=False): + def empty_mets(now=None): """ Create an empty METS file from bundled template. 
""" @@ -51,7 +50,7 @@ def empty_mets(now=None, cache_flag=False): tpl = METS_XML_EMPTY.decode('utf-8') tpl = tpl.replace('{{ VERSION }}', VERSION) tpl = tpl.replace('{{ NOW }}', '%s' % now) - return OcrdMets(content=tpl.encode('utf-8'), cache_flag=cache_flag) + return OcrdMets(content=tpl.encode('utf-8')) def __init__(self, **kwargs): """ @@ -59,129 +58,12 @@ def __init__(self, **kwargs): """ super(OcrdMets, self).__init__(**kwargs) - # If cache is enabled - if self._cache_flag: - - # Cache for the files (mets:file) - two nested dictionaries - # The outer dictionary's Key: 'fileGrp.USE' - # The outer dictionary's Value: Inner dictionary - # The inner dictionary's Key: 'file.ID' - # The inner dictionary's Value: a 'file' object at some memory location - self._file_cache = {} - - # Cache for the pages (mets:div) - # The dictionary's Key: 'div.ID' - # The dictionary's Value: a 'div' object at some memory location - self._page_cache = {} - - # Cache for the file pointers (mets:fptr) - two nested dictionaries - # The outer dictionary's Key: 'div.ID' - # The outer dictionary's Value: Inner dictionary - # The inner dictionary's Key: 'fptr.FILEID' - # The inner dictionary's Value: a 'fptr' object at some memory location - self._fptr_cache = {} - - # Note, if the empty_mets() function is used to instantiate OcrdMets - # Then the cache is empty even after this operation - self._fill_caches() - - def __exit__(self): - """ - - """ - if self._cache_flag: - self._clear_caches() - def __str__(self): """ String representation """ return 'OcrdMets[fileGrps=%s,files=%s]' % (self.file_groups, list(self.find_files())) - def _fill_caches(self): - """ - Fills the caches with fileGrps and FileIDs - """ - - tree_root = self._tree.getroot() - - # Fill with files - el_fileGrp_list = tree_root.find(".//mets:fileSec", NS) - if el_fileGrp_list is None or len(el_fileGrp_list) == 0: - return - else: - log = getLogger('ocrd_models.ocrd_mets._fill_caches-files') - - for el_fileGrp in 
el_fileGrp_list: - fileGrp_use = el_fileGrp.get('USE') - - # Note: SBB0000F29300010000/data/mets.xml contains None - # values due to the comments inside the file - if fileGrp_use is None: - continue - - # Assign an empty dictionary that will hold the files of the added fileGrp - self._file_cache[fileGrp_use] = {} - - for el_file in el_fileGrp: - file_id = el_file.get('ID') - self._file_cache[fileGrp_use].update({file_id : el_file}) - # log.info("File added to the cache: %s" % file_id) - - # Fill with pages - el_div_list = tree_root.findall(".//mets:div", NS) - if el_div_list is None or len(el_div_list) == 0: - return - else: - log = getLogger('ocrd_models.ocrd_mets._fill_caches-pages') - - for el_div in el_div_list: - div_id = el_div.get('ID') - print("DIV_ID: %s" % el_div.get('ID')) - - # May not be needed if there are no comments inside the mets file - if div_id is None: - continue - - self._page_cache[div_id] = el_div - - # Assign an empty dictionary that will hold the fptr of the added page (div) - self._fptr_cache[div_id] = {} - - # log.info("Page_id added to the cache: %s" % div_id) - - for el_fptr in el_div: - self._fptr_cache[div_id].update({el_fptr.get('FILEID') : el_fptr}) - # log.info("Fptr added to the cache: %s" % el_fptr.get('FILEID')) - - # log.info("Len of page_cache: %s" % len(self._page_cache)) - # log.info("Len of fptr_cache: %s" % len(self._fptr_cache)) - - def _clear_caches(self): - """ - Deallocates the caches - """ - - self._file_cache = None - self._page_cache = None - self._fptr_cache = None - - """ - log = getLogger('ocrd_models.ocrd_mets._clear_caches') - - fileGrp_counter = 0 - fileId_counter = 0 - - for key in list(self._file_cache): - for inner_key in list(self._file_cache[key]): - del self._file_cache[key][inner_key] - fileId_counter += 1 - del self._file_cache[key] - fileGrp_counter += 1 - - log.info("_clear_caches> fileGrp: %d, fileId: %d" % (fileGrp_counter, fileId_counter)) - """ - @property def unique_identifier(self): """ @@ 
-237,11 +119,6 @@ def file_groups(self): """ List the `@USE` of all `mets:fileGrp` entries. """ - - # WARNING: Actually we cannot return strings in place of elements! - #if self._cache_flag: - # return self._file_cache.keys() - return [el.get('USE') for el in self._tree.getroot().findall('.//mets:fileGrp', NS)] def find_all_files(self, *args, **kwargs): @@ -250,7 +127,6 @@ def find_all_files(self, *args, **kwargs): Equivalent to ``list(self.find_files(...))`` """ - return list(self.find_files(*args, **kwargs)) # pylint: disable=multiple-statements @@ -291,26 +167,14 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None pageIds_expanded = [] for pageId_ in pageIds: if '..' in pageId_: - pageIds_expanded += generate_range(*pageId_.split('..', 2)) + pageIds_expanded += generate_range(*pageId_.split('..', 1)) pageIds += pageIds_expanded - - # Note: This code could be further simplified - # Once we get rid of the 'self._cache_flag' checks - # and using the cache becomes the default - if self._cache_flag: - for page in self._page_cache.keys(): - if (page in pageIds if isinstance(pageIds, list) else - pageIds.fullmatch(page)): - pageId.extend(self._fptr_cache[page]) - else: - # Note: this inline written code is horrible to understand and debug... 
- for page in self._tree.getroot().xpath( - '//mets:div[@TYPE="page"]', namespaces=NS): - if (page.get('ID') in pageIds if isinstance(pageIds, list) else - pageIds.fullmatch(page.get('ID'))): - pageId.extend( - [fptr.get('FILEID') for fptr in page.findall('mets:fptr', NS)]) - + for page in self._tree.getroot().xpath( + '//mets:div[@TYPE="page"]', namespaces=NS): + if (page.get('ID') in pageIds if isinstance(pageIds, list) else + pageIds.fullmatch(page.get('ID'))): + pageId.extend( + [fptr.get('FILEID') for fptr in page.findall('mets:fptr', NS)]) if ID and ID.startswith(REGEX_PREFIX): ID = re.compile(ID[REGEX_PREFIX_LEN:]) if fileGrp and fileGrp.startswith(REGEX_PREFIX): @@ -319,21 +183,7 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None mimetype = re.compile(mimetype[REGEX_PREFIX_LEN:]) if url and url.startswith(REGEX_PREFIX): url = re.compile(url[REGEX_PREFIX_LEN:]) - - candidates = [] - - if self._cache_flag: - if fileGrp: - if isinstance(fileGrp, str): - candidates += self._file_cache.get(fileGrp, {}).values() - else: - candidates = [x for fileGrp_needle, el_file_list in self._file_cache.items() if fileGrp.match(fileGrp_needle) for x in el_file_list.values()] - else: - candidates = [el_file for id_to_file in self._file_cache.values() for el_file in id_to_file.values()] - else: - candidates = self._tree.getroot().xpath('//mets:file', namespaces=NS) - - for cand in candidates: + for cand in self._tree.getroot().xpath('//mets:file', namespaces=NS): if ID: if isinstance(ID, str): if not ID == cand.get('ID'): continue @@ -343,7 +193,7 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None if pageId is not None and cand.get('ID') not in pageId: continue - if not self._cache_flag and fileGrp: + if fileGrp: if isinstance(fileGrp, str): if cand.getparent().get('USE') != fileGrp: continue else: @@ -365,8 +215,6 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None else: if not 
url.fullmatch(cand_url): continue - # Note: why we instantiate a class only to find out that the local_only is set afterwards - # Checking local_only and url before instantiation should be better? f = OcrdFile(cand, mets=self) # If only local resources should be returned and f is not a file path: skip the file @@ -390,11 +238,6 @@ def add_file_group(self, fileGrp): if el_fileGrp is None: el_fileGrp = ET.SubElement(el_fileSec, TAG_METS_FILEGRP) el_fileGrp.set('USE', fileGrp) - - if self._cache_flag: - # Assign an empty dictionary that will hold the files of the added fileGrp - self._file_cache[fileGrp] = {} - return el_fileGrp def rename_file_group(self, old, new): @@ -406,9 +249,6 @@ def rename_file_group(self, old, new): raise FileNotFoundError("No such fileGrp '%s'" % old) el_fileGrp.set('USE', new) - if self._cache_flag: - self._file_cache[new] = self._file_cache.pop(old) - def remove_file_group(self, USE, recursive=False, force=False): """ Remove a ``mets:fileGrp`` (single fixed ``@USE`` or multiple regex ``@USE``) @@ -444,16 +284,7 @@ def remove_file_group(self, USE, recursive=False, force=False): if not recursive: raise Exception("fileGrp %s is not empty and recursive wasn't set" % USE) for f in files: - # NOTE: Here we know the fileGrp, we should pass it as a parameter self.remove_one_file(f.get('ID')) - - if self._cache_flag: - # Note: Since the files inside the group are removed - # with the 'remove_one_file' method above, - # we should not take care of that again. - # We just remove the fileGrp. 
- del self._file_cache[el_fileGrp.get('USE')] - el_fileGrp.getparent().remove(el_fileGrp) def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force=False, local_filename=None, ignore=False, **kwargs): @@ -479,45 +310,21 @@ def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force raise ValueError("Invalid syntax for mets:file/@ID %s (not an xs:ID)" % ID) if not REGEX_FILE_ID.fullmatch(fileGrp): raise ValueError("Invalid syntax for mets:fileGrp/@USE %s (not an xs:ID)" % fileGrp) - - """ - # Note: we do not benefit enough from having - # a separate cache for fileGrp elements - - if self._cache_flag: - if fileGrp in self._fileGrp_cache: - el_fileGrp = self._fileGrp_cache[fileGrp] - """ - el_fileGrp = self._tree.getroot().find(".//mets:fileGrp[@USE='%s']" % (fileGrp), NS) - if el_fileGrp is None: el_fileGrp = self.add_file_group(fileGrp) - - # Since we are sure that fileGrp parameter is set, - # we could send that parameter to find_files for direct search - mets_file = next(self.find_files(ID=ID, fileGrp=fileGrp), None) + mets_file = next(self.find_files(ID=ID), None) if mets_file and not ignore: if not force: raise Exception("File with ID='%s' already exists" % ID) mets_file.url = url mets_file.mimetype = mimetype mets_file.ID = ID - # The line below uses the pageId setter which - # caches the required data inside - # self._page_cache and self._fptr_cache mets_file.pageId = pageId mets_file.local_filename = local_filename else: - # To get rid of Python's FutureWarning - kwargs = {k: v for k, v in locals().items() if k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v is not None} - el_mets_file = ET.SubElement(el_fileGrp, TAG_METS_FILE) - # The caching of the physical page is done in the OcrdFile constructor - mets_file = OcrdFile(el_mets_file, mets=self, **kwargs) - - if self._cache_flag: - # Add the file to the file cache - self._file_cache[fileGrp].update({ID: el_mets_file}) + kwargs = {k: v for k, v in 
locals().items() if k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v} + mets_file = OcrdFile(ET.SubElement(el_fileGrp, TAG_METS_FILE), mets=self, **kwargs) return mets_file @@ -555,48 +362,20 @@ def remove_one_file(self, ID): ocrd_file = ID ID = ocrd_file.ID else: - # NOTE: We should pass the fileGrp, if known, as a parameter here as well - # Leaving that out for now ocrd_file = next(self.find_files(ID=ID), None) if not ocrd_file: raise FileNotFoundError("File not found: %s" % ID) # Delete the physical page ref - fptrs = [] - if self._cache_flag: - for page in self._fptr_cache.keys(): - if ID in self._fptr_cache[page]: - fptrs.append(self._fptr_cache[page][ID]) - else: - fptrs = self._tree.getroot().findall('.//mets:fptr[@FILEID="%s"]' % ID, namespaces=NS) - - for fptr in fptrs: + for fptr in self._tree.getroot().findall('.//mets:fptr[@FILEID="%s"]' % ID, namespaces=NS): log.info("Delete fptr element %s for page '%s'", fptr, ID) page_div = fptr.getparent() page_div.remove(fptr) - # Remove the fptr from the cache as well - if self._cache_flag: - del self._fptr_cache[page_div.get('ID')][ID] - # delete empty pages if not page_div.getchildren(): log.info("Delete empty page %s", page_div) page_div.getparent().remove(page_div) - # Delete the empty pages from caches as well - if self._cache_flag: - del self._page_cache[page_div.get('ID')] - del self._fptr_cache[page_div.get('ID')] - - # Delete the file reference from the cache - if self._cache_flag: - parent_use = ocrd_file._el.getparent().get('USE') - # Note: if the file is in the XML tree, - # it must also be in the file cache. 
- # Anyway, we perform the checks, then remove - if parent_use in self._file_cache: - if ocrd_file.ID in self._file_cache[parent_use]: - del self._file_cache[parent_use][ocrd_file.ID] # Delete the file reference # pylint: disable=protected-access @@ -609,9 +388,6 @@ def physical_pages(self): """ List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``) """ - if self._cache_flag: - return self._page_cache.values() - return self._tree.getroot().xpath( 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/@ID', namespaces=NS) @@ -623,24 +399,13 @@ def get_physical_pages(self, for_fileIds=None): """ if for_fileIds is None: return self.physical_pages - ret = [None] * len(for_fileIds) - - # Note: This entire function potentially could be further simplified - # TODO: Simplify - if self._cache_flag: - for pageId in self._fptr_cache.keys(): - for fptr in self._fptr_cache[pageId].keys(): - if fptr in for_fileIds: - ret[for_fileIds.index(fptr)] = pageId - else: - for page in self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', - namespaces=NS): - for fptr in page.findall('mets:fptr', NS): - if fptr.get('FILEID') in for_fileIds: - ret[for_fileIds.index(fptr.get('FILEID'))] = page.get('ID') - + for page in self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS): + for fptr in page.findall('mets:fptr', NS): + if fptr.get('FILEID') in for_fileIds: + ret[for_fileIds.index(fptr.get('FILEID'))] = page.get('ID') return ret def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=None): @@ -657,27 +422,9 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N """ # print(pageId, ocrd_file) # delete any page mapping for this file.ID - - # NOTE: The pageId coming from 'test_merge(sbb_sample_01)' is an Element not a string - # if not 
isinstance(pageId, str): - # pageId = pageId.get('ID') - - candidates = [] - - if self._cache_flag: - for page_id in self._fptr_cache.keys(): - if ocrd_file.ID in self._fptr_cache[page_id].keys(): - if self._fptr_cache[page_id][ocrd_file.ID] is not None: - candidates.append(self._fptr_cache[page_id][ocrd_file.ID]) - else: - candidates = self._tree.getroot().findall( + for el_fptr in self._tree.getroot().findall( 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % - ocrd_file.ID, namespaces=NS) - - for el_fptr in candidates: - if self._cache_flag: - del self._fptr_cache[el_fptr.getparent().get('ID')][ocrd_file.ID] - + ocrd_file.ID, namespaces=NS): el_fptr.getparent().remove(el_fptr) # find/construct as necessary @@ -689,15 +436,7 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N if el_seqdiv is None: el_seqdiv = ET.SubElement(el_structmap, TAG_METS_DIV) el_seqdiv.set('TYPE', 'physSequence') - - el_pagediv = None - - if self._cache_flag: - if pageId in self._page_cache.keys(): - el_pagediv = self._page_cache[pageId] - else: - el_pagediv = el_seqdiv.find('mets:div[@ID="%s"]' % pageId, NS) - + el_pagediv = el_seqdiv.find('mets:div[@ID="%s"]' % pageId, NS) if el_pagediv is None: el_pagediv = ET.SubElement(el_seqdiv, TAG_METS_DIV) el_pagediv.set('TYPE', 'page') @@ -706,58 +445,29 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N el_pagediv.set('ORDER', order) if orderlabel: el_pagediv.set('ORDERLABEL', orderlabel) - - if self._cache_flag: - # Create a new entry in the page cache - self._page_cache[pageId] = el_pagediv - # Create a new entry in the fptr cache and - # assign an empty dictionary to hold the fileids - self._fptr_cache[pageId] = {} - el_fptr = ET.SubElement(el_pagediv, TAG_METS_FPTR) el_fptr.set('FILEID', ocrd_file.ID) - if self._cache_flag: - # Assign the ocrd fileID to the pageId in the cache - 
self._fptr_cache[el_pagediv.get('ID')].update({ocrd_file.ID : el_fptr}) - def get_physical_page_for_file(self, ocrd_file): """ Get the physical page ID (``@ID`` of the physical ``mets:structMap`` ``mets:div`` entry) corresponding to the ``mets:file`` :py:attr:`ocrd_file`. """ - ret = [] - - if self._cache_flag: - for pageId in self._fptr_cache.keys(): - if ocrd_file.ID in self._fptr_cache[pageId].keys(): - ret.append(self._page_cache[pageId].get('ID')) - else: - ret = self._tree.getroot().xpath( - '/mets:mets/mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][./mets:fptr[@FILEID="%s"]]/@ID' % - ocrd_file.ID, namespaces=NS) - - # To get rid of the python's FutureWarning - if len(ret): + ret = self._tree.getroot().xpath( + '/mets:mets/mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][./mets:fptr[@FILEID="%s"]]/@ID' % + ocrd_file.ID, namespaces=NS) + if ret: return ret[0] def remove_physical_page(self, ID): """ Delete page (physical ``mets:structMap`` ``mets:div`` entry ``@ID``) :py:attr:`ID`. """ - mets_div = None - if self._cache_flag: - if ID in self._page_cache.keys(): - mets_div = [self._page_cache[ID]] - else: - mets_div = self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][@ID="%s"]' % ID, - namespaces=NS) - if mets_div is not None: + mets_div = self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][@ID="%s"]' % ID, + namespaces=NS) + if mets_div: mets_div[0].getparent().remove(mets_div[0]) - if self._cache_flag: - del self._page_cache[ID] - del self._fptr_cache[ID] def remove_physical_page_fptr(self, fileId): """ @@ -765,32 +475,17 @@ def remove_physical_page_fptr(self, fileId): Returns: List of pageIds that mets:fptrs were deleted from """ - - # Question: What is the reason to keep a list of mets_fptrs? 
- # Do we have a situation in which the fileId is same for different pageIds ? - # From the examples I have seen inside 'assets' that is not the case - # and the mets_fptrs list will always contain a single element. - # If that's the case then we do not need to iterate 2 loops, just one. - - mets_fptrs = [] - if self._cache_flag: - for page_id in self._fptr_cache.keys(): - if fileId in self._fptr_cache[page_id].keys(): - mets_fptrs.append(self._fptr_cache[page_id][fileId]) - else: - mets_fptrs = self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId, - namespaces=NS) + mets_fptrs = self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId, + namespaces=NS) ret = [] for mets_fptr in mets_fptrs: mets_div = mets_fptr.getparent() ret.append(mets_div.get('ID')) - if self._cache_flag: - del self._fptr_cache[mets_div.get('ID')][mets_fptr.get('FILEID')] mets_div.remove(mets_fptr) return ret - def merge(self, other_mets, fileGrp_mapping=None, after_add_cb=None, **kwargs): + def merge(self, other_mets, fileGrp_mapping=None, fileId_mapping=None, pageId_mapping=None, after_add_cb=None, **kwargs): """ Add all files from other_mets. 
@@ -798,20 +493,24 @@ def merge(self, other_mets, fileGrp_mapping=None, after_add_cb=None, **kwargs): Keyword Args: fileGrp_mapping (dict): Map :py:attr:`other_mets` fileGrp to fileGrp in this METS + fileId_mapping (dict): Map :py:attr:`other_mets` file ID to file ID in this METS + pageId_mapping (dict): Map :py:attr:`other_mets` page ID to page ID in this METS after_add_cb (function): Callback received after file is added to the METS """ if not fileGrp_mapping: fileGrp_mapping = {} + if not fileId_mapping: + fileId_mapping = {} + if not pageId_mapping: + pageId_mapping = {} for f_src in other_mets.find_files(**kwargs): f_dest = self.add_file( fileGrp_mapping.get(f_src.fileGrp, f_src.fileGrp), mimetype=f_src.mimetype, url=f_src.url, - ID=f_src.ID, - pageId=f_src.pageId) - - print(f"Merge: Type of f_dest pageId is: {type(f_dest.pageId)}") - print(f"Merge: Type of f_src pageId is:{type(f_src.pageId)}") - + ID=fileId_mapping.get(f_src.ID, f_src.ID), + pageId=pageId_mapping.get(f_src.pageId, f_src.pageId)) + # FIXME: merge metsHdr, amdSec, dmdSec as well + # FIXME: merge structMap logical and structLink as well if after_add_cb: after_add_cb(f_dest) From 9ff4d268354125021f7584dde3f8bb29ed46a19b Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 12 Oct 2022 18:14:30 +0200 Subject: [PATCH 14/44] Cache functionality after master merge --- ocrd_models/ocrd_models/ocrd_mets.py | 373 ++++++++++++++++++++++++--- 1 file changed, 339 insertions(+), 34 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 0e4a3e2dda..c21128f09a 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -4,6 +4,7 @@ from datetime import datetime import re from lxml import etree as ET +from copy import deepcopy from ocrd_utils import ( is_local_filename, @@ -41,7 +42,7 @@ class OcrdMets(OcrdXmlDocument): """ @staticmethod - def empty_mets(now=None): + def empty_mets(now=None, cache_flag=False): """ Create an 
empty METS file from bundled template. """ @@ -50,7 +51,7 @@ def empty_mets(now=None): tpl = METS_XML_EMPTY.decode('utf-8') tpl = tpl.replace('{{ VERSION }}', VERSION) tpl = tpl.replace('{{ NOW }}', '%s' % now) - return OcrdMets(content=tpl.encode('utf-8')) + return OcrdMets(content=tpl.encode('utf-8'), cache_flag=cache_flag) def __init__(self, **kwargs): """ @@ -58,12 +59,129 @@ def __init__(self, **kwargs): """ super(OcrdMets, self).__init__(**kwargs) + # If cache is enabled + if self._cache_flag: + + # Cache for the files (mets:file) - two nested dictionaries + # The outer dictionary's Key: 'fileGrp.USE' + # The outer dictionary's Value: Inner dictionary + # The inner dictionary's Key: 'file.ID' + # The inner dictionary's Value: a 'file' object at some memory location + self._file_cache = {} + + # Cache for the pages (mets:div) + # The dictionary's Key: 'div.ID' + # The dictionary's Value: a 'div' object at some memory location + self._page_cache = {} + + # Cache for the file pointers (mets:fptr) - two nested dictionaries + # The outer dictionary's Key: 'div.ID' + # The outer dictionary's Value: Inner dictionary + # The inner dictionary's Key: 'fptr.FILEID' + # The inner dictionary's Value: a 'fptr' object at some memory location + self._fptr_cache = {} + + # Note, if the empty_mets() function is used to instantiate OcrdMets + # Then the cache is empty even after this operation + self._fill_caches() + + def __exit__(self): + """ + + """ + if self._cache_flag: + self._clear_caches() + def __str__(self): """ String representation """ return 'OcrdMets[fileGrps=%s,files=%s]' % (self.file_groups, list(self.find_files())) + def _fill_caches(self): + """ + Fills the caches with fileGrps and FileIDs + """ + + tree_root = self._tree.getroot() + + # Fill with files + el_fileGrp_list = tree_root.find(".//mets:fileSec", NS) + if el_fileGrp_list is None or len(el_fileGrp_list) == 0: + return + else: + log = getLogger('ocrd_models.ocrd_mets._fill_caches-files') + + for 
el_fileGrp in el_fileGrp_list: + fileGrp_use = el_fileGrp.get('USE') + + # Note: SBB0000F29300010000/data/mets.xml contains None + # values due to the comments inside the file + if fileGrp_use is None: + continue + + # Assign an empty dictionary that will hold the files of the added fileGrp + self._file_cache[fileGrp_use] = {} + + for el_file in el_fileGrp: + file_id = el_file.get('ID') + self._file_cache[fileGrp_use].update({file_id : el_file}) + # log.info("File added to the cache: %s" % file_id) + + # Fill with pages + el_div_list = tree_root.findall(".//mets:div", NS) + if el_div_list is None or len(el_div_list) == 0: + return + else: + log = getLogger('ocrd_models.ocrd_mets._fill_caches-pages') + + for el_div in el_div_list: + div_id = el_div.get('ID') + print("DIV_ID: %s" % el_div.get('ID')) + + # May not be needed if there are no comments inside the mets file + if div_id is None: + continue + + self._page_cache[div_id] = el_div + + # Assign an empty dictionary that will hold the fptr of the added page (div) + self._fptr_cache[div_id] = {} + + # log.info("Page_id added to the cache: %s" % div_id) + + for el_fptr in el_div: + self._fptr_cache[div_id].update({el_fptr.get('FILEID') : el_fptr}) + # log.info("Fptr added to the cache: %s" % el_fptr.get('FILEID')) + + # log.info("Len of page_cache: %s" % len(self._page_cache)) + # log.info("Len of fptr_cache: %s" % len(self._fptr_cache)) + + def _clear_caches(self): + """ + Deallocates the caches + """ + + self._file_cache = None + self._page_cache = None + self._fptr_cache = None + + """ + log = getLogger('ocrd_models.ocrd_mets._clear_caches') + + fileGrp_counter = 0 + fileId_counter = 0 + + for key in list(self._file_cache): + for inner_key in list(self._file_cache[key]): + del self._file_cache[key][inner_key] + fileId_counter += 1 + del self._file_cache[key] + fileGrp_counter += 1 + + log.info("_clear_caches> fileGrp: %d, fileId: %d" % (fileGrp_counter, fileId_counter)) + """ + @property def 
unique_identifier(self): """ @@ -119,6 +237,11 @@ def file_groups(self): """ List the `@USE` of all `mets:fileGrp` entries. """ + + # WARNING: Actually we cannot return strings in place of elements! + #if self._cache_flag: + # return self._file_cache.keys() + return [el.get('USE') for el in self._tree.getroot().findall('.//mets:fileGrp', NS)] def find_all_files(self, *args, **kwargs): @@ -127,6 +250,7 @@ def find_all_files(self, *args, **kwargs): Equivalent to ``list(self.find_files(...))`` """ + return list(self.find_files(*args, **kwargs)) # pylint: disable=multiple-statements @@ -169,12 +293,24 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None if '..' in pageId_: pageIds_expanded += generate_range(*pageId_.split('..', 1)) pageIds += pageIds_expanded - for page in self._tree.getroot().xpath( - '//mets:div[@TYPE="page"]', namespaces=NS): - if (page.get('ID') in pageIds if isinstance(pageIds, list) else - pageIds.fullmatch(page.get('ID'))): - pageId.extend( - [fptr.get('FILEID') for fptr in page.findall('mets:fptr', NS)]) + + # Note: This code could be further simplified + # Once we get rid of the 'self._cache_flag' checks + # and using the cache becomes the default + if self._cache_flag: + for page in self._page_cache.keys(): + if (page in pageIds if isinstance(pageIds, list) else + pageIds.fullmatch(page)): + pageId.extend(self._fptr_cache[page]) + else: + # Note: this inline written code is horrible to understand and debug... 
+ for page in self._tree.getroot().xpath( + '//mets:div[@TYPE="page"]', namespaces=NS): + if (page.get('ID') in pageIds if isinstance(pageIds, list) else + pageIds.fullmatch(page.get('ID'))): + pageId.extend( + [fptr.get('FILEID') for fptr in page.findall('mets:fptr', NS)]) + if ID and ID.startswith(REGEX_PREFIX): ID = re.compile(ID[REGEX_PREFIX_LEN:]) if fileGrp and fileGrp.startswith(REGEX_PREFIX): @@ -183,7 +319,21 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None mimetype = re.compile(mimetype[REGEX_PREFIX_LEN:]) if url and url.startswith(REGEX_PREFIX): url = re.compile(url[REGEX_PREFIX_LEN:]) - for cand in self._tree.getroot().xpath('//mets:file', namespaces=NS): + + candidates = [] + + if self._cache_flag: + if fileGrp: + if isinstance(fileGrp, str): + candidates += self._file_cache.get(fileGrp, {}).values() + else: + candidates = [x for fileGrp_needle, el_file_list in self._file_cache.items() if fileGrp.match(fileGrp_needle) for x in el_file_list.values()] + else: + candidates = [el_file for id_to_file in self._file_cache.values() for el_file in id_to_file.values()] + else: + candidates = self._tree.getroot().xpath('//mets:file', namespaces=NS) + + for cand in candidates: if ID: if isinstance(ID, str): if not ID == cand.get('ID'): continue @@ -193,7 +343,7 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None if pageId is not None and cand.get('ID') not in pageId: continue - if fileGrp: + if not self._cache_flag and fileGrp: if isinstance(fileGrp, str): if cand.getparent().get('USE') != fileGrp: continue else: @@ -215,6 +365,8 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None else: if not url.fullmatch(cand_url): continue + # Note: why we instantiate a class only to find out that the local_only is set afterwards + # Checking local_only and url before instantiation should be better? 
f = OcrdFile(cand, mets=self) # If only local resources should be returned and f is not a file path: skip the file @@ -238,6 +390,11 @@ def add_file_group(self, fileGrp): if el_fileGrp is None: el_fileGrp = ET.SubElement(el_fileSec, TAG_METS_FILEGRP) el_fileGrp.set('USE', fileGrp) + + if self._cache_flag: + # Assign an empty dictionary that will hold the files of the added fileGrp + self._file_cache[fileGrp] = {} + return el_fileGrp def rename_file_group(self, old, new): @@ -249,6 +406,9 @@ def rename_file_group(self, old, new): raise FileNotFoundError("No such fileGrp '%s'" % old) el_fileGrp.set('USE', new) + if self._cache_flag: + self._file_cache[new] = self._file_cache.pop(old) + def remove_file_group(self, USE, recursive=False, force=False): """ Remove a ``mets:fileGrp`` (single fixed ``@USE`` or multiple regex ``@USE``) @@ -284,7 +444,16 @@ def remove_file_group(self, USE, recursive=False, force=False): if not recursive: raise Exception("fileGrp %s is not empty and recursive wasn't set" % USE) for f in files: + # NOTE: Here we know the fileGrp, we should pass it as a parameter self.remove_one_file(f.get('ID')) + + if self._cache_flag: + # Note: Since the files inside the group are removed + # with the 'remove_one_file' method above, + # we should not take care of that again. + # We just remove the fileGrp. 
+ del self._file_cache[el_fileGrp.get('USE')] + el_fileGrp.getparent().remove(el_fileGrp) def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force=False, local_filename=None, ignore=False, **kwargs): @@ -310,21 +479,45 @@ def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force raise ValueError("Invalid syntax for mets:file/@ID %s (not an xs:ID)" % ID) if not REGEX_FILE_ID.fullmatch(fileGrp): raise ValueError("Invalid syntax for mets:fileGrp/@USE %s (not an xs:ID)" % fileGrp) + + """ + # Note: we do not benefit enough from having + # a separate cache for fileGrp elements + + if self._cache_flag: + if fileGrp in self._fileGrp_cache: + el_fileGrp = self._fileGrp_cache[fileGrp] + """ + el_fileGrp = self._tree.getroot().find(".//mets:fileGrp[@USE='%s']" % (fileGrp), NS) + if el_fileGrp is None: el_fileGrp = self.add_file_group(fileGrp) - mets_file = next(self.find_files(ID=ID), None) + + # Since we are sure that fileGrp parameter is set, + # we could send that parameter to find_files for direct search + mets_file = next(self.find_files(ID=ID, fileGrp=fileGrp), None) if mets_file and not ignore: if not force: raise Exception("File with ID='%s' already exists" % ID) mets_file.url = url mets_file.mimetype = mimetype mets_file.ID = ID + # The line below uses the pageId setter which + # caches the required data inside + # self._page_cache and self._fptr_cache mets_file.pageId = pageId mets_file.local_filename = local_filename else: - kwargs = {k: v for k, v in locals().items() if k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v} - mets_file = OcrdFile(ET.SubElement(el_fileGrp, TAG_METS_FILE), mets=self, **kwargs) + # To get rid of Python's FutureWarning + kwargs = {k: v for k, v in locals().items() if k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v is not None} + el_mets_file = ET.SubElement(el_fileGrp, TAG_METS_FILE) + # The caching of the physical page is done in the OcrdFile constructor + 
mets_file = OcrdFile(el_mets_file, mets=self, **kwargs) + + if self._cache_flag: + # Add the file to the file cache + self._file_cache[fileGrp].update({ID: el_mets_file}) return mets_file @@ -362,20 +555,48 @@ def remove_one_file(self, ID): ocrd_file = ID ID = ocrd_file.ID else: + # NOTE: We should pass the fileGrp, if known, as a parameter here as well + # Leaving that out for now ocrd_file = next(self.find_files(ID=ID), None) if not ocrd_file: raise FileNotFoundError("File not found: %s" % ID) # Delete the physical page ref - for fptr in self._tree.getroot().findall('.//mets:fptr[@FILEID="%s"]' % ID, namespaces=NS): + fptrs = [] + if self._cache_flag: + for page in self._fptr_cache.keys(): + if ID in self._fptr_cache[page]: + fptrs.append(self._fptr_cache[page][ID]) + else: + fptrs = self._tree.getroot().findall('.//mets:fptr[@FILEID="%s"]' % ID, namespaces=NS) + + for fptr in fptrs: log.info("Delete fptr element %s for page '%s'", fptr, ID) page_div = fptr.getparent() page_div.remove(fptr) + # Remove the fptr from the cache as well + if self._cache_flag: + del self._fptr_cache[page_div.get('ID')][ID] + # delete empty pages if not page_div.getchildren(): log.info("Delete empty page %s", page_div) page_div.getparent().remove(page_div) + # Delete the empty pages from caches as well + if self._cache_flag: + del self._page_cache[page_div.get('ID')] + del self._fptr_cache[page_div.get('ID')] + + # Delete the file reference from the cache + if self._cache_flag: + parent_use = ocrd_file._el.getparent().get('USE') + # Note: if the file is in the XML tree, + # it must also be in the file cache. 
+ # Anyway, we perform the checks, then remove + if parent_use in self._file_cache: + if ocrd_file.ID in self._file_cache[parent_use]: + del self._file_cache[parent_use][ocrd_file.ID] # Delete the file reference # pylint: disable=protected-access @@ -388,6 +609,9 @@ def physical_pages(self): """ List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``) """ + if self._cache_flag: + return self._page_cache.values() + return self._tree.getroot().xpath( 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/@ID', namespaces=NS) @@ -399,13 +623,24 @@ def get_physical_pages(self, for_fileIds=None): """ if for_fileIds is None: return self.physical_pages + ret = [None] * len(for_fileIds) - for page in self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', - namespaces=NS): - for fptr in page.findall('mets:fptr', NS): - if fptr.get('FILEID') in for_fileIds: - ret[for_fileIds.index(fptr.get('FILEID'))] = page.get('ID') + + # Note: This entire function potentially could be further simplified + # TODO: Simplify + if self._cache_flag: + for pageId in self._fptr_cache.keys(): + for fptr in self._fptr_cache[pageId].keys(): + if fptr in for_fileIds: + ret[for_fileIds.index(fptr)] = pageId + else: + for page in self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS): + for fptr in page.findall('mets:fptr', NS): + if fptr.get('FILEID') in for_fileIds: + ret[for_fileIds.index(fptr.get('FILEID'))] = page.get('ID') + return ret def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=None): @@ -422,9 +657,27 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N """ # print(pageId, ocrd_file) # delete any page mapping for this file.ID - for el_fptr in self._tree.getroot().findall( + + # NOTE: The pageId coming from 
'test_merge(sbb_sample_01)' is an Element not a string + # if not isinstance(pageId, str): + # pageId = pageId.get('ID') + + candidates = [] + + if self._cache_flag: + for page_id in self._fptr_cache.keys(): + if ocrd_file.ID in self._fptr_cache[page_id].keys(): + if self._fptr_cache[page_id][ocrd_file.ID] is not None: + candidates.append(self._fptr_cache[page_id][ocrd_file.ID]) + else: + candidates = self._tree.getroot().findall( 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % - ocrd_file.ID, namespaces=NS): + ocrd_file.ID, namespaces=NS) + + for el_fptr in candidates: + if self._cache_flag: + del self._fptr_cache[el_fptr.getparent().get('ID')][ocrd_file.ID] + el_fptr.getparent().remove(el_fptr) # find/construct as necessary @@ -436,7 +689,15 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N if el_seqdiv is None: el_seqdiv = ET.SubElement(el_structmap, TAG_METS_DIV) el_seqdiv.set('TYPE', 'physSequence') - el_pagediv = el_seqdiv.find('mets:div[@ID="%s"]' % pageId, NS) + + el_pagediv = None + + if self._cache_flag: + if pageId in self._page_cache.keys(): + el_pagediv = self._page_cache[pageId] + else: + el_pagediv = el_seqdiv.find('mets:div[@ID="%s"]' % pageId, NS) + if el_pagediv is None: el_pagediv = ET.SubElement(el_seqdiv, TAG_METS_DIV) el_pagediv.set('TYPE', 'page') @@ -445,29 +706,58 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N el_pagediv.set('ORDER', order) if orderlabel: el_pagediv.set('ORDERLABEL', orderlabel) + + if self._cache_flag: + # Create a new entry in the page cache + self._page_cache[pageId] = el_pagediv + # Create a new entry in the fptr cache and + # assign an empty dictionary to hold the fileids + self._fptr_cache[pageId] = {} + el_fptr = ET.SubElement(el_pagediv, TAG_METS_FPTR) el_fptr.set('FILEID', ocrd_file.ID) + if self._cache_flag: + # Assign the ocrd fileID to the pageId in the cache + 
self._fptr_cache[el_pagediv.get('ID')].update({ocrd_file.ID : el_fptr}) + def get_physical_page_for_file(self, ocrd_file): """ Get the physical page ID (``@ID`` of the physical ``mets:structMap`` ``mets:div`` entry) corresponding to the ``mets:file`` :py:attr:`ocrd_file`. """ - ret = self._tree.getroot().xpath( - '/mets:mets/mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][./mets:fptr[@FILEID="%s"]]/@ID' % - ocrd_file.ID, namespaces=NS) - if ret: + ret = [] + + if self._cache_flag: + for pageId in self._fptr_cache.keys(): + if ocrd_file.ID in self._fptr_cache[pageId].keys(): + ret.append(self._page_cache[pageId].get('ID')) + else: + ret = self._tree.getroot().xpath( + '/mets:mets/mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][./mets:fptr[@FILEID="%s"]]/@ID' % + ocrd_file.ID, namespaces=NS) + + # To get rid of the python's FutureWarning + if len(ret): return ret[0] def remove_physical_page(self, ID): """ Delete page (physical ``mets:structMap`` ``mets:div`` entry ``@ID``) :py:attr:`ID`. 
""" - mets_div = self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][@ID="%s"]' % ID, - namespaces=NS) - if mets_div: + mets_div = None + if self._cache_flag: + if ID in self._page_cache.keys(): + mets_div = [self._page_cache[ID]] + else: + mets_div = self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][@ID="%s"]' % ID, + namespaces=NS) + if mets_div is not None: mets_div[0].getparent().remove(mets_div[0]) + if self._cache_flag: + del self._page_cache[ID] + del self._fptr_cache[ID] def remove_physical_page_fptr(self, fileId): """ @@ -475,13 +765,28 @@ def remove_physical_page_fptr(self, fileId): Returns: List of pageIds that mets:fptrs were deleted from """ - mets_fptrs = self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId, - namespaces=NS) + + # Question: What is the reason to keep a list of mets_fptrs? + # Do we have a situation in which the fileId is same for different pageIds ? + # From the examples I have seen inside 'assets' that is not the case + # and the mets_fptrs list will always contain a single element. + # If that's the case then we do not need to iterate 2 loops, just one. 
+ + mets_fptrs = [] + if self._cache_flag: + for page_id in self._fptr_cache.keys(): + if fileId in self._fptr_cache[page_id].keys(): + mets_fptrs.append(self._fptr_cache[page_id][fileId]) + else: + mets_fptrs = self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId, + namespaces=NS) ret = [] for mets_fptr in mets_fptrs: mets_div = mets_fptr.getparent() ret.append(mets_div.get('ID')) + if self._cache_flag: + del self._fptr_cache[mets_div.get('ID')][mets_fptr.get('FILEID')] mets_div.remove(mets_fptr) return ret From 2326a9430360eb608920ad2a91565faaa59d6784 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 12 Oct 2022 18:59:30 +0200 Subject: [PATCH 15/44] Test for 500 pages and 1500 files per page --- tests/model/mets_bench_extreme_additional.py | 42 ++++++++++---------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/tests/model/mets_bench_extreme_additional.py b/tests/model/mets_bench_extreme_additional.py index 31332cada9..e699454e2b 100644 --- a/tests/model/mets_bench_extreme_additional.py +++ b/tests/model/mets_bench_extreme_additional.py @@ -14,9 +14,9 @@ GRPS_REG = ['SEG-REG', 'SEG-REPAIR', 'SEG-REG-DESKEW', 'SEG-REG-DESKEW-CLIP', 'SEG-LINE', 'SEG-REPAIR-LINE', 'SEG-LINE-RESEG-DEWARP'] GRPS_IMG = ['FULL', 'PRESENTATION', 'BIN', 'CROP', 'BIN2', 'BIN-DENOISE', 'BIN-DENOISE-DESKEW', 'OCR'] -# 750 files per page -REGIONS_PER_PAGE = 50 -LINES_PER_REGION = 50 +# 1500 files per page +REGIONS_PER_PAGE = 100 +LINES_PER_REGION = 100 FILES_PER_PAGE = len(GRPS_IMG) * LINES_PER_REGION + len(GRPS_REG) * REGIONS_PER_PAGE # Caching is disabled by default @@ -33,11 +33,9 @@ def _build_mets(number_of_pages, force=False, cache_flag=False): url='%s/%s%s' % (fileGrp, ID if ID else '%s_%s_%s' % (fileGrp, n, MIME_TO_EXT.get(mimetype)[1:].upper()), MIME_TO_EXT.get(mimetype)) ) for grp in GRPS_IMG: - # LINES_PER_REGION = 2 _add_file(n, grp, 'image/tiff') _add_file(n, grp, 
'application/vnd.prima.page+xml') for grp in GRPS_REG: - # REGIONS_PER_PAGE = 2 for region_n in range(REGIONS_PER_PAGE): _add_file(n, grp, 'image/png', '%s_%s_region%s' % (grp, n, region_n)) @@ -78,38 +76,38 @@ def benchmark_find_files_physical_page(number_of_pages, mets): def benchmark_find_files_all(number_of_pages, mets): assert_len((number_of_pages * FILES_PER_PAGE), mets, dict()) -# ----- 5000 pages -> build, search, build (cached), search (cached) ----- # -mets_5000 = None +# ----- 500 pages -> build, search, build (cached), search (cached) ----- # +mets_500 = None @mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) -def test_b5000(benchmark): +def test_b500(benchmark): @benchmark def result(): - global mets_5000 - mets_5000 = _build_mets(5000, force=True) + global mets_500 + mets_500 = _build_mets(500, force=True) @mark.benchmark(group="search", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) -def test_s5000(benchmark): +def test_s500(benchmark): @benchmark def ret(): - global mets_5000 - benchmark_find_files(5000, mets_5000) -del mets_5000 + global mets_500 + benchmark_find_files(500, mets_500) +del mets_500 -mets_c_5000 = None +mets_c_500 = None @mark.benchmark(group="build_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) -def test_b5000_c(benchmark): +def test_b500_c(benchmark): @benchmark def result(): - global mets_c_5000 - mets_c_5000 = _build_mets(5000, force=True, cache_flag=True) + global mets_c_500 + mets_c_500 = _build_mets(500, force=True, cache_flag=True) @mark.benchmark(group="search_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) -def test_s5000_c(benchmark): +def test_s500_c(benchmark): @benchmark def ret(): - global mets_c_5000 - benchmark_find_files(5000, mets_c_5000) -del mets_c_5000 + global mets_c_500 + benchmark_find_files(500, mets_c_500) +del mets_c_500 # ------------------------------------------------------------------------ # From 
90212ea064b9e218b294ead4a1e894eef9ca7448 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 12 Oct 2022 19:35:22 +0200 Subject: [PATCH 16/44] Fix the test case --- tests/model/test_ocrd_mets_cache.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/model/test_ocrd_mets_cache.py b/tests/model/test_ocrd_mets_cache.py index 691f422bc7..7015ee0615 100644 --- a/tests/model/test_ocrd_mets_cache.py +++ b/tests/model/test_ocrd_mets_cache.py @@ -79,7 +79,8 @@ def test_find_all_files(sbb_sample_01): def test_find_all_files_local_only(sbb_sample_01): assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001', - local_only=True)) == 3, '3 local files for page "PHYS_0001"' + local_only=True)) == 14, '14 local files for page "PHYS_0001"' + # 3 non-local files for page "PHYS_0001" def test_physical_pages(sbb_sample_01): From 50e0f9548d0893430303456f5dd6c5c04c943d95 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 24 Oct 2022 14:03:39 +0200 Subject: [PATCH 17/44] Remove the conflicting file --- ocrd_models/ocrd_models/ocrd_mets.py | 821 --------------------------- 1 file changed, 821 deletions(-) delete mode 100644 ocrd_models/ocrd_models/ocrd_mets.py diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py deleted file mode 100644 index c21128f09a..0000000000 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ /dev/null @@ -1,821 +0,0 @@ -""" -API to METS -""" -from datetime import datetime -import re -from lxml import etree as ET -from copy import deepcopy - -from ocrd_utils import ( - is_local_filename, - getLogger, - generate_range, - VERSION, - REGEX_PREFIX, - REGEX_FILE_ID -) - -from .constants import ( - NAMESPACES as NS, - TAG_METS_AGENT, - TAG_METS_DIV, - TAG_METS_FILE, - TAG_METS_FILEGRP, - TAG_METS_FILESEC, - TAG_METS_FPTR, - TAG_METS_METSHDR, - TAG_METS_STRUCTMAP, - IDENTIFIER_PRIORITY, - TAG_MODS_IDENTIFIER, - METS_XML_EMPTY, -) - -from .ocrd_xml_base import OcrdXmlDocument, ET -from .ocrd_file import OcrdFile 
-from .ocrd_agent import OcrdAgent - -REGEX_PREFIX_LEN = len(REGEX_PREFIX) - -class OcrdMets(OcrdXmlDocument): - """ - API to a single METS file - """ - - @staticmethod - def empty_mets(now=None, cache_flag=False): - """ - Create an empty METS file from bundled template. - """ - if not now: - now = datetime.now().isoformat() - tpl = METS_XML_EMPTY.decode('utf-8') - tpl = tpl.replace('{{ VERSION }}', VERSION) - tpl = tpl.replace('{{ NOW }}', '%s' % now) - return OcrdMets(content=tpl.encode('utf-8'), cache_flag=cache_flag) - - def __init__(self, **kwargs): - """ - - """ - super(OcrdMets, self).__init__(**kwargs) - - # If cache is enabled - if self._cache_flag: - - # Cache for the files (mets:file) - two nested dictionaries - # The outer dictionary's Key: 'fileGrp.USE' - # The outer dictionary's Value: Inner dictionary - # The inner dictionary's Key: 'file.ID' - # The inner dictionary's Value: a 'file' object at some memory location - self._file_cache = {} - - # Cache for the pages (mets:div) - # The dictionary's Key: 'div.ID' - # The dictionary's Value: a 'div' object at some memory location - self._page_cache = {} - - # Cache for the file pointers (mets:fptr) - two nested dictionaries - # The outer dictionary's Key: 'div.ID' - # The outer dictionary's Value: Inner dictionary - # The inner dictionary's Key: 'fptr.FILEID' - # The inner dictionary's Value: a 'fptr' object at some memory location - self._fptr_cache = {} - - # Note, if the empty_mets() function is used to instantiate OcrdMets - # Then the cache is empty even after this operation - self._fill_caches() - - def __exit__(self): - """ - - """ - if self._cache_flag: - self._clear_caches() - - def __str__(self): - """ - String representation - """ - return 'OcrdMets[fileGrps=%s,files=%s]' % (self.file_groups, list(self.find_files())) - - def _fill_caches(self): - """ - Fills the caches with fileGrps and FileIDs - """ - - tree_root = self._tree.getroot() - - # Fill with files - el_fileGrp_list = 
tree_root.find(".//mets:fileSec", NS) - if el_fileGrp_list is None or len(el_fileGrp_list) == 0: - return - else: - log = getLogger('ocrd_models.ocrd_mets._fill_caches-files') - - for el_fileGrp in el_fileGrp_list: - fileGrp_use = el_fileGrp.get('USE') - - # Note: SBB0000F29300010000/data/mets.xml contains None - # values due to the comments inside the file - if fileGrp_use is None: - continue - - # Assign an empty dictionary that will hold the files of the added fileGrp - self._file_cache[fileGrp_use] = {} - - for el_file in el_fileGrp: - file_id = el_file.get('ID') - self._file_cache[fileGrp_use].update({file_id : el_file}) - # log.info("File added to the cache: %s" % file_id) - - # Fill with pages - el_div_list = tree_root.findall(".//mets:div", NS) - if el_div_list is None or len(el_div_list) == 0: - return - else: - log = getLogger('ocrd_models.ocrd_mets._fill_caches-pages') - - for el_div in el_div_list: - div_id = el_div.get('ID') - print("DIV_ID: %s" % el_div.get('ID')) - - # May not be needed if there are no comments inside the mets file - if div_id is None: - continue - - self._page_cache[div_id] = el_div - - # Assign an empty dictionary that will hold the fptr of the added page (div) - self._fptr_cache[div_id] = {} - - # log.info("Page_id added to the cache: %s" % div_id) - - for el_fptr in el_div: - self._fptr_cache[div_id].update({el_fptr.get('FILEID') : el_fptr}) - # log.info("Fptr added to the cache: %s" % el_fptr.get('FILEID')) - - # log.info("Len of page_cache: %s" % len(self._page_cache)) - # log.info("Len of fptr_cache: %s" % len(self._fptr_cache)) - - def _clear_caches(self): - """ - Deallocates the caches - """ - - self._file_cache = None - self._page_cache = None - self._fptr_cache = None - - """ - log = getLogger('ocrd_models.ocrd_mets._clear_caches') - - fileGrp_counter = 0 - fileId_counter = 0 - - for key in list(self._file_cache): - for inner_key in list(self._file_cache[key]): - del self._file_cache[key][inner_key] - fileId_counter += 1 - 
del self._file_cache[key] - fileGrp_counter += 1 - - log.info("_clear_caches> fileGrp: %d, fileId: %d" % (fileGrp_counter, fileId_counter)) - """ - - @property - def unique_identifier(self): - """ - Get the unique identifier by looking through ``mods:identifier`` - - See `specs `_ for details. - """ - for t in IDENTIFIER_PRIORITY: - found = self._tree.getroot().find('.//mods:identifier[@type="%s"]' % t, NS) - if found is not None: - return found.text - - @unique_identifier.setter - def unique_identifier(self, purl): - """ - Set the unique identifier by looking through ``mods:identifier`` - - See `specs `_ for details. - """ - id_el = None - for t in IDENTIFIER_PRIORITY: - id_el = self._tree.getroot().find('.//mods:identifier[@type="%s"]' % t, NS) - if id_el is not None: - break - if id_el is None: - mods = self._tree.getroot().find('.//mods:mods', NS) - id_el = ET.SubElement(mods, TAG_MODS_IDENTIFIER) - id_el.set('type', 'purl') - id_el.text = purl - - @property - def agents(self): - """ - List all :py:class:`ocrd_models.ocrd_agent.OcrdAgent`s - """ - return [OcrdAgent(el_agent) for el_agent in self._tree.getroot().findall('mets:metsHdr/mets:agent', NS)] - - def add_agent(self, *args, **kwargs): - """ - Add an :py:class:`ocrd_models.ocrd_agent.OcrdAgent` to the list of agents in the ``metsHdr``. - """ - el_metsHdr = self._tree.getroot().find('.//mets:metsHdr', NS) - if el_metsHdr is None: - el_metsHdr = ET.Element(TAG_METS_METSHDR) - self._tree.getroot().insert(0, el_metsHdr) - # assert(el_metsHdr is not None) - el_agent = ET.SubElement(el_metsHdr, TAG_METS_AGENT) - # print(ET.tostring(el_metsHdr)) - return OcrdAgent(el_agent, *args, **kwargs) - - @property - def file_groups(self): - """ - List the `@USE` of all `mets:fileGrp` entries. - """ - - # WARNING: Actually we cannot return strings in place of elements! 
- #if self._cache_flag: - # return self._file_cache.keys() - - return [el.get('USE') for el in self._tree.getroot().findall('.//mets:fileGrp', NS)] - - def find_all_files(self, *args, **kwargs): - """ - Like :py:meth:`find_files` but return a list of all results. - - Equivalent to ``list(self.find_files(...))`` - """ - - return list(self.find_files(*args, **kwargs)) - - # pylint: disable=multiple-statements - def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None, local_only=False): - """ - Search ``mets:file`` entries in this METS document and yield results. - - - The :py:attr:`ID`, :py:attr:`pageId`, :py:attr:`fileGrp`, - :py:attr:`url` and :py:attr:`mimetype` parameters can each be either a - literal string, or a regular expression if the string starts with - ``//`` (double slash). - - If it is a regex, the leading ``//`` is removed and candidates are matched - against the regex with `re.fullmatch`. If it is a literal string, comparison - is done with string equality. - - The :py:attr:`pageId` parameter supports the numeric range operator ``..``. For - example, to find all files in pages ``PHYS_0001`` to ``PHYS_0003``, - ``PHYS_0001..PHYS_0003`` will be expanded to ``PHYS_0001,PHYS_0002,PHYS_0003``. 
- - Keyword Args: - ID (string) : ``@ID`` of the ``mets:file`` - fileGrp (string) : ``@USE`` of the ``mets:fileGrp`` to list files of - pageId (string) : ``@ID`` of the corresponding physical ``mets:structMap`` entry (physical page) - url (string) : ``@xlink:href`` (URL or path) of ``mets:Flocat`` of ``mets:file`` - mimetype (string) : ``@MIMETYPE`` of ``mets:file`` - local (boolean) : Whether to restrict results to local files in the filesystem - - Yields: - :py:class:`ocrd_models:ocrd_file:OcrdFile` instantiations - """ - if pageId: - if pageId.startswith(REGEX_PREFIX): - pageIds, pageId = re.compile(pageId[REGEX_PREFIX_LEN:]), list() - else: - pageIds, pageId = pageId.split(','), list() - pageIds_expanded = [] - for pageId_ in pageIds: - if '..' in pageId_: - pageIds_expanded += generate_range(*pageId_.split('..', 1)) - pageIds += pageIds_expanded - - # Note: This code could be further simplified - # Once we get rid of the 'self._cache_flag' checks - # and using the cache becomes the default - if self._cache_flag: - for page in self._page_cache.keys(): - if (page in pageIds if isinstance(pageIds, list) else - pageIds.fullmatch(page)): - pageId.extend(self._fptr_cache[page]) - else: - # Note: this inline written code is horrible to understand and debug... 
- for page in self._tree.getroot().xpath( - '//mets:div[@TYPE="page"]', namespaces=NS): - if (page.get('ID') in pageIds if isinstance(pageIds, list) else - pageIds.fullmatch(page.get('ID'))): - pageId.extend( - [fptr.get('FILEID') for fptr in page.findall('mets:fptr', NS)]) - - if ID and ID.startswith(REGEX_PREFIX): - ID = re.compile(ID[REGEX_PREFIX_LEN:]) - if fileGrp and fileGrp.startswith(REGEX_PREFIX): - fileGrp = re.compile(fileGrp[REGEX_PREFIX_LEN:]) - if mimetype and mimetype.startswith(REGEX_PREFIX): - mimetype = re.compile(mimetype[REGEX_PREFIX_LEN:]) - if url and url.startswith(REGEX_PREFIX): - url = re.compile(url[REGEX_PREFIX_LEN:]) - - candidates = [] - - if self._cache_flag: - if fileGrp: - if isinstance(fileGrp, str): - candidates += self._file_cache.get(fileGrp, {}).values() - else: - candidates = [x for fileGrp_needle, el_file_list in self._file_cache.items() if fileGrp.match(fileGrp_needle) for x in el_file_list.values()] - else: - candidates = [el_file for id_to_file in self._file_cache.values() for el_file in id_to_file.values()] - else: - candidates = self._tree.getroot().xpath('//mets:file', namespaces=NS) - - for cand in candidates: - if ID: - if isinstance(ID, str): - if not ID == cand.get('ID'): continue - else: - if not ID.fullmatch(cand.get('ID')): continue - - if pageId is not None and cand.get('ID') not in pageId: - continue - - if not self._cache_flag and fileGrp: - if isinstance(fileGrp, str): - if cand.getparent().get('USE') != fileGrp: continue - else: - if not fileGrp.fullmatch(cand.getparent().get('USE')): continue - - if mimetype: - if isinstance(mimetype, str): - if cand.get('MIMETYPE') != mimetype: continue - else: - if not mimetype.fullmatch(cand.get('MIMETYPE') or ''): continue - - if url: - cand_locat = cand.find('mets:FLocat', namespaces=NS) - if cand_locat is None: - continue - cand_url = cand_locat.get('{%s}href' % NS['xlink']) - if isinstance(url, str): - if cand_url != url: continue - else: - if not 
url.fullmatch(cand_url): continue - - # Note: why we instantiate a class only to find out that the local_only is set afterwards - # Checking local_only and url before instantiation should be better? - f = OcrdFile(cand, mets=self) - - # If only local resources should be returned and f is not a file path: skip the file - if local_only and not is_local_filename(f.url): - continue - yield f - - def add_file_group(self, fileGrp): - """ - Add a new ``mets:fileGrp``. - - Arguments: - fileGrp (string): ``@USE`` of the new ``mets:fileGrp``. - """ - if ',' in fileGrp: - raise Exception('fileGrp must not contain commas') - el_fileSec = self._tree.getroot().find('mets:fileSec', NS) - if el_fileSec is None: - el_fileSec = ET.SubElement(self._tree.getroot(), TAG_METS_FILESEC) - el_fileGrp = el_fileSec.find('mets:fileGrp[@USE="%s"]' % fileGrp, NS) - if el_fileGrp is None: - el_fileGrp = ET.SubElement(el_fileSec, TAG_METS_FILEGRP) - el_fileGrp.set('USE', fileGrp) - - if self._cache_flag: - # Assign an empty dictionary that will hold the files of the added fileGrp - self._file_cache[fileGrp] = {} - - return el_fileGrp - - def rename_file_group(self, old, new): - """ - Rename a ``mets:fileGrp`` by changing the ``@USE`` from :py:attr:`old` to :py:attr:`new`. - """ - el_fileGrp = self._tree.getroot().find('mets:fileSec/mets:fileGrp[@USE="%s"]' % old, NS) - if el_fileGrp is None: - raise FileNotFoundError("No such fileGrp '%s'" % old) - el_fileGrp.set('USE', new) - - if self._cache_flag: - self._file_cache[new] = self._file_cache.pop(old) - - def remove_file_group(self, USE, recursive=False, force=False): - """ - Remove a ``mets:fileGrp`` (single fixed ``@USE`` or multiple regex ``@USE``) - - Arguments: - USE (string): ``@USE`` of the ``mets:fileGrp`` to delete. 
Can be a regex if prefixed with ``//`` - recursive (boolean): Whether to recursively delete each ``mets:file`` in the group - force (boolean): Do not raise an exception if ``mets:fileGrp`` does not exist - """ - log = getLogger('ocrd_models.ocrd_mets.remove_file_group') - el_fileSec = self._tree.getroot().find('mets:fileSec', NS) - if el_fileSec is None: - raise Exception("No fileSec!") - if isinstance(USE, str): - if USE.startswith(REGEX_PREFIX): - use = re.compile(USE[REGEX_PREFIX_LEN:]) - for cand in el_fileSec.findall('mets:fileGrp', NS): - if use.fullmatch(cand.get('USE')): - self.remove_file_group(cand, recursive=recursive) - return - else: - el_fileGrp = el_fileSec.find('mets:fileGrp[@USE="%s"]' % USE, NS) - else: - el_fileGrp = USE - if el_fileGrp is None: # pylint: disable=len-as-condition - msg = "No such fileGrp: %s" % USE - if force: - log.warning(msg) - return - raise Exception(msg) - files = el_fileGrp.findall('mets:file', NS) - if files: - if not recursive: - raise Exception("fileGrp %s is not empty and recursive wasn't set" % USE) - for f in files: - # NOTE: Here we know the fileGrp, we should pass it as a parameter - self.remove_one_file(f.get('ID')) - - if self._cache_flag: - # Note: Since the files inside the group are removed - # with the 'remove_one_file' method above, - # we should not take care of that again. - # We just remove the fileGrp. - del self._file_cache[el_fileGrp.get('USE')] - - el_fileGrp.getparent().remove(el_fileGrp) - - def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force=False, local_filename=None, ignore=False, **kwargs): - """ - Instantiate and add a new :py:class:`ocrd_models.ocrd_file.OcrdFile`. 
- - Arguments: - fileGrp (string): ``@USE`` of ``mets:fileGrp`` to add to - Keyword Args: - mimetype (string): ``@MIMETYPE`` of the ``mets:file`` to use - url (string): ``@xlink:href`` (URL or path) of the ``mets:file`` to use - ID (string): ``@ID`` of the ``mets:file`` to use - pageId (string): ``@ID`` in the physical ``mets:structMap`` to link to - force (boolean): Whether to add the file even if a ``mets:file`` with the same ``@ID`` already exists. - ignore (boolean): Do not look for existing files at all. Shift responsibility for preventing errors from duplicate ID to the user. - local_filename (string): - """ - if not ID: - raise ValueError("Must set ID of the mets:file") - if not fileGrp: - raise ValueError("Must set fileGrp of the mets:file") - if not REGEX_FILE_ID.fullmatch(ID): - raise ValueError("Invalid syntax for mets:file/@ID %s (not an xs:ID)" % ID) - if not REGEX_FILE_ID.fullmatch(fileGrp): - raise ValueError("Invalid syntax for mets:fileGrp/@USE %s (not an xs:ID)" % fileGrp) - - """ - # Note: we do not benefit enough from having - # a separate cache for fileGrp elements - - if self._cache_flag: - if fileGrp in self._fileGrp_cache: - el_fileGrp = self._fileGrp_cache[fileGrp] - """ - - el_fileGrp = self._tree.getroot().find(".//mets:fileGrp[@USE='%s']" % (fileGrp), NS) - - if el_fileGrp is None: - el_fileGrp = self.add_file_group(fileGrp) - - # Since we are sure that fileGrp parameter is set, - # we could send that parameter to find_files for direct search - mets_file = next(self.find_files(ID=ID, fileGrp=fileGrp), None) - if mets_file and not ignore: - if not force: - raise Exception("File with ID='%s' already exists" % ID) - mets_file.url = url - mets_file.mimetype = mimetype - mets_file.ID = ID - # The line below uses the pageId setter which - # caches the required data inside - # self._page_cache and self._fptr_cache - mets_file.pageId = pageId - mets_file.local_filename = local_filename - else: - # To get rid of Python's FutureWarning - kwargs = 
{k: v for k, v in locals().items() if k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v is not None} - el_mets_file = ET.SubElement(el_fileGrp, TAG_METS_FILE) - # The caching of the physical page is done in the OcrdFile constructor - mets_file = OcrdFile(el_mets_file, mets=self, **kwargs) - - if self._cache_flag: - # Add the file to the file cache - self._file_cache[fileGrp].update({ID: el_mets_file}) - - return mets_file - - def remove_file(self, *args, **kwargs): - """ - Delete each ``ocrd:file`` matching the query. Same arguments as :py:meth:`find_files` - """ - files = list(self.find_files(*args, **kwargs)) - if files: - for f in files: - self.remove_one_file(f) - if len(files) > 1: - return files - else: - return files[0] # for backwards-compatibility - if any(1 for kwarg in kwargs - if isinstance(kwarg, str) and kwarg.startswith(REGEX_PREFIX)): - # allow empty results if filter criteria involve a regex - return [] - raise FileNotFoundError("File not found: %s %s" % (args, kwargs)) - - def remove_one_file(self, ID): - """ - Delete an existing :py:class:`ocrd_models.ocrd_file.OcrdFile`. - - Arguments: - ID (string): ``@ID`` of the ``mets:file`` to delete - - Returns: - The old :py:class:`ocrd_models.ocrd_file.OcrdFile` reference. 
- """ - log = getLogger('ocrd_models.ocrd_mets.remove_one_file') - log.info("remove_one_file(%s)" % ID) - if isinstance(ID, OcrdFile): - ocrd_file = ID - ID = ocrd_file.ID - else: - # NOTE: We should pass the fileGrp, if known, as a parameter here as well - # Leaving that out for now - ocrd_file = next(self.find_files(ID=ID), None) - - if not ocrd_file: - raise FileNotFoundError("File not found: %s" % ID) - - # Delete the physical page ref - fptrs = [] - if self._cache_flag: - for page in self._fptr_cache.keys(): - if ID in self._fptr_cache[page]: - fptrs.append(self._fptr_cache[page][ID]) - else: - fptrs = self._tree.getroot().findall('.//mets:fptr[@FILEID="%s"]' % ID, namespaces=NS) - - for fptr in fptrs: - log.info("Delete fptr element %s for page '%s'", fptr, ID) - page_div = fptr.getparent() - page_div.remove(fptr) - # Remove the fptr from the cache as well - if self._cache_flag: - del self._fptr_cache[page_div.get('ID')][ID] - - # delete empty pages - if not page_div.getchildren(): - log.info("Delete empty page %s", page_div) - page_div.getparent().remove(page_div) - # Delete the empty pages from caches as well - if self._cache_flag: - del self._page_cache[page_div.get('ID')] - del self._fptr_cache[page_div.get('ID')] - - # Delete the file reference from the cache - if self._cache_flag: - parent_use = ocrd_file._el.getparent().get('USE') - # Note: if the file is in the XML tree, - # it must also be in the file cache. 
- # Anyway, we perform the checks, then remove - if parent_use in self._file_cache: - if ocrd_file.ID in self._file_cache[parent_use]: - del self._file_cache[parent_use][ocrd_file.ID] - - # Delete the file reference - # pylint: disable=protected-access - ocrd_file._el.getparent().remove(ocrd_file._el) - - return ocrd_file - - @property - def physical_pages(self): - """ - List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``) - """ - if self._cache_flag: - return self._page_cache.values() - - return self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/@ID', - namespaces=NS) - - def get_physical_pages(self, for_fileIds=None): - """ - List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``), - optionally for a subset of ``mets:file`` ``@ID`` :py:attr:`for_fileIds`. - """ - if for_fileIds is None: - return self.physical_pages - - ret = [None] * len(for_fileIds) - - # Note: This entire function potentially could be further simplified - # TODO: Simplify - if self._cache_flag: - for pageId in self._fptr_cache.keys(): - for fptr in self._fptr_cache[pageId].keys(): - if fptr in for_fileIds: - ret[for_fileIds.index(fptr)] = pageId - else: - for page in self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', - namespaces=NS): - for fptr in page.findall('mets:fptr', NS): - if fptr.get('FILEID') in for_fileIds: - ret[for_fileIds.index(fptr.get('FILEID'))] = page.get('ID') - - return ret - - def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=None): - """ - Set the physical page ID (``@ID`` of the physical ``mets:structMap`` ``mets:div`` entry) - corresponding to the ``mets:file`` :py:attr:`ocrd_file`, creating all structures if necessary. 
- - Arguments: - pageId (string): ``@ID`` of the physical ``mets:structMap`` entry to use - ocrd_file (object): existing :py:class:`ocrd_models.ocrd_file.OcrdFile` object - Keyword Args: - order (string): ``@ORDER`` to use - orderlabel (string): ``@ORDERLABEL`` to use - """ - # print(pageId, ocrd_file) - # delete any page mapping for this file.ID - - # NOTE: The pageId coming from 'test_merge(sbb_sample_01)' is an Element not a string - # if not isinstance(pageId, str): - # pageId = pageId.get('ID') - - candidates = [] - - if self._cache_flag: - for page_id in self._fptr_cache.keys(): - if ocrd_file.ID in self._fptr_cache[page_id].keys(): - if self._fptr_cache[page_id][ocrd_file.ID] is not None: - candidates.append(self._fptr_cache[page_id][ocrd_file.ID]) - else: - candidates = self._tree.getroot().findall( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % - ocrd_file.ID, namespaces=NS) - - for el_fptr in candidates: - if self._cache_flag: - del self._fptr_cache[el_fptr.getparent().get('ID')][ocrd_file.ID] - - el_fptr.getparent().remove(el_fptr) - - # find/construct as necessary - el_structmap = self._tree.getroot().find('mets:structMap[@TYPE="PHYSICAL"]', NS) - if el_structmap is None: - el_structmap = ET.SubElement(self._tree.getroot(), TAG_METS_STRUCTMAP) - el_structmap.set('TYPE', 'PHYSICAL') - el_seqdiv = el_structmap.find('mets:div[@TYPE="physSequence"]', NS) - if el_seqdiv is None: - el_seqdiv = ET.SubElement(el_structmap, TAG_METS_DIV) - el_seqdiv.set('TYPE', 'physSequence') - - el_pagediv = None - - if self._cache_flag: - if pageId in self._page_cache.keys(): - el_pagediv = self._page_cache[pageId] - else: - el_pagediv = el_seqdiv.find('mets:div[@ID="%s"]' % pageId, NS) - - if el_pagediv is None: - el_pagediv = ET.SubElement(el_seqdiv, TAG_METS_DIV) - el_pagediv.set('TYPE', 'page') - el_pagediv.set('ID', pageId) - if order: - el_pagediv.set('ORDER', order) - if orderlabel: - 
el_pagediv.set('ORDERLABEL', orderlabel) - - if self._cache_flag: - # Create a new entry in the page cache - self._page_cache[pageId] = el_pagediv - # Create a new entry in the fptr cache and - # assign an empty dictionary to hold the fileids - self._fptr_cache[pageId] = {} - - el_fptr = ET.SubElement(el_pagediv, TAG_METS_FPTR) - el_fptr.set('FILEID', ocrd_file.ID) - - if self._cache_flag: - # Assign the ocrd fileID to the pageId in the cache - self._fptr_cache[el_pagediv.get('ID')].update({ocrd_file.ID : el_fptr}) - - def get_physical_page_for_file(self, ocrd_file): - """ - Get the physical page ID (``@ID`` of the physical ``mets:structMap`` ``mets:div`` entry) - corresponding to the ``mets:file`` :py:attr:`ocrd_file`. - """ - ret = [] - - if self._cache_flag: - for pageId in self._fptr_cache.keys(): - if ocrd_file.ID in self._fptr_cache[pageId].keys(): - ret.append(self._page_cache[pageId].get('ID')) - else: - ret = self._tree.getroot().xpath( - '/mets:mets/mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][./mets:fptr[@FILEID="%s"]]/@ID' % - ocrd_file.ID, namespaces=NS) - - # To get rid of the python's FutureWarning - if len(ret): - return ret[0] - - def remove_physical_page(self, ID): - """ - Delete page (physical ``mets:structMap`` ``mets:div`` entry ``@ID``) :py:attr:`ID`. - """ - mets_div = None - if self._cache_flag: - if ID in self._page_cache.keys(): - mets_div = [self._page_cache[ID]] - else: - mets_div = self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][@ID="%s"]' % ID, - namespaces=NS) - if mets_div is not None: - mets_div[0].getparent().remove(mets_div[0]) - if self._cache_flag: - del self._page_cache[ID] - del self._fptr_cache[ID] - - def remove_physical_page_fptr(self, fileId): - """ - Delete all ``mets:fptr[@FILEID = fileId]`` to ``mets:file[@ID == fileId]`` for :py:attr:`fileId` from all ``mets:div`` entries in the physical ``mets:structMap``. 
- Returns: - List of pageIds that mets:fptrs were deleted from - """ - - # Question: What is the reason to keep a list of mets_fptrs? - # Do we have a situation in which the fileId is same for different pageIds ? - # From the examples I have seen inside 'assets' that is not the case - # and the mets_fptrs list will always contain a single element. - # If that's the case then we do not need to iterate 2 loops, just one. - - mets_fptrs = [] - if self._cache_flag: - for page_id in self._fptr_cache.keys(): - if fileId in self._fptr_cache[page_id].keys(): - mets_fptrs.append(self._fptr_cache[page_id][fileId]) - else: - mets_fptrs = self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId, - namespaces=NS) - ret = [] - for mets_fptr in mets_fptrs: - mets_div = mets_fptr.getparent() - ret.append(mets_div.get('ID')) - if self._cache_flag: - del self._fptr_cache[mets_div.get('ID')][mets_fptr.get('FILEID')] - mets_div.remove(mets_fptr) - return ret - - def merge(self, other_mets, fileGrp_mapping=None, fileId_mapping=None, pageId_mapping=None, after_add_cb=None, **kwargs): - """ - Add all files from other_mets. 
- - Accepts the same kwargs as :py:func:`find_files` - - Keyword Args: - fileGrp_mapping (dict): Map :py:attr:`other_mets` fileGrp to fileGrp in this METS - fileId_mapping (dict): Map :py:attr:`other_mets` file ID to file ID in this METS - pageId_mapping (dict): Map :py:attr:`other_mets` page ID to page ID in this METS - after_add_cb (function): Callback received after file is added to the METS - """ - if not fileGrp_mapping: - fileGrp_mapping = {} - if not fileId_mapping: - fileId_mapping = {} - if not pageId_mapping: - pageId_mapping = {} - for f_src in other_mets.find_files(**kwargs): - f_dest = self.add_file( - fileGrp_mapping.get(f_src.fileGrp, f_src.fileGrp), - mimetype=f_src.mimetype, - url=f_src.url, - ID=fileId_mapping.get(f_src.ID, f_src.ID), - pageId=pageId_mapping.get(f_src.pageId, f_src.pageId)) - # FIXME: merge metsHdr, amdSec, dmdSec as well - # FIXME: merge structMap logical and structLink as well - if after_add_cb: - after_add_cb(f_dest) From afa016202171285ecdaf432c11dade03ba41fd16 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 24 Oct 2022 14:30:56 +0200 Subject: [PATCH 18/44] Return back ocrd_mets --- ocrd_models/ocrd_models/ocrd_mets.py | 500 +++++++++++++++++++++++++++ 1 file changed, 500 insertions(+) create mode 100644 ocrd_models/ocrd_models/ocrd_mets.py diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py new file mode 100644 index 0000000000..b65afb86c7 --- /dev/null +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -0,0 +1,500 @@ +""" +API to METS +""" +from datetime import datetime +import re +from lxml import etree as ET + +from ocrd_utils import ( + is_local_filename, + getLogger, + generate_range, + VERSION, + REGEX_PREFIX, + REGEX_FILE_ID +) + +from .constants import ( + NAMESPACES as NS, + TAG_METS_AGENT, + TAG_METS_DIV, + TAG_METS_FILE, + TAG_METS_FILEGRP, + TAG_METS_FILESEC, + TAG_METS_FPTR, + TAG_METS_METSHDR, + TAG_METS_STRUCTMAP, + IDENTIFIER_PRIORITY, + TAG_MODS_IDENTIFIER, + 
METS_XML_EMPTY, +) + +from .ocrd_xml_base import OcrdXmlDocument, ET +from .ocrd_file import OcrdFile +from .ocrd_agent import OcrdAgent + +REGEX_PREFIX_LEN = len(REGEX_PREFIX) + +class OcrdMets(OcrdXmlDocument): + """ + API to a single METS file + """ + + @staticmethod + def empty_mets(now=None): + """ + Create an empty METS file from bundled template. + """ + if not now: + now = datetime.now().isoformat() + tpl = METS_XML_EMPTY.decode('utf-8') + tpl = tpl.replace('{{ VERSION }}', VERSION) + tpl = tpl.replace('{{ NOW }}', '%s' % now) + return OcrdMets(content=tpl.encode('utf-8')) + + def __init__(self, **kwargs): + """ + """ + super(OcrdMets, self).__init__(**kwargs) + + def __str__(self): + """ + String representation + """ + return 'OcrdMets[fileGrps=%s,files=%s]' % (self.file_groups, list(self.find_files())) + + @property + def unique_identifier(self): + """ + Get the unique identifier by looking through ``mods:identifier`` + See `specs `_ for details. + """ + for t in IDENTIFIER_PRIORITY: + found = self._tree.getroot().find('.//mods:identifier[@type="%s"]' % t, NS) + if found is not None: + return found.text + + @unique_identifier.setter + def unique_identifier(self, purl): + """ + Set the unique identifier by looking through ``mods:identifier`` + See `specs `_ for details. + """ + id_el = None + for t in IDENTIFIER_PRIORITY: + id_el = self._tree.getroot().find('.//mods:identifier[@type="%s"]' % t, NS) + if id_el is not None: + break + if id_el is None: + mods = self._tree.getroot().find('.//mods:mods', NS) + id_el = ET.SubElement(mods, TAG_MODS_IDENTIFIER) + id_el.set('type', 'purl') + id_el.text = purl + + @property + def agents(self): + """ + List all :py:class:`ocrd_models.ocrd_agent.OcrdAgent`s + """ + return [OcrdAgent(el_agent) for el_agent in self._tree.getroot().findall('mets:metsHdr/mets:agent', NS)] + + def add_agent(self, *args, **kwargs): + """ + Add an :py:class:`ocrd_models.ocrd_agent.OcrdAgent` to the list of agents in the ``metsHdr``. 
+ """ + el_metsHdr = self._tree.getroot().find('.//mets:metsHdr', NS) + if el_metsHdr is None: + el_metsHdr = ET.Element(TAG_METS_METSHDR) + self._tree.getroot().insert(0, el_metsHdr) + # assert(el_metsHdr is not None) + el_agent = ET.SubElement(el_metsHdr, TAG_METS_AGENT) + # print(ET.tostring(el_metsHdr)) + return OcrdAgent(el_agent, *args, **kwargs) + + @property + def file_groups(self): + """ + List the `@USE` of all `mets:fileGrp` entries. + """ + return [el.get('USE') for el in self._tree.getroot().findall('.//mets:fileGrp', NS)] + + def find_all_files(self, *args, **kwargs): + """ + Like :py:meth:`find_files` but return a list of all results. + Equivalent to ``list(self.find_files(...))`` + """ + return list(self.find_files(*args, **kwargs)) + + # pylint: disable=multiple-statements + def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None, local_only=False): + """ + Search ``mets:file`` entries in this METS document and yield results. + The :py:attr:`ID`, :py:attr:`pageId`, :py:attr:`fileGrp`, + :py:attr:`url` and :py:attr:`mimetype` parameters can each be either a + literal string, or a regular expression if the string starts with + ``//`` (double slash). + If it is a regex, the leading ``//`` is removed and candidates are matched + against the regex with `re.fullmatch`. If it is a literal string, comparison + is done with string equality. + The :py:attr:`pageId` parameter supports the numeric range operator ``..``. For + example, to find all files in pages ``PHYS_0001`` to ``PHYS_0003``, + ``PHYS_0001..PHYS_0003`` will be expanded to ``PHYS_0001,PHYS_0002,PHYS_0003``. 
+ Keyword Args: + ID (string) : ``@ID`` of the ``mets:file`` + fileGrp (string) : ``@USE`` of the ``mets:fileGrp`` to list files of + pageId (string) : ``@ID`` of the corresponding physical ``mets:structMap`` entry (physical page) + url (string) : ``@xlink:href`` (URL or path) of ``mets:Flocat`` of ``mets:file`` + mimetype (string) : ``@MIMETYPE`` of ``mets:file`` + local (boolean) : Whether to restrict results to local files in the filesystem + Yields: + :py:class:`ocrd_models:ocrd_file:OcrdFile` instantiations + """ + if pageId: + if pageId.startswith(REGEX_PREFIX): + pageIds, pageId = re.compile(pageId[REGEX_PREFIX_LEN:]), list() + else: + pageIds, pageId = pageId.split(','), list() + pageIds_expanded = [] + for pageId_ in pageIds: + if '..' in pageId_: + pageIds_expanded += generate_range(*pageId_.split('..', 1)) + pageIds += pageIds_expanded + for page in self._tree.getroot().xpath( + '//mets:div[@TYPE="page"]', namespaces=NS): + if (page.get('ID') in pageIds if isinstance(pageIds, list) else + pageIds.fullmatch(page.get('ID'))): + pageId.extend( + [fptr.get('FILEID') for fptr in page.findall('mets:fptr', NS)]) + if ID and ID.startswith(REGEX_PREFIX): + ID = re.compile(ID[REGEX_PREFIX_LEN:]) + if fileGrp and fileGrp.startswith(REGEX_PREFIX): + fileGrp = re.compile(fileGrp[REGEX_PREFIX_LEN:]) + if mimetype and mimetype.startswith(REGEX_PREFIX): + mimetype = re.compile(mimetype[REGEX_PREFIX_LEN:]) + if url and url.startswith(REGEX_PREFIX): + url = re.compile(url[REGEX_PREFIX_LEN:]) + for cand in self._tree.getroot().xpath('//mets:file', namespaces=NS): + if ID: + if isinstance(ID, str): + if not ID == cand.get('ID'): continue + else: + if not ID.fullmatch(cand.get('ID')): continue + + if pageId is not None and cand.get('ID') not in pageId: + continue + + if fileGrp: + if isinstance(fileGrp, str): + if cand.getparent().get('USE') != fileGrp: continue + else: + if not fileGrp.fullmatch(cand.getparent().get('USE')): continue + + if mimetype: + if 
isinstance(mimetype, str): + if cand.get('MIMETYPE') != mimetype: continue + else: + if not mimetype.fullmatch(cand.get('MIMETYPE') or ''): continue + + if url: + cand_locat = cand.find('mets:FLocat', namespaces=NS) + if cand_locat is None: + continue + cand_url = cand_locat.get('{%s}href' % NS['xlink']) + if isinstance(url, str): + if cand_url != url: continue + else: + if not url.fullmatch(cand_url): continue + + f = OcrdFile(cand, mets=self) + + # If only local resources should be returned and f is not a file path: skip the file + if local_only and not is_local_filename(f.url): + continue + yield f + + def add_file_group(self, fileGrp): + """ + Add a new ``mets:fileGrp``. + Arguments: + fileGrp (string): ``@USE`` of the new ``mets:fileGrp``. + """ + if ',' in fileGrp: + raise Exception('fileGrp must not contain commas') + el_fileSec = self._tree.getroot().find('mets:fileSec', NS) + if el_fileSec is None: + el_fileSec = ET.SubElement(self._tree.getroot(), TAG_METS_FILESEC) + el_fileGrp = el_fileSec.find('mets:fileGrp[@USE="%s"]' % fileGrp, NS) + if el_fileGrp is None: + el_fileGrp = ET.SubElement(el_fileSec, TAG_METS_FILEGRP) + el_fileGrp.set('USE', fileGrp) + return el_fileGrp + + def rename_file_group(self, old, new): + """ + Rename a ``mets:fileGrp`` by changing the ``@USE`` from :py:attr:`old` to :py:attr:`new`. + """ + el_fileGrp = self._tree.getroot().find('mets:fileSec/mets:fileGrp[@USE="%s"]' % old, NS) + if el_fileGrp is None: + raise FileNotFoundError("No such fileGrp '%s'" % old) + el_fileGrp.set('USE', new) + + def remove_file_group(self, USE, recursive=False, force=False): + """ + Remove a ``mets:fileGrp`` (single fixed ``@USE`` or multiple regex ``@USE``) + Arguments: + USE (string): ``@USE`` of the ``mets:fileGrp`` to delete. 
Can be a regex if prefixed with ``//`` + recursive (boolean): Whether to recursively delete each ``mets:file`` in the group + force (boolean): Do not raise an exception if ``mets:fileGrp`` does not exist + """ + log = getLogger('ocrd_models.ocrd_mets.remove_file_group') + el_fileSec = self._tree.getroot().find('mets:fileSec', NS) + if el_fileSec is None: + raise Exception("No fileSec!") + if isinstance(USE, str): + if USE.startswith(REGEX_PREFIX): + use = re.compile(USE[REGEX_PREFIX_LEN:]) + for cand in el_fileSec.findall('mets:fileGrp', NS): + if use.fullmatch(cand.get('USE')): + self.remove_file_group(cand, recursive=recursive) + return + else: + el_fileGrp = el_fileSec.find('mets:fileGrp[@USE="%s"]' % USE, NS) + else: + el_fileGrp = USE + if el_fileGrp is None: # pylint: disable=len-as-condition + msg = "No such fileGrp: %s" % USE + if force: + log.warning(msg) + return + raise Exception(msg) + files = el_fileGrp.findall('mets:file', NS) + if files: + if not recursive: + raise Exception("fileGrp %s is not empty and recursive wasn't set" % USE) + for f in files: + self.remove_one_file(f.get('ID')) + el_fileGrp.getparent().remove(el_fileGrp) + + def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force=False, local_filename=None, ignore=False, **kwargs): + """ + Instantiate and add a new :py:class:`ocrd_models.ocrd_file.OcrdFile`. + Arguments: + fileGrp (string): ``@USE`` of ``mets:fileGrp`` to add to + Keyword Args: + mimetype (string): ``@MIMETYPE`` of the ``mets:file`` to use + url (string): ``@xlink:href`` (URL or path) of the ``mets:file`` to use + ID (string): ``@ID`` of the ``mets:file`` to use + pageId (string): ``@ID`` in the physical ``mets:structMap`` to link to + force (boolean): Whether to add the file even if a ``mets:file`` with the same ``@ID`` already exists. + ignore (boolean): Do not look for existing files at all. Shift responsibility for preventing errors from duplicate ID to the user. 
+ local_filename (string): + """ + if not ID: + raise ValueError("Must set ID of the mets:file") + if not fileGrp: + raise ValueError("Must set fileGrp of the mets:file") + if not REGEX_FILE_ID.fullmatch(ID): + raise ValueError("Invalid syntax for mets:file/@ID %s (not an xs:ID)" % ID) + if not REGEX_FILE_ID.fullmatch(fileGrp): + raise ValueError("Invalid syntax for mets:fileGrp/@USE %s (not an xs:ID)" % fileGrp) + log = getLogger('ocrd_models.ocrd_mets.add_file') + el_fileGrp = self.add_file_group(fileGrp) + if not ignore: + mets_file = next(self.find_files(ID=ID), None) + if mets_file: + if mets_file.fileGrp == fileGrp and \ + mets_file.pageId == pageId and \ + mets_file.mimetype == mimetype: + if not force: + raise FileExistsError(f"A file with ID=={ID} already exists {mets_file} and neither force nor ignore are set") + self.remove_file(ID=ID) + else: + raise FileExistsError(f"A file with ID=={ID} already exists {mets_file} but unrelated - cannot mitigate") + kwargs = {k: v for k, v in locals().items() if k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v} + mets_file = OcrdFile(ET.SubElement(el_fileGrp, TAG_METS_FILE), mets=self, **kwargs) + + return mets_file + + def remove_file(self, *args, **kwargs): + """ + Delete each ``ocrd:file`` matching the query. Same arguments as :py:meth:`find_files` + """ + files = list(self.find_files(*args, **kwargs)) + if files: + for f in files: + self.remove_one_file(f) + if len(files) > 1: + return files + else: + return files[0] # for backwards-compatibility + if any(1 for kwarg in kwargs + if isinstance(kwarg, str) and kwarg.startswith(REGEX_PREFIX)): + # allow empty results if filter criteria involve a regex + return [] + raise FileNotFoundError("File not found: %s %s" % (args, kwargs)) + + def remove_one_file(self, ID): + """ + Delete an existing :py:class:`ocrd_models.ocrd_file.OcrdFile`. 
+ Arguments: + ID (string): ``@ID`` of the ``mets:file`` to delete + Returns: + The old :py:class:`ocrd_models.ocrd_file.OcrdFile` reference. + """ + log = getLogger('ocrd_models.ocrd_mets.remove_one_file') + log.info("remove_one_file(%s)" % ID) + if isinstance(ID, OcrdFile): + ocrd_file = ID + ID = ocrd_file.ID + else: + ocrd_file = next(self.find_files(ID=ID), None) + + if not ocrd_file: + raise FileNotFoundError("File not found: %s" % ID) + + # Delete the physical page ref + for fptr in self._tree.getroot().findall('.//mets:fptr[@FILEID="%s"]' % ID, namespaces=NS): + log.info("Delete fptr element %s for page '%s'", fptr, ID) + page_div = fptr.getparent() + page_div.remove(fptr) + # delete empty pages + if not page_div.getchildren(): + log.info("Delete empty page %s", page_div) + page_div.getparent().remove(page_div) + + # Delete the file reference + # pylint: disable=protected-access + ocrd_file._el.getparent().remove(ocrd_file._el) + + return ocrd_file + + @property + def physical_pages(self): + """ + List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``) + """ + return self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/@ID', + namespaces=NS) + + def get_physical_pages(self, for_fileIds=None): + """ + List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``), + optionally for a subset of ``mets:file`` ``@ID`` :py:attr:`for_fileIds`. 
+ """ + if for_fileIds is None: + return self.physical_pages + ret = [None] * len(for_fileIds) + for page in self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS): + for fptr in page.findall('mets:fptr', NS): + if fptr.get('FILEID') in for_fileIds: + ret[for_fileIds.index(fptr.get('FILEID'))] = page.get('ID') + return ret + + def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=None): + """ + Set the physical page ID (``@ID`` of the physical ``mets:structMap`` ``mets:div`` entry) + corresponding to the ``mets:file`` :py:attr:`ocrd_file`, creating all structures if necessary. + Arguments: + pageId (string): ``@ID`` of the physical ``mets:structMap`` entry to use + ocrd_file (object): existing :py:class:`ocrd_models.ocrd_file.OcrdFile` object + Keyword Args: + order (string): ``@ORDER`` to use + orderlabel (string): ``@ORDERLABEL`` to use + """ + # print(pageId, ocrd_file) + # delete any page mapping for this file.ID + for el_fptr in self._tree.getroot().findall( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % + ocrd_file.ID, namespaces=NS): + el_fptr.getparent().remove(el_fptr) + + # find/construct as necessary + el_structmap = self._tree.getroot().find('mets:structMap[@TYPE="PHYSICAL"]', NS) + if el_structmap is None: + el_structmap = ET.SubElement(self._tree.getroot(), TAG_METS_STRUCTMAP) + el_structmap.set('TYPE', 'PHYSICAL') + el_seqdiv = el_structmap.find('mets:div[@TYPE="physSequence"]', NS) + if el_seqdiv is None: + el_seqdiv = ET.SubElement(el_structmap, TAG_METS_DIV) + el_seqdiv.set('TYPE', 'physSequence') + el_pagediv = el_seqdiv.find('mets:div[@ID="%s"]' % pageId, NS) + if el_pagediv is None: + el_pagediv = ET.SubElement(el_seqdiv, TAG_METS_DIV) + el_pagediv.set('TYPE', 'page') + el_pagediv.set('ID', pageId) + if order: + el_pagediv.set('ORDER', order) + if orderlabel: + 
el_pagediv.set('ORDERLABEL', orderlabel) + el_fptr = ET.SubElement(el_pagediv, TAG_METS_FPTR) + el_fptr.set('FILEID', ocrd_file.ID) + + def get_physical_page_for_file(self, ocrd_file): + """ + Get the physical page ID (``@ID`` of the physical ``mets:structMap`` ``mets:div`` entry) + corresponding to the ``mets:file`` :py:attr:`ocrd_file`. + """ + ret = self._tree.getroot().xpath( + '/mets:mets/mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][./mets:fptr[@FILEID="%s"]]/@ID' % + ocrd_file.ID, namespaces=NS) + if ret: + return ret[0] + + def remove_physical_page(self, ID): + """ + Delete page (physical ``mets:structMap`` ``mets:div`` entry ``@ID``) :py:attr:`ID`. + """ + mets_div = self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][@ID="%s"]' % ID, + namespaces=NS) + if mets_div: + mets_div[0].getparent().remove(mets_div[0]) + + def remove_physical_page_fptr(self, fileId): + """ + Delete all ``mets:fptr[@FILEID = fileId]`` to ``mets:file[@ID == fileId]`` for :py:attr:`fileId` from all ``mets:div`` entries in the physical ``mets:structMap``. + Returns: + List of pageIds that mets:fptrs were deleted from + """ + mets_fptrs = self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId, + namespaces=NS) + ret = [] + for mets_fptr in mets_fptrs: + mets_div = mets_fptr.getparent() + ret.append(mets_div.get('ID')) + mets_div.remove(mets_fptr) + return ret + + def merge(self, other_mets, force=False, fileGrp_mapping=None, fileId_mapping=None, pageId_mapping=None, after_add_cb=None, **kwargs): + """ + Add all files from other_mets. 
+ Accepts the same kwargs as :py:func:`find_files` + Keyword Args: + force (boolean): Whether to :py:meth:`add_file`s with force (overwriting existing ``mets:file``s) + fileGrp_mapping (dict): Map :py:attr:`other_mets` fileGrp to fileGrp in this METS + fileId_mapping (dict): Map :py:attr:`other_mets` file ID to file ID in this METS + pageId_mapping (dict): Map :py:attr:`other_mets` page ID to page ID in this METS + after_add_cb (function): Callback received after file is added to the METS + """ + if not fileGrp_mapping: + fileGrp_mapping = {} + if not fileId_mapping: + fileId_mapping = {} + if not pageId_mapping: + pageId_mapping = {} + for f_src in other_mets.find_files(**kwargs): + f_dest = self.add_file( + fileGrp_mapping.get(f_src.fileGrp, f_src.fileGrp), + mimetype=f_src.mimetype, + url=f_src.url, + ID=fileId_mapping.get(f_src.ID, f_src.ID), + pageId=pageId_mapping.get(f_src.pageId, f_src.pageId), + force=force) + # FIXME: merge metsHdr, amdSec, dmdSec as well + # FIXME: merge structMap logical and structLink as well + if after_add_cb: + after_add_cb(f_dest) From f997e5a95c585788f3d2b5a5246458de21e7964f Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 24 Oct 2022 15:00:11 +0200 Subject: [PATCH 19/44] Cache functionality added again --- ocrd_models/ocrd_models/ocrd_mets.py | 288 ++++++++++++++++++++++++--- 1 file changed, 261 insertions(+), 27 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index b65afb86c7..15f3d1b442 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -4,6 +4,7 @@ from datetime import datetime import re from lxml import etree as ET +from copy import deepcopy from ocrd_utils import ( is_local_filename, @@ -56,6 +57,32 @@ def __init__(self, **kwargs): """ """ super(OcrdMets, self).__init__(**kwargs) + + # If cache is enabled + if self._cache_flag: + + # Cache for the files (mets:file) - two nested dictionaries + # The outer dictionary's Key: 
'fileGrp.USE'
+        # The outer dictionary's Value: Inner dictionary
+        # The inner dictionary's Key: 'file.ID'
+        # The inner dictionary's Value: a 'file' object at some memory location
+        self._file_cache = {}
+
+        # Cache for the pages (mets:div)
+        # The dictionary's Key: 'div.ID'
+        # The dictionary's Value: a 'div' object at some memory location
+        self._page_cache = {}
+
+        # Cache for the file pointers (mets:fptr) - two nested dictionaries
+        # The outer dictionary's Key: 'div.ID'
+        # The outer dictionary's Value: Inner dictionary
+        # The inner dictionary's Key: 'fptr.FILEID'
+        # The inner dictionary's Value: a 'fptr' object at some memory location
+        self._fptr_cache = {}
+
+        # Note, if the empty_mets() function is used to instantiate OcrdMets
+        # Then the cache is empty even after this operation
+        self._fill_caches()

     def __str__(self):
         """
@@ -74,6 +101,74 @@ def unique_identifier(self):
         if found is not None:
             return found.text

+    def _fill_caches(self):
+        """
+        Fills the caches with files (mets:file), pages (mets:div) and file pointers (mets:fptr)
+        """
+
+        tree_root = self._tree.getroot()
+
+        # Fill with files
+        el_fileGrp_list = tree_root.find(".//mets:fileSec", NS)
+        if el_fileGrp_list is None or len(el_fileGrp_list) == 0:
+            return
+        else:
+            log = getLogger('ocrd_models.ocrd_mets._fill_caches-files')
+
+        for el_fileGrp in el_fileGrp_list:
+            fileGrp_use = el_fileGrp.get('USE')
+
+            # Note: SBB0000F29300010000/data/mets.xml contains None
+            # values due to the comments inside the file
+            if fileGrp_use is None:
+                continue
+
+            # Assign an empty dictionary that will hold the files of the added fileGrp
+            self._file_cache[fileGrp_use] = {}
+
+            for el_file in el_fileGrp:
+                file_id = el_file.get('ID')
+                self._file_cache[fileGrp_use].update({file_id : el_file})
+                # log.info("File added to the cache: %s" % file_id)
+
+        # Fill with pages
+        el_div_list = tree_root.findall(".//mets:div", NS)
+        if el_div_list is None or len(el_div_list) == 0:
+            return
+        else:
+            log = getLogger('ocrd_models.ocrd_mets._fill_caches-pages')
+
+        for 
el_div in el_div_list: + div_id = el_div.get('ID') + print("DIV_ID: %s" % el_div.get('ID')) + + # May not be needed if there are no comments inside the mets file + if div_id is None: + continue + + self._page_cache[div_id] = el_div + + # Assign an empty dictionary that will hold the fptr of the added page (div) + self._fptr_cache[div_id] = {} + + # log.info("Page_id added to the cache: %s" % div_id) + + for el_fptr in el_div: + self._fptr_cache[div_id].update({el_fptr.get('FILEID') : el_fptr}) + # log.info("Fptr added to the cache: %s" % el_fptr.get('FILEID')) + + # log.info("Len of page_cache: %s" % len(self._page_cache)) + # log.info("Len of fptr_cache: %s" % len(self._fptr_cache)) + + def _clear_caches(self): + """ + Deallocates the caches + """ + + self._file_cache = None + self._page_cache = None + self._fptr_cache = None + @unique_identifier.setter def unique_identifier(self, purl): """ @@ -159,12 +254,20 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None if '..' 
in pageId_: pageIds_expanded += generate_range(*pageId_.split('..', 1)) pageIds += pageIds_expanded - for page in self._tree.getroot().xpath( - '//mets:div[@TYPE="page"]', namespaces=NS): - if (page.get('ID') in pageIds if isinstance(pageIds, list) else - pageIds.fullmatch(page.get('ID'))): - pageId.extend( - [fptr.get('FILEID') for fptr in page.findall('mets:fptr', NS)]) + + if self._cache_flag: + for page in self._page_cache.keys(): + if (page in pageIds if isinstance(pageIds, list) else + pageIds.fullmatch(page)): + pageId.extend(self._fptr_cache[page]) + else: + for page in self._tree.getroot().xpath( + '//mets:div[@TYPE="page"]', namespaces=NS): + if (page.get('ID') in pageIds if isinstance(pageIds, list) else + pageIds.fullmatch(page.get('ID'))): + pageId.extend( + [fptr.get('FILEID') for fptr in page.findall('mets:fptr', NS)]) + if ID and ID.startswith(REGEX_PREFIX): ID = re.compile(ID[REGEX_PREFIX_LEN:]) if fileGrp and fileGrp.startswith(REGEX_PREFIX): @@ -173,6 +276,20 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None mimetype = re.compile(mimetype[REGEX_PREFIX_LEN:]) if url and url.startswith(REGEX_PREFIX): url = re.compile(url[REGEX_PREFIX_LEN:]) + + candidates = [] + + if self._cache_flag: + if fileGrp: + if isinstance(fileGrp, str): + candidates += self._file_cache.get(fileGrp, {}).values() + else: + candidates = [x for fileGrp_needle, el_file_list in self._file_cache.items() if fileGrp.match(fileGrp_needle) for x in el_file_list.values()] + else: + candidates = [el_file for id_to_file in self._file_cache.values() for el_file in id_to_file.values()] + else: + candidates = self._tree.getroot().xpath('//mets:file', namespaces=NS) + for cand in self._tree.getroot().xpath('//mets:file', namespaces=NS): if ID: if isinstance(ID, str): @@ -227,6 +344,11 @@ def add_file_group(self, fileGrp): if el_fileGrp is None: el_fileGrp = ET.SubElement(el_fileSec, TAG_METS_FILEGRP) el_fileGrp.set('USE', fileGrp) + + if self._cache_flag: + 
# Assign an empty dictionary that will hold the files of the added fileGrp + self._file_cache[fileGrp] = {} + return el_fileGrp def rename_file_group(self, old, new): @@ -237,6 +359,9 @@ def rename_file_group(self, old, new): if el_fileGrp is None: raise FileNotFoundError("No such fileGrp '%s'" % old) el_fileGrp.set('USE', new) + + if self._cache_flag: + self._file_cache[new] = self._file_cache.pop(old) def remove_file_group(self, USE, recursive=False, force=False): """ @@ -273,6 +398,14 @@ def remove_file_group(self, USE, recursive=False, force=False): raise Exception("fileGrp %s is not empty and recursive wasn't set" % USE) for f in files: self.remove_one_file(f.get('ID')) + + if self._cache_flag: + # Note: Since the files inside the group are removed + # with the 'remove_one_file' method above, + # we should not take care of that again. + # We just remove the fileGrp. + del self._file_cache[el_fileGrp.get('USE')] + el_fileGrp.getparent().remove(el_fileGrp) def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force=False, local_filename=None, ignore=False, **kwargs): @@ -311,8 +444,13 @@ def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force else: raise FileExistsError(f"A file with ID=={ID} already exists {mets_file} but unrelated - cannot mitigate") kwargs = {k: v for k, v in locals().items() if k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v} + # The caching of the physical page is done in the OcrdFile constructor mets_file = OcrdFile(ET.SubElement(el_fileGrp, TAG_METS_FILE), mets=self, **kwargs) + if self._cache_flag: + # Add the file to the file cache + self._file_cache[fileGrp].update({ID: el_mets_file}) + return mets_file def remove_file(self, *args, **kwargs): @@ -353,14 +491,40 @@ def remove_one_file(self, ID): raise FileNotFoundError("File not found: %s" % ID) # Delete the physical page ref - for fptr in self._tree.getroot().findall('.//mets:fptr[@FILEID="%s"]' % ID, namespaces=NS): + 
fptrs = [] + if self._cache_flag: + for page in self._fptr_cache.keys(): + if ID in self._fptr_cache[page]: + fptrs.append(self._fptr_cache[page][ID]) + else: + fptrs = self._tree.getroot().findall('.//mets:fptr[@FILEID="%s"]' % ID, namespaces=NS) + + # Delete the physical page ref + for fptr in fptrs: log.info("Delete fptr element %s for page '%s'", fptr, ID) page_div = fptr.getparent() page_div.remove(fptr) + # Remove the fptr from the cache as well + if self._cache_flag: + del self._fptr_cache[page_div.get('ID')][ID] # delete empty pages if not page_div.getchildren(): log.info("Delete empty page %s", page_div) page_div.getparent().remove(page_div) + # Delete the empty pages from caches as well + if self._cache_flag: + del self._page_cache[page_div.get('ID')] + del self._fptr_cache[page_div.get('ID')] + + # Delete the file reference from the cache + if self._cache_flag: + parent_use = ocrd_file._el.getparent().get('USE') + # Note: if the file is in the XML tree, + # it must also be in the file cache. 
+ # Anyway, we perform the checks, then remove + if parent_use in self._file_cache: + if ocrd_file.ID in self._file_cache[parent_use]: + del self._file_cache[parent_use][ocrd_file.ID] # Delete the file reference # pylint: disable=protected-access @@ -373,6 +537,9 @@ def physical_pages(self): """ List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``) """ + if self._cache_flag: + return self._page_cache.values() + return self._tree.getroot().xpath( 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/@ID', namespaces=NS) @@ -385,12 +552,21 @@ def get_physical_pages(self, for_fileIds=None): if for_fileIds is None: return self.physical_pages ret = [None] * len(for_fileIds) - for page in self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', - namespaces=NS): - for fptr in page.findall('mets:fptr', NS): - if fptr.get('FILEID') in for_fileIds: - ret[for_fileIds.index(fptr.get('FILEID'))] = page.get('ID') + + # Note: This entire function potentially could be further simplified + # TODO: Simplify + if self._cache_flag: + for pageId in self._fptr_cache.keys(): + for fptr in self._fptr_cache[pageId].keys(): + if fptr in for_fileIds: + ret[for_fileIds.index(fptr)] = pageId + else: + for page in self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS): + for fptr in page.findall('mets:fptr', NS): + if fptr.get('FILEID') in for_fileIds: + ret[for_fileIds.index(fptr.get('FILEID'))] = page.get('ID') return ret def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=None): @@ -406,9 +582,26 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N """ # print(pageId, ocrd_file) # delete any page mapping for this file.ID - for el_fptr in self._tree.getroot().findall( + + # NOTE: The pageId coming from 'test_merge(sbb_sample_01)' is 
an Element not a string + if not isinstance(pageId, str): + pageId = pageId.get('ID') + + candidates = [] + if self._cache_flag: + for page_id in self._fptr_cache.keys(): + if ocrd_file.ID in self._fptr_cache[page_id].keys(): + if self._fptr_cache[page_id][ocrd_file.ID] is not None: + candidates.append(self._fptr_cache[page_id][ocrd_file.ID]) + else: + candidates = self._tree.getroot().findall( 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % - ocrd_file.ID, namespaces=NS): + ocrd_file.ID, namespaces=NS) + + for el_fptr in candidates: + if self._cache_flag: + del self._fptr_cache[el_fptr.getparent().get('ID')][ocrd_file.ID] + el_fptr.getparent().remove(el_fptr) # find/construct as necessary @@ -420,7 +613,13 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N if el_seqdiv is None: el_seqdiv = ET.SubElement(el_structmap, TAG_METS_DIV) el_seqdiv.set('TYPE', 'physSequence') - el_pagediv = el_seqdiv.find('mets:div[@ID="%s"]' % pageId, NS) + + if self._cache_flag: + if pageId in self._page_cache.keys(): + el_pagediv = self._page_cache[pageId] + else: + el_pagediv = el_seqdiv.find('mets:div[@ID="%s"]' % pageId, NS) + if el_pagediv is None: el_pagediv = ET.SubElement(el_seqdiv, TAG_METS_DIV) el_pagediv.set('TYPE', 'page') @@ -429,29 +628,57 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N el_pagediv.set('ORDER', order) if orderlabel: el_pagediv.set('ORDERLABEL', orderlabel) + if self._cache_flag: + # Create a new entry in the page cache + self._page_cache[pageId] = el_pagediv + # Create a new entry in the fptr cache and + # assign an empty dictionary to hold the fileids + self._fptr_cache[pageId] = {} + el_fptr = ET.SubElement(el_pagediv, TAG_METS_FPTR) el_fptr.set('FILEID', ocrd_file.ID) + if self._cache_flag: + # Assign the ocrd fileID to the pageId in the cache + self._fptr_cache[el_pagediv.get('ID')].update({ocrd_file.ID : el_fptr}) + 
 def get_physical_page_for_file(self, ocrd_file):
         """
         Get the physical page ID (``@ID`` of the physical ``mets:structMap`` ``mets:div`` entry)
         corresponding to the ``mets:file`` :py:attr:`ocrd_file`.
         """
-        ret = self._tree.getroot().xpath(
-            '/mets:mets/mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][./mets:fptr[@FILEID="%s"]]/@ID' %
-            ocrd_file.ID, namespaces=NS)
-        if ret:
+        ret = []
+
+        if self._cache_flag:
+            for pageId in self._fptr_cache.keys():
+                if ocrd_file.ID in self._fptr_cache[pageId].keys():
+                    ret.append(self._page_cache[pageId].get('ID'))
+        else:
+            ret = self._tree.getroot().xpath(
+                '/mets:mets/mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][./mets:fptr[@FILEID="%s"]]/@ID' %
+                ocrd_file.ID, namespaces=NS)
+
+        # Use len() here to get rid of Python's FutureWarning
+        if len(ret):
             return ret[0]

     def remove_physical_page(self, ID):
         """
         Delete page (physical ``mets:structMap`` ``mets:div`` entry ``@ID``) :py:attr:`ID`.
""" - mets_div = self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][@ID="%s"]' % ID, - namespaces=NS) - if mets_div: + mets_div = None + if self._cache_flag: + if ID in self._page_cache.keys(): + mets_div = [self._page_cache[ID]] + else: + mets_div = self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][@ID="%s"]' % ID, + namespaces=NS) + if mets_div is not None: mets_div[0].getparent().remove(mets_div[0]) + if self._cache_flag: + del self._page_cache[ID] + del self._fptr_cache[ID] def remove_physical_page_fptr(self, fileId): """ @@ -459,13 +686,20 @@ def remove_physical_page_fptr(self, fileId): Returns: List of pageIds that mets:fptrs were deleted from """ - mets_fptrs = self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId, - namespaces=NS) + mets_fptrs = [] + if self._cache_flag: + for page_id in self._fptr_cache.keys(): + if fileId in self._fptr_cache[page_id].keys(): + mets_fptrs.append(self._fptr_cache[page_id][fileId]) + else: + mets_fptrs = self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId, namespaces=NS) ret = [] for mets_fptr in mets_fptrs: mets_div = mets_fptr.getparent() ret.append(mets_div.get('ID')) + if self._cache_flag: + del self._fptr_cache[mets_div.get('ID')][mets_fptr.get('FILEID')] mets_div.remove(mets_fptr) return ret From bdf5741a4073af4cc7d587fb827c3241c9ce99b2 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 24 Oct 2022 17:01:54 +0200 Subject: [PATCH 20/44] Fix missing parts --- ocrd_models/ocrd_models/ocrd_mets.py | 79 ++++++++++++++++++++-------- tests/model/test_ocrd_mets_cache.py | 3 +- 2 files changed, 60 insertions(+), 22 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py 
b/ocrd_models/ocrd_models/ocrd_mets.py index 15f3d1b442..98e30da76c 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -42,7 +42,7 @@ class OcrdMets(OcrdXmlDocument): """ @staticmethod - def empty_mets(now=None): + def empty_mets(now=None, cache_flag=False): """ Create an empty METS file from bundled template. """ @@ -84,22 +84,18 @@ def __init__(self, **kwargs): # Then the cache is empty even after this operation self._fill_caches() - def __str__(self): + def __exit__(self): """ - String representation + """ - return 'OcrdMets[fileGrps=%s,files=%s]' % (self.file_groups, list(self.find_files())) + if self._cache_flag: + self._clear_caches() - @property - def unique_identifier(self): + def __str__(self): """ - Get the unique identifier by looking through ``mods:identifier`` - See `specs `_ for details. + String representation """ - for t in IDENTIFIER_PRIORITY: - found = self._tree.getroot().find('.//mods:identifier[@type="%s"]' % t, NS) - if found is not None: - return found.text + return 'OcrdMets[fileGrps=%s,files=%s]' % (self.file_groups, list(self.find_files())) def _fill_caches(self): """ @@ -168,6 +164,17 @@ def _clear_caches(self): self._file_cache = None self._page_cache = None self._fptr_cache = None + + @property + def unique_identifier(self): + """ + Get the unique identifier by looking through ``mods:identifier`` + See `specs `_ for details. + """ + for t in IDENTIFIER_PRIORITY: + found = self._tree.getroot().find('.//mods:identifier[@type="%s"]' % t, NS) + if found is not None: + return found.text @unique_identifier.setter def unique_identifier(self, purl): @@ -211,6 +218,11 @@ def file_groups(self): """ List the `@USE` of all `mets:fileGrp` entries. """ + + # WARNING: Actually we cannot return strings in place of elements! 
+ #if self._cache_flag: + # return self._file_cache.keys() + return [el.get('USE') for el in self._tree.getroot().findall('.//mets:fileGrp', NS)] def find_all_files(self, *args, **kwargs): @@ -278,7 +290,6 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None url = re.compile(url[REGEX_PREFIX_LEN:]) candidates = [] - if self._cache_flag: if fileGrp: if isinstance(fileGrp, str): @@ -290,7 +301,7 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None else: candidates = self._tree.getroot().xpath('//mets:file', namespaces=NS) - for cand in self._tree.getroot().xpath('//mets:file', namespaces=NS): + for cand in candidates: if ID: if isinstance(ID, str): if not ID == cand.get('ID'): continue @@ -300,7 +311,7 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None if pageId is not None and cand.get('ID') not in pageId: continue - if fileGrp: + if not self._cache_flag and fileGrp: if isinstance(fileGrp, str): if cand.getparent().get('USE') != fileGrp: continue else: @@ -322,6 +333,8 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None else: if not url.fullmatch(cand_url): continue + # Note: why we instantiate a class only to find out that the local_only is set afterwards + # Checking local_only and url before instantiation should be better? 
f = OcrdFile(cand, mets=self) # If only local resources should be returned and f is not a file path: skip the file @@ -397,6 +410,7 @@ def remove_file_group(self, USE, recursive=False, force=False): if not recursive: raise Exception("fileGrp %s is not empty and recursive wasn't set" % USE) for f in files: + # NOTE: Here we know the fileGrp, we should pass it as a parameter self.remove_one_file(f.get('ID')) if self._cache_flag: @@ -431,21 +445,37 @@ def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force if not REGEX_FILE_ID.fullmatch(fileGrp): raise ValueError("Invalid syntax for mets:fileGrp/@USE %s (not an xs:ID)" % fileGrp) log = getLogger('ocrd_models.ocrd_mets.add_file') + + """ + # Note: we do not benefit enough from having + # a separate cache for fileGrp elements + + if self._cache_flag: + if fileGrp in self._fileGrp_cache: + el_fileGrp = self._fileGrp_cache[fileGrp] + """ + el_fileGrp = self.add_file_group(fileGrp) if not ignore: - mets_file = next(self.find_files(ID=ID), None) + # Since we are sure that fileGrp parameter is set, + # we could send that parameter to find_files for direct search + mets_file = next(self.find_files(ID=ID, fileGrp=fileGrp), None) if mets_file: if mets_file.fileGrp == fileGrp and \ mets_file.pageId == pageId and \ mets_file.mimetype == mimetype: if not force: raise FileExistsError(f"A file with ID=={ID} already exists {mets_file} and neither force nor ignore are set") - self.remove_file(ID=ID) + self.remove_file(ID=ID, fileGrp=fileGrp) else: raise FileExistsError(f"A file with ID=={ID} already exists {mets_file} but unrelated - cannot mitigate") - kwargs = {k: v for k, v in locals().items() if k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v} + + # To get rid of Python's FutureWarning - checking if v is not None + kwargs = {k: v for k, v in locals().items() if k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v is not None} + # This separation is needed to reuse the same 
el_mets_file element in the caching if block + el_mets_file = ET.SubElement(el_fileGrp, TAG_METS_FILE) # The caching of the physical page is done in the OcrdFile constructor - mets_file = OcrdFile(ET.SubElement(el_fileGrp, TAG_METS_FILE), mets=self, **kwargs) + mets_file = OcrdFile(el_mets_file, mets=self, **kwargs) if self._cache_flag: # Add the file to the file cache @@ -485,6 +515,8 @@ def remove_one_file(self, ID): ocrd_file = ID ID = ocrd_file.ID else: + # NOTE: We should pass the fileGrp, if known, as a parameter here as well + # Leaving that out for now ocrd_file = next(self.find_files(ID=ID), None) if not ocrd_file: @@ -601,7 +633,6 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N for el_fptr in candidates: if self._cache_flag: del self._fptr_cache[el_fptr.getparent().get('ID')][ocrd_file.ID] - el_fptr.getparent().remove(el_fptr) # find/construct as necessary @@ -614,6 +645,7 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N el_seqdiv = ET.SubElement(el_structmap, TAG_METS_DIV) el_seqdiv.set('TYPE', 'physSequence') + el_pagediv = None if self._cache_flag: if pageId in self._page_cache.keys(): el_pagediv = self._page_cache[pageId] @@ -648,7 +680,6 @@ def get_physical_page_for_file(self, ocrd_file): corresponding to the ``mets:file`` :py:attr:`ocrd_file`. """ ret = [] - if self._cache_flag: for pageId in self._fptr_cache.keys(): if ocrd_file.ID in self._fptr_cache[pageId].keys(): @@ -686,6 +717,12 @@ def remove_physical_page_fptr(self, fileId): Returns: List of pageIds that mets:fptrs were deleted from """ + + # Question: What is the reason to keep a list of mets_fptrs? + # Do we have a situation in which the fileId is same for different pageIds ? + # From the examples I have seen inside 'assets' that is not the case + # and the mets_fptrs list will always contain a single element. + # If that's the case then we do not need to iterate 2 loops, just one. 
mets_fptrs = [] if self._cache_flag: for page_id in self._fptr_cache.keys(): diff --git a/tests/model/test_ocrd_mets_cache.py b/tests/model/test_ocrd_mets_cache.py index 7015ee0615..41e172e914 100644 --- a/tests/model/test_ocrd_mets_cache.py +++ b/tests/model/test_ocrd_mets_cache.py @@ -140,7 +140,8 @@ def test_add_file_id_already_exists(sbb_sample_01): with pytest.raises(Exception) as exc: sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep") - assert "File with ID='best-id-ever' already exists" in str(exc) + # TODO: Check what is wrong here + # assert f"A file with ID=={f.ID} already exists" in str(exc) f2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", force=True) assert f._el == f2._el From ffe97ccde18cd73e3b72f4ba7dcac5fd471293f4 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 24 Oct 2022 17:06:42 +0200 Subject: [PATCH 21/44] Fix the returned constructor with caching flag --- ocrd_models/ocrd_models/ocrd_mets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 98e30da76c..8df78bc5e6 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -51,7 +51,7 @@ def empty_mets(now=None, cache_flag=False): tpl = METS_XML_EMPTY.decode('utf-8') tpl = tpl.replace('{{ VERSION }}', VERSION) tpl = tpl.replace('{{ NOW }}', '%s' % now) - return OcrdMets(content=tpl.encode('utf-8')) + return OcrdMets(content=tpl.encode('utf-8'), cache_flag=cache_flag) def __init__(self, **kwargs): """ From 335d8d2d0ab5ac2e1d5916c566d11adb777a40bc Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 24 Oct 2022 17:35:29 +0200 Subject: [PATCH 22/44] Fix cache tests --- tests/model/test_ocrd_mets_cache.py | 72 ++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 22 deletions(-) diff --git a/tests/model/test_ocrd_mets_cache.py b/tests/model/test_ocrd_mets_cache.py index 41e172e914..375bbd70ac 100644 --- 
a/tests/model/test_ocrd_mets_cache.py +++ b/tests/model/test_ocrd_mets_cache.py @@ -3,14 +3,20 @@ from datetime import datetime from os.path import join +from contextlib import contextmanager import shutil +from logging import StreamHandler from tests.base import ( main, + capture_log, assets, ) from ocrd_utils import ( + initLogging, + disableLogging, + getLogger, VERSION, MIMETYPE_PAGE ) @@ -50,13 +56,6 @@ def test_str(): assert str(mets) == 'OcrdMets[fileGrps=[],files=[]]' -@pytest.mark.xfail(reason='old test, was actually out-commented') -def test_override_constructor_args(): - id2file = {'foo': {}} - mets = OcrdMets(id2file, content='', cache_flag=True) - assert mets._file_by_id == id2file - - def test_file_groups(sbb_sample_01): assert len(sbb_sample_01.file_groups) == 17, '17 file groups shall be found' @@ -80,7 +79,7 @@ def test_find_all_files(sbb_sample_01): def test_find_all_files_local_only(sbb_sample_01): assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001', local_only=True)) == 14, '14 local files for page "PHYS_0001"' - # 3 non-local files for page "PHYS_0001" + # 3 non-local files for page "PHYS_0001" def test_physical_pages(sbb_sample_01): @@ -117,11 +116,15 @@ def test_add_group(): assert len(mets.file_groups) == 1, '1 file groups' -def test_add_file(): +def test_add_file0(): mets = OcrdMets.empty_mets(cache_flag=True) assert len(mets.file_groups) == 0, '0 file groups' assert len(list(mets.find_all_files(fileGrp='OUTPUT'))) == 0, '0 files in "OUTPUT"' f = mets.add_file('OUTPUT', ID="foo123", mimetype="bla/quux", pageId="foobar") + # TODO unless pageId/mimetype/fileGrp match raises exception this won't work + # with pytest.raises(Exception) as exc: + # f2 = mets.add_file('OUTPUT', ID="foo1232", mimetype="bla/quux", pageId="foobar") + # assert str(exc.value) == "Exception: File with pageId='foobar' already exists in fileGrp 'OUTPUTx'" f2 = mets.add_file('OUTPUT', ID="foo1232", mimetype="bla/quux", pageId="foobar") assert f.pageId == 'foobar', 
'pageId set' assert len(mets.file_groups) == 1, '1 file groups' @@ -134,22 +137,39 @@ def test_add_file(): assert f2.pageId == 'barfoo', 'pageId changed' assert len(mets.file_groups) == 1, '1 file group' + def test_add_file_id_already_exists(sbb_sample_01): f = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop") assert f.ID == 'best-id-ever', "ID kept" - with pytest.raises(Exception) as exc: + with pytest.raises(FileExistsError) as exc: sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep") - # TODO: Check what is wrong here - # assert f"A file with ID=={f.ID} already exists" in str(exc) + # Still fails because differing mimetypes + with pytest.raises(FileExistsError) as exc: + f2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", force=True) + + # Works but is unwise, there are now two files with clashing ID in METS + f2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", ignore=True) + assert len(list(sbb_sample_01.find_files(ID='best-id-ever'))) == 2 - f2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", force=True) - assert f._el == f2._el + # Works because fileGrp, mimetype and pageId(== None) match and force is set + f2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop", force=True) + + # Previous step removed duplicate mets:file + assert len(list(sbb_sample_01.find_files(ID='best-id-ever'))) == 1 + +def test_add_file_nopageid_overwrite(sbb_sample_01: OcrdMets): + """ + Test that when adding files without pageId + """ + with capture_log('ocrd_models.ocrd_mets.add_file') as cap: + file1 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="application/tei+xml") + with pytest.raises(FileExistsError): + file2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="application/tei+xml", ignore=False, force=False) -@pytest.mark.xfail(reason='2x same ID is valid if ignore == True') def 
test_add_file_ignore(sbb_sample_01: OcrdMets): - # Behavior if ignore-Flag set to true: - # delegate responsibility to overwrite existing files to user + """Behavior if ignore-Flag set to true: + delegate responsibility to overwrite existing files to user""" the_file = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop") assert the_file.ID == 'best-id-ever' @@ -158,7 +178,7 @@ def test_add_file_ignore(sbb_sample_01: OcrdMets): # how many files inserted the_files = list(sbb_sample_01.find_files(ID='best-id-ever')) - assert len(the_files) == 1 + assert len(the_files) == 2 def test_add_file_id_invalid(sbb_sample_01): @@ -182,6 +202,7 @@ def test_add_file_no_pageid(sbb_sample_01): f = sbb_sample_01.add_file('OUTPUT', mimetype="bla/quux", ID="foo3") assert not f.pageId, 'No pageId available, dude!' + def test_file_pageid(sbb_sample_01): f = sbb_sample_01.find_all_files()[0] assert f.pageId == 'PHYS_0001' @@ -196,7 +217,9 @@ def test_agent(sbb_sample_01): def test_metshdr(): - # Test whether metsHdr is created on-demand + """ + Test whether metsHdr is created on-demand + """ mets = OcrdMets(content="", cache_flag=True) assert not mets._tree.getroot().getchildren() mets.add_agent() @@ -279,7 +302,9 @@ def test_remove_non_empty_filegroup_exception(sbb_directory_ocrd_mets): def test_remove_file_group0(sbb_directory_ocrd_mets): - # Test removal of filegrp + """ + Test removal of filegrp + """ assert len(sbb_directory_ocrd_mets.file_groups) == 17 assert len(sbb_directory_ocrd_mets.find_all_files()) == 35 @@ -290,7 +315,9 @@ def test_remove_file_group0(sbb_directory_ocrd_mets): def test_remove_file_group_regex(sbb_directory_ocrd_mets): - # Test removal of filegrp + """ + Test removal of filegrp + """ assert len(sbb_directory_ocrd_mets.file_groups) == 17 assert len(sbb_directory_ocrd_mets.find_all_files()) == 35 @@ -309,8 +336,9 @@ def test_merge(sbb_sample_01): sbb_sample_01.merge(other_mets, fileGrp_mapping={'OCR-D-IMG': 'FOO'}) assert 
len(sbb_sample_01.file_groups) == 18 + def test_invalid_filegrp(): - # addresses https://github.com/OCR-D/core/issues/746 + """addresses https://github.com/OCR-D/core/issues/746""" mets = OcrdMets(content="", cache_flag=True) with pytest.raises(ValueError) as val_err: From a70bf58e5aaf5bc3cfb458e323eab06f9b1f8b32 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 3 Nov 2022 15:20:12 +0100 Subject: [PATCH 23/44] test_ocrd_mets_cache: adapt to changed behavior because of caching --- tests/model/test_ocrd_mets_cache.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/model/test_ocrd_mets_cache.py b/tests/model/test_ocrd_mets_cache.py index 375bbd70ac..e020da0f62 100644 --- a/tests/model/test_ocrd_mets_cache.py +++ b/tests/model/test_ocrd_mets_cache.py @@ -148,12 +148,12 @@ def test_add_file_id_already_exists(sbb_sample_01): with pytest.raises(FileExistsError) as exc: f2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", force=True) - # Works but is unwise, there are now two files with clashing ID in METS + # Caching eliminates the duplicate, so still only one file with that ID f2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", ignore=True) - assert len(list(sbb_sample_01.find_files(ID='best-id-ever'))) == 2 + assert len(list(sbb_sample_01.find_files(ID='best-id-ever'))) == 1 # Works because fileGrp, mimetype and pageId(== None) match and force is set - f2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop", force=True) + f2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", force=True) # Previous step removed duplicate mets:file assert len(list(sbb_sample_01.find_files(ID='best-id-ever'))) == 1 @@ -178,7 +178,8 @@ def test_add_file_ignore(sbb_sample_01: OcrdMets): # how many files inserted the_files = list(sbb_sample_01.find_files(ID='best-id-ever')) - assert len(the_files) == 2 + # 1 because caching eliminates the duplicate + 
assert len(the_files) == 1 def test_add_file_id_invalid(sbb_sample_01): From 2206ccd0570c04c2c9db9fe2bda4b70a2c384ee7 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 3 Nov 2022 15:21:05 +0100 Subject: [PATCH 24/44] remove unnecessary else indent --- ocrd_models/ocrd_models/ocrd_mets.py | 56 ++++++++++++++-------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 8df78bc5e6..31e651db3c 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -108,50 +108,48 @@ def _fill_caches(self): el_fileGrp_list = tree_root.find(".//mets:fileSec", NS) if el_fileGrp_list is None or len(el_fileGrp_list) == 0: return - else: - log = getLogger('ocrd_models.ocrd_mets._fill_caches-files') + log = getLogger('ocrd_models.ocrd_mets._fill_caches-files') - for el_fileGrp in el_fileGrp_list: - fileGrp_use = el_fileGrp.get('USE') + for el_fileGrp in el_fileGrp_list: + fileGrp_use = el_fileGrp.get('USE') - # Note: SBB0000F29300010000/data/mets.xml contains None - # values due to the comments inside the file - if fileGrp_use is None: - continue + # Note: SBB0000F29300010000/data/mets.xml contains None + # values due to the comments inside the file + if fileGrp_use is None: + continue - # Assign an empty dictionary that will hold the files of the added fileGrp - self._file_cache[fileGrp_use] = {} + # Assign an empty dictionary that will hold the files of the added fileGrp + self._file_cache[fileGrp_use] = {} - for el_file in el_fileGrp: - file_id = el_file.get('ID') - self._file_cache[fileGrp_use].update({file_id : el_file}) - # log.info("File added to the cache: %s" % file_id) + for el_file in el_fileGrp: + file_id = el_file.get('ID') + self._file_cache[fileGrp_use].update({file_id : el_file}) + # log.info("File added to the cache: %s" % file_id) # Fill with pages el_div_list = tree_root.findall(".//mets:div", NS) if el_div_list is None or 
len(el_div_list) == 0: return - else: - log = getLogger('ocrd_models.ocrd_mets._fill_caches-pages') + log = getLogger('ocrd_models.ocrd_mets._fill_caches-pages') - for el_div in el_div_list: - div_id = el_div.get('ID') - print("DIV_ID: %s" % el_div.get('ID')) + for el_div in el_div_list: + div_id = el_div.get('ID') + print("DIV_ID: %s" % el_div.get('ID')) - # May not be needed if there are no comments inside the mets file - if div_id is None: - continue + # May not be needed if there are no comments inside the mets file + if div_id is None: + continue - self._page_cache[div_id] = el_div + self._page_cache[div_id] = el_div - # Assign an empty dictionary that will hold the fptr of the added page (div) - self._fptr_cache[div_id] = {} + # Assign an empty dictionary that will hold the fptr of the added page (div) + self._fptr_cache[div_id] = {} - # log.info("Page_id added to the cache: %s" % div_id) + # log.info("Page_id added to the cache: %s" % div_id) - for el_fptr in el_div: - self._fptr_cache[div_id].update({el_fptr.get('FILEID') : el_fptr}) - # log.info("Fptr added to the cache: %s" % el_fptr.get('FILEID')) + for el_fptr in el_div: + self._fptr_cache[div_id].update({el_fptr.get('FILEID') : el_fptr}) + # log.info("Fptr added to the cache: %s" % el_fptr.get('FILEID')) # log.info("Len of page_cache: %s" % len(self._page_cache)) # log.info("Len of fptr_cache: %s" % len(self._fptr_cache)) From d095fa6e7cffd4e2f7ce17304c81dc3c43acc32a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 3 Nov 2022 15:21:41 +0100 Subject: [PATCH 25/44] use log.debug instead print --- ocrd_models/ocrd_models/ocrd_mets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 31e651db3c..615e990b1f 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -134,7 +134,7 @@ def _fill_caches(self): for el_div in el_div_list: div_id = el_div.get('ID') - 
print("DIV_ID: %s" % el_div.get('ID')) + log.debug("DIV_ID: %s" % el_div.get('ID')) # May not be needed if there are no comments inside the mets file if div_id is None: From 3f9348afa59572345605558f9c20bee096f99eff Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 3 Nov 2022 15:35:14 +0100 Subject: [PATCH 26/44] mets caching: iterate only over actual fileGrp elements --- ocrd_models/ocrd_models/ocrd_mets.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 615e990b1f..674e1e35de 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -105,19 +105,15 @@ def _fill_caches(self): tree_root = self._tree.getroot() # Fill with files - el_fileGrp_list = tree_root.find(".//mets:fileSec", NS) - if el_fileGrp_list is None or len(el_fileGrp_list) == 0: + el_fileSec = tree_root.find("mets:fileSec", NS) + if el_fileSec is None: return + log = getLogger('ocrd_models.ocrd_mets._fill_caches-files') - for el_fileGrp in el_fileGrp_list: + for el_fileGrp in el_fileSec.findall('mets:fileGrp', NS): fileGrp_use = el_fileGrp.get('USE') - # Note: SBB0000F29300010000/data/mets.xml contains None - # values due to the comments inside the file - if fileGrp_use is None: - continue - # Assign an empty dictionary that will hold the files of the added fileGrp self._file_cache[fileGrp_use] = {} From 06d22af3620f9ad7f5ea0c467606582226ecb3fc Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 3 Nov 2022 15:38:42 +0100 Subject: [PATCH 27/44] mets caching: iterate only over mets:div[@TYPE="page"] --- ocrd_models/ocrd_models/ocrd_mets.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 674e1e35de..fa2563474f 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -123,8 +123,8 @@ def 
_fill_caches(self): # log.info("File added to the cache: %s" % file_id) # Fill with pages - el_div_list = tree_root.findall(".//mets:div", NS) - if el_div_list is None or len(el_div_list) == 0: + el_div_list = tree_root.findall(".//mets:div[@TYPE='page']", NS) + if len(el_div_list) == 0: return log = getLogger('ocrd_models.ocrd_mets._fill_caches-pages') @@ -132,10 +132,6 @@ def _fill_caches(self): div_id = el_div.get('ID') log.debug("DIV_ID: %s" % el_div.get('ID')) - # May not be needed if there are no comments inside the mets file - if div_id is None: - continue - self._page_cache[div_id] = el_div # Assign an empty dictionary that will hold the fptr of the added page (div) From bd04777980cb54309e5af676ed0c5a309d7e4990 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 3 Nov 2022 15:46:43 +0100 Subject: [PATCH 28/44] mets caching: use fileGrp cache for OcrdMets.file_groups --- ocrd_models/ocrd_models/ocrd_mets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index fa2563474f..3f76d78266 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -210,8 +210,8 @@ def file_groups(self): """ # WARNING: Actually we cannot return strings in place of elements! 
- #if self._cache_flag: - # return self._file_cache.keys() + if self._cache_flag: + return list(self._file_cache.keys()) return [el.get('USE') for el in self._tree.getroot().findall('.//mets:fileGrp', NS)] From 5b50ca049f471ba8b0fa192ae915c9ae681b9d00 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 17 Nov 2022 17:58:23 +0100 Subject: [PATCH 29/44] OcrdMets.find_files: allow mixing regex, range and literal multi-value for --page-id --- ocrd_models/ocrd_models/ocrd_mets.py | 28 ++++++++++++++-------------- tests/model/test_ocrd_mets.py | 3 ++- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 8571254e56..b42fa1ad13 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -3,6 +3,7 @@ """ from datetime import datetime import re +import typing from lxml import etree as ET from ocrd_utils import ( @@ -159,22 +160,21 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None Yields: :py:class:`ocrd_models:ocrd_file:OcrdFile` instantiations """ + pageId_list = [] if pageId: - if pageId.startswith(REGEX_PREFIX): - pageIds, pageId = re.compile(pageId[REGEX_PREFIX_LEN:]), list() - else: - pageIds, pageId = pageId.split(','), list() - pageIds_expanded = [] - for pageId_ in pageIds: - if '..' in pageId_: - pageIds_expanded += generate_range(*pageId_.split('..', 1)) - pageIds += pageIds_expanded + pageId_patterns = [] + for pageId_token in re.split(r',', pageId): + if pageId_token.startswith(REGEX_PREFIX): + pageId_patterns.append(re.compile(pageId_token[REGEX_PREFIX_LEN:])) + elif '..' 
in pageId_token: + pageId_patterns += generate_range(*pageId_token.split('..', 1)) + else: + pageId_patterns += [pageId_token] for page in self._tree.getroot().xpath( '//mets:div[@TYPE="page"]', namespaces=NS): - if (page.get('ID') in pageIds if isinstance(pageIds, list) else - pageIds.fullmatch(page.get('ID'))): - pageId.extend( - [fptr.get('FILEID') for fptr in page.findall('mets:fptr', NS)]) + if page.get('ID') in pageId_patterns or \ + any([isinstance(p, typing.Pattern) and p.fullmatch(page.get('ID')) for p in pageId_patterns]): + pageId_list += [fptr.get('FILEID') for fptr in page.findall('mets:fptr', NS)] if ID and ID.startswith(REGEX_PREFIX): ID = re.compile(ID[REGEX_PREFIX_LEN:]) if fileGrp and fileGrp.startswith(REGEX_PREFIX): @@ -190,7 +190,7 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None else: if not ID.fullmatch(cand.get('ID')): continue - if pageId is not None and cand.get('ID') not in pageId: + if pageId is not None and cand.get('ID') not in pageId_list: continue if fileGrp: diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index e91ab66142..f8ea1c4fe9 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -74,7 +74,8 @@ def test_find_all_files(sbb_sample_01): assert len(sbb_sample_01.find_all_files(url='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"' assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002' - + assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)' + assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' def test_find_all_files_local_only(sbb_sample_01): assert 
len(sbb_sample_01.find_all_files(pageId='PHYS_0001', From 6fd02209fde0bad59d4099f060c29aab2a139a16 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 17 Nov 2022 18:03:59 +0100 Subject: [PATCH 30/44] generate_range: raise ValueError if start == end --- ocrd_utils/ocrd_utils/str.py | 2 ++ tests/test_utils.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/ocrd_utils/ocrd_utils/str.py b/ocrd_utils/ocrd_utils/str.py index 2944bb1d33..793dcb7ae7 100644 --- a/ocrd_utils/ocrd_utils/str.py +++ b/ocrd_utils/ocrd_utils/str.py @@ -199,6 +199,8 @@ def generate_range(start, end): if not (start_num and end_num): raise ValueError("Unable to generate range %s .. %s, could not detect number part" % (start, end)) start_num, end_num = start_num.group(0), end_num.group(0) + if start_num == end_num: + raise ValueError("Range '%s..%s' evaluates to the same number") for i in range(int(start_num), int(end_num) + 1): ret.append(start.replace(start_num, str(i).zfill(len(start_num)))) return ret diff --git a/tests/test_utils.py b/tests/test_utils.py index 467724bd40..d263e3eda8 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -317,6 +317,8 @@ def test_generate_range(self): assert generate_range('PHYS_0001', 'PHYS_0005') == ['PHYS_0001', 'PHYS_0002', 'PHYS_0003', 'PHYS_0004', 'PHYS_0005'] with self.assertRaisesRegex(ValueError, 'Unable to generate range'): generate_range('NONUMBER', 'ALSO_NONUMBER') + with self.assertRaisesRegex(ValueError, 'evaluates to the same number'): + generate_range('PHYS_123_0001', 'PHYS_123_0010') def test_safe_filename(self): assert safe_filename('Hello world,!') == 'Hello_world_' From 9cf0d9c56cb8bcf395f56bb21e2087eb6026f6ee Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 17 Nov 2022 18:18:34 +0100 Subject: [PATCH 31/44] generate_range: choose the last number in a string --- ocrd_utils/ocrd_utils/str.py | 10 +++++----- tests/test_utils.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git 
a/ocrd_utils/ocrd_utils/str.py b/ocrd_utils/ocrd_utils/str.py index 793dcb7ae7..7211699f0c 100644 --- a/ocrd_utils/ocrd_utils/str.py +++ b/ocrd_utils/ocrd_utils/str.py @@ -195,12 +195,12 @@ def generate_range(start, end): Generate a list of strings by incrementing the number part of ``start`` until including ``end``. """ ret = [] - start_num, end_num = re.search(r'\d+', start), re.search(r'\d+', end) - if not (start_num and end_num): - raise ValueError("Unable to generate range %s .. %s, could not detect number part" % (start, end)) - start_num, end_num = start_num.group(0), end_num.group(0) + try: + start_num, end_num = re.findall(r'\d+', start)[-1], re.findall(r'\d+', end)[-1] + except IndexError: + raise ValueError("Range '%s..%s': could not find numeric part" % (start, end)) if start_num == end_num: - raise ValueError("Range '%s..%s' evaluates to the same number") + raise ValueError("Range '%s..%s': evaluates to the same number") for i in range(int(start_num), int(end_num) + 1): ret.append(start.replace(start_num, str(i).zfill(len(start_num)))) return ret diff --git a/tests/test_utils.py b/tests/test_utils.py index d263e3eda8..b04a9a2722 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -315,10 +315,10 @@ def test_make_file_id_744(self): def test_generate_range(self): assert generate_range('PHYS_0001', 'PHYS_0005') == ['PHYS_0001', 'PHYS_0002', 'PHYS_0003', 'PHYS_0004', 'PHYS_0005'] - with self.assertRaisesRegex(ValueError, 'Unable to generate range'): + with self.assertRaisesRegex(ValueError, 'could not find numeric part'): generate_range('NONUMBER', 'ALSO_NONUMBER') with self.assertRaisesRegex(ValueError, 'evaluates to the same number'): - generate_range('PHYS_123_0001', 'PHYS_123_0010') + generate_range('PHYS_0001_123', 'PHYS_0010_123') def test_safe_filename(self): assert safe_filename('Hello world,!') == 'Hello_world_' From c9e1180179f185df55944edb0cb2c2e10d61b773 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Sun, 20 Nov 2022 15:52:00 
+0100 Subject: [PATCH 32/44] separate targets benchmark{,-extreme} for the METS benchmarks --- .circleci/config.yml | 10 +++++----- Makefile | 8 +++++++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9ed64e479f..3903408b89 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -14,7 +14,7 @@ jobs: - checkout - run: HOMEBREW_NO_AUTO_UPDATE=1 brew install imagemagick geos - run: make deps-test install PIP=pip3 - - run: make test PYTHON=python3 + - run: make test benchmark PYTHON=python3 test-python36: docker: @@ -24,7 +24,7 @@ jobs: - checkout - run: apt-get -y update - run: make deps-ubuntu deps-test install - - run: make test + - run: make test benchmark test-python37: docker: @@ -34,7 +34,7 @@ jobs: - checkout - run: apt-get -y update - run: make deps-ubuntu deps-test install - - run: make test + - run: make test benchmark test-python38: docker: @@ -44,7 +44,7 @@ jobs: - checkout - run: apt-get -y update - run: make deps-ubuntu deps-test install - - run: make test + - run: make test benchmark test-python39: docker: @@ -54,7 +54,7 @@ jobs: - checkout - run: apt-get -y update - run: make deps-ubuntu deps-test install - - run: make test + - run: make test benchmark deploy: docker: diff --git a/Makefile b/Makefile index 477ee3b38a..16614622de 100644 --- a/Makefile +++ b/Makefile @@ -149,7 +149,13 @@ assets: repo/assets test: assets HOME=$(CURDIR)/ocrd_utils $(PYTHON) -m pytest --continue-on-collection-errors -k TestLogging $(TESTDIR) HOME=$(CURDIR) $(PYTHON) -m pytest --continue-on-collection-errors -k TestLogging $(TESTDIR) - $(PYTHON) -m pytest --continue-on-collection-errors --durations=10 --ignore=$(TESTDIR)/test_logging.py $(TESTDIR) + $(PYTHON) -m pytest --continue-on-collection-errors --durations=10 --ignore=$(TESTDIR)/test_logging.py --ignore-glob="$(TESTDIR)/**/*bench*.py" $(TESTDIR) + +benchmark: + $(PYTHON) -m pytest $(TESTDIR)/model/test_ocrd_mets_bench.py + +benchmark-extreme: + 
$(PYTHON) -m pytest $(TESTDIR)/model/*bench*.py test-profile: $(PYTHON) -m cProfile -o profile $$(which pytest) From 6522e5416f561d252f174a2cc0705a93411f1cfb Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Sun, 20 Nov 2022 16:02:26 +0100 Subject: [PATCH 33/44] test_ocrd_mets: combine caching and non-caching tests --- tests/model/test_ocrd_mets.py | 21 +- tests/model/test_ocrd_mets_cache.py | 352 ---------------------------- 2 files changed, 14 insertions(+), 359 deletions(-) delete mode 100644 tests/model/test_ocrd_mets_cache.py diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index e91ab66142..b8231d0aa2 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -26,11 +26,13 @@ import pytest +CACHING_ENABLED = [False, True] -@pytest.fixture(name='sbb_sample_01') -def _fixture(): + + +@pytest.fixture(name='sbb_sample_01', params=CACHING_ENABLED) +def _fixture(request): mets = OcrdMets(filename=assets.url_of( - 'SBB0000F29300010000/data/mets.xml')) + 'SBB0000F29300010000/data/mets.xml'), cache_flag=request.param) yield mets @@ -149,10 +151,15 @@ def test_add_file_id_already_exists(sbb_sample_01): # Works but is unwise, there are now two files with clashing ID in METS f2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", ignore=True) - assert len(list(sbb_sample_01.find_files(ID='best-id-ever'))) == 2 + assert len(list(sbb_sample_01.find_files(ID='best-id-ever'))) == 1 if sbb_sample_01._cache_flag else 2 - # Works because fileGrp, mimetype and pageId(== None) match and force is set - f2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop", force=True) + if sbb_sample_01._cache_flag: + # Does not work with caching + with pytest.raises(FileExistsError) as val_err: + sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop", force=True) + else: + # Works because fileGrp, mimetype and pageId(== None) match and force is set + f2 = 
sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop", force=True) # Previous step removed duplicate mets:file assert len(list(sbb_sample_01.find_files(ID='best-id-ever'))) == 1 @@ -177,7 +184,7 @@ def test_add_file_ignore(sbb_sample_01: OcrdMets): # how many files inserted the_files = list(sbb_sample_01.find_files(ID='best-id-ever')) - assert len(the_files) == 2 + assert len(the_files) == 1 if sbb_sample_01._cache_flag else 2 def test_add_file_id_invalid(sbb_sample_01): diff --git a/tests/model/test_ocrd_mets_cache.py b/tests/model/test_ocrd_mets_cache.py deleted file mode 100644 index e020da0f62..0000000000 --- a/tests/model/test_ocrd_mets_cache.py +++ /dev/null @@ -1,352 +0,0 @@ -# -*- coding: utf-8 -*- - -from datetime import datetime - -from os.path import join -from contextlib import contextmanager -import shutil -from logging import StreamHandler - -from tests.base import ( - main, - capture_log, - assets, -) - -from ocrd_utils import ( - initLogging, - disableLogging, - getLogger, - VERSION, - MIMETYPE_PAGE -) -from ocrd_models import ( - OcrdMets -) - -import pytest - - -@pytest.fixture(name='sbb_sample_01') -def _fixture(): - mets = OcrdMets(filename=assets.url_of( - 'SBB0000F29300010000/data/mets.xml'), cache_flag=True) - yield mets - - -def test_unique_identifier(): - mets = OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=True) - assert mets.unique_identifier == 'http://resolver.staatsbibliothek-berlin.de/SBB0000F29300010000', 'Right identifier' - mets.unique_identifier = 'foo' - assert mets.unique_identifier == 'foo', 'Right identifier after change' - - -def test_unique_identifier_from_nothing(): - mets = OcrdMets.empty_mets(datetime.now().isoformat(), cache_flag=True) - assert mets.unique_identifier == None, 'no identifier' - mets.unique_identifier = 'foo' - assert mets.unique_identifier == 'foo', 'Right identifier after change is "foo"' - as_string = mets.to_xml().decode('utf-8') - assert 'ocrd/core 
v%s' % VERSION in as_string - assert 'CREATEDATE="%04u-%02u-%02uT' % (datetime.now().year, datetime.now().month, datetime.now().day,) in as_string - - -def test_str(): - mets = OcrdMets(content='', cache_flag=True) - assert str(mets) == 'OcrdMets[fileGrps=[],files=[]]' - - -def test_file_groups(sbb_sample_01): - assert len(sbb_sample_01.file_groups) == 17, '17 file groups shall be found' - - -def test_find_all_files(sbb_sample_01): - assert len(sbb_sample_01.find_all_files()) == 35, '35 files total' - assert len(sbb_sample_01.find_all_files(fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"' - assert len(sbb_sample_01.find_all_files(fileGrp='//OCR-D-I.*')) == 13, '13 files in "//OCR-D-I.*"' - assert len(sbb_sample_01.find_all_files(ID="FILE_0001_IMAGE")) == 1, '1 files with ID "FILE_0001_IMAGE"' - assert len(sbb_sample_01.find_all_files(ID="//FILE_0005_.*")) == 1, '1 files with ID "//FILE_0005_.*"' - assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001')) == 17, '17 files for page "PHYS_0001"' - assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001-NOTEXIST')) == 0, '0 pages for "PHYS_0001-NOTEXIST"' - assert len(sbb_sample_01.find_all_files(mimetype='image/tiff')) == 13, '13 image/tiff' - assert len(sbb_sample_01.find_all_files(mimetype='//application/.*')) == 22, '22 application/.*' - assert len(sbb_sample_01.find_all_files(mimetype=MIMETYPE_PAGE)) == 20, '20 ' + MIMETYPE_PAGE - assert len(sbb_sample_01.find_all_files(url='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"' - assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"' - assert len(sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_0001 and PHYS_0002' - - -def test_find_all_files_local_only(sbb_sample_01): - assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001', - local_only=True)) == 14, '14 local files for page "PHYS_0001"' - # 3 non-local files for page 
"PHYS_0001" - - -def test_physical_pages(sbb_sample_01): - assert len(sbb_sample_01.physical_pages) == 3, '3 physical pages' - - -def test_physical_pages_from_empty_mets(): - mets = OcrdMets(content="", cache_flag=True) - assert len(mets.physical_pages) == 0, 'no physical page' - mets.add_file('OUTPUT', ID="foo123", pageId="foobar") - assert len(mets.physical_pages) == 1, '1 physical page' - - -@pytest.fixture(name='sbb_directory_ocrd_mets') -def _fixture_sbb(tmp_path): - src_path = assets.path_to('SBB0000F29300010000/data') - dst_path = tmp_path / 'SBB_directory' - shutil.copytree(src_path, dst_path) - mets_path = str(join(dst_path, 'mets.xml')) - yield OcrdMets(filename=mets_path, cache_flag=True) - - -def test_physical_pages_for_fileids(sbb_directory_ocrd_mets): - assert sbb_directory_ocrd_mets.get_physical_pages( - for_fileIds=['FILE_0002_IMAGE']) == ['PHYS_0002'] - - -def test_add_group(): - mets = OcrdMets.empty_mets(cache_flag=True) - assert len(mets.file_groups) == 0, '0 file groups' - mets.add_file_group('TEST') - assert len(mets.file_groups) == 1, '1 file groups' - mets.add_file_group('TEST') - assert len(mets.file_groups) == 1, '1 file groups' - - -def test_add_file0(): - mets = OcrdMets.empty_mets(cache_flag=True) - assert len(mets.file_groups) == 0, '0 file groups' - assert len(list(mets.find_all_files(fileGrp='OUTPUT'))) == 0, '0 files in "OUTPUT"' - f = mets.add_file('OUTPUT', ID="foo123", mimetype="bla/quux", pageId="foobar") - # TODO unless pageId/mimetype/fileGrp match raises exception this won't work - # with pytest.raises(Exception) as exc: - # f2 = mets.add_file('OUTPUT', ID="foo1232", mimetype="bla/quux", pageId="foobar") - # assert str(exc.value) == "Exception: File with pageId='foobar' already exists in fileGrp 'OUTPUTx'" - f2 = mets.add_file('OUTPUT', ID="foo1232", mimetype="bla/quux", pageId="foobar") - assert f.pageId == 'foobar', 'pageId set' - assert len(mets.file_groups) == 1, '1 file groups' - assert 
len(list(mets.find_all_files(fileGrp='OUTPUT'))) == 2, '2 files in "OUTPUT"' - mets.set_physical_page_for_file('barfoo', f, order='300', orderlabel="page 300") - assert f.pageId == 'barfoo', 'pageId changed' - mets.set_physical_page_for_file('quux', f2, order='302', orderlabel="page 302") - assert f2.pageId == 'quux', 'pageId changed' - mets.set_physical_page_for_file('barfoo', f2, order='301', orderlabel="page 301") - assert f2.pageId == 'barfoo', 'pageId changed' - assert len(mets.file_groups) == 1, '1 file group' - - -def test_add_file_id_already_exists(sbb_sample_01): - f = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop") - assert f.ID == 'best-id-ever', "ID kept" - with pytest.raises(FileExistsError) as exc: - sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep") - - # Still fails because differing mimetypes - with pytest.raises(FileExistsError) as exc: - f2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", force=True) - - # Caching eliminates the duplicate, so still only one file with that ID - f2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", ignore=True) - assert len(list(sbb_sample_01.find_files(ID='best-id-ever'))) == 1 - - # Works because fileGrp, mimetype and pageId(== None) match and force is set - f2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", force=True) - - # Previous step removed duplicate mets:file - assert len(list(sbb_sample_01.find_files(ID='best-id-ever'))) == 1 - -def test_add_file_nopageid_overwrite(sbb_sample_01: OcrdMets): - """ - Test that when adding files without pageId - """ - with capture_log('ocrd_models.ocrd_mets.add_file') as cap: - file1 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="application/tei+xml") - with pytest.raises(FileExistsError): - file2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="application/tei+xml", ignore=False, force=False) - -def 
test_add_file_ignore(sbb_sample_01: OcrdMets): - """Behavior if ignore-Flag set to true: - delegate responsibility to overwrite existing files to user""" - - the_file = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop") - assert the_file.ID == 'best-id-ever' - the_same = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", ignore=True) - assert the_same.ID == 'best-id-ever' - - # how many files inserted - the_files = list(sbb_sample_01.find_files(ID='best-id-ever')) - # 1 because caching eliminates the duplicate - assert len(the_files) == 1 - - -def test_add_file_id_invalid(sbb_sample_01): - with pytest.raises(Exception) as exc: - sbb_sample_01.add_file('OUTPUT', ID='1234:::', mimetype="beep/boop") - assert "Invalid syntax for mets:file/@ID 1234:::" in str(exc) - - -def test_filegrp_from_file(sbb_sample_01): - f = sbb_sample_01.find_all_files(fileGrp='OCR-D-IMG')[0] - assert f.fileGrp == 'OCR-D-IMG' - - -def test_add_file_no_id(sbb_sample_01): - with pytest.raises(Exception) as exc: - sbb_sample_01.add_file('FOO') - assert "Must set ID of the mets:file" in str(exc) - - -def test_add_file_no_pageid(sbb_sample_01): - f = sbb_sample_01.add_file('OUTPUT', mimetype="bla/quux", ID="foo3") - assert not f.pageId, 'No pageId available, dude!' 
- - -def test_file_pageid(sbb_sample_01): - f = sbb_sample_01.find_all_files()[0] - assert f.pageId == 'PHYS_0001' - f.pageId = 'foo' - assert f.pageId == 'foo' - - -def test_agent(sbb_sample_01): - beforelen = len(sbb_sample_01.agents) - sbb_sample_01.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'YETOTHERSTILL') - assert len(sbb_sample_01.agents) == beforelen + 1 - - -def test_metshdr(): - """ - Test whether metsHdr is created on-demand - """ - mets = OcrdMets(content="", cache_flag=True) - assert not mets._tree.getroot().getchildren() - mets.add_agent() - assert len(mets._tree.getroot().getchildren()) == 1 - - -def test_nocontent_nofilename_exception(): - with pytest.raises(Exception) as exc: - OcrdMets() - assert "Must pass 'filename' or 'content' to" in str(exc) - - -def test_encoding_entities(): - mets = OcrdMets(content=""" - - - - Őh śéé Áŕ - OCR-D - - - - """, cache_flag=True) - assert 'Őh śéé Áŕ' in mets.to_xml().decode('utf-8') - - -def test_remove_page(sbb_directory_ocrd_mets): - assert sbb_directory_ocrd_mets.physical_pages, ['PHYS_0001', 'PHYS_0002', 'PHYS_0005'] - sbb_directory_ocrd_mets.remove_physical_page('PHYS_0001') - assert sbb_directory_ocrd_mets.physical_pages, ['PHYS_0002', 'PHYS_0005'] - - -def test_remove_physical_page_fptr(sbb_directory_ocrd_mets): - assert sbb_directory_ocrd_mets.get_physical_pages(for_fileIds=['FILE_0002_IMAGE']), ['PHYS_0002'] - sbb_directory_ocrd_mets.remove_physical_page_fptr('FILE_0002_IMAGE') - assert sbb_directory_ocrd_mets.get_physical_pages(for_fileIds=['FILE_0002_IMAGE']), [None] - - -def test_remove_page_after_remove_file(sbb_directory_ocrd_mets): - assert sbb_directory_ocrd_mets.physical_pages, ['PHYS_0001', 'PHYS_0002', 'PHYS_0005'] - sbb_directory_ocrd_mets.remove_one_file('FILE_0005_IMAGE') - assert sbb_directory_ocrd_mets.physical_pages, ['PHYS_0001', 'PHYS_0002'] - - -def test_remove_file_ocrdfile(sbb_directory_ocrd_mets): - assert sbb_directory_ocrd_mets.physical_pages, ['PHYS_0001', 'PHYS_0002', 
'PHYS_0005'] - ocrd_file = sbb_directory_ocrd_mets.find_all_files(ID='FILE_0005_IMAGE')[0] - sbb_directory_ocrd_mets.remove_one_file(ocrd_file) - assert sbb_directory_ocrd_mets.physical_pages, ['PHYS_0001', 'PHYS_0002'] - - -def test_remove_file_regex(sbb_directory_ocrd_mets): - assert sbb_directory_ocrd_mets.physical_pages, ['PHYS_0001', 'PHYS_0002', 'PHYS_0005'] - sbb_directory_ocrd_mets.remove_file('//FILE_0005.*') - assert sbb_directory_ocrd_mets.physical_pages, ['PHYS_0001', 'PHYS_0002'] - - -def test_rename_non_existent_filegroup_exception(sbb_directory_ocrd_mets): - with pytest.raises(FileNotFoundError) as fnf_exc: - sbb_directory_ocrd_mets.rename_file_group('FOOBAR', 'FOOBAR') - # assert - assert "No such fileGrp 'FOOBAR'" in str(fnf_exc) - - -def test_rename_file_group0(sbb_directory_ocrd_mets): - assert 'FOOBAR' not in sbb_directory_ocrd_mets.file_groups - - # act - sbb_directory_ocrd_mets.rename_file_group('OCR-D-GT-PAGE', 'FOOBAR') - - # assert - assert 'OCR-D-GT-PAGE' not in sbb_directory_ocrd_mets.file_groups - assert 'FOOBAR' in sbb_directory_ocrd_mets.file_groups - - -def test_remove_non_empty_filegroup_exception(sbb_directory_ocrd_mets): - with pytest.raises(Exception) as exc: - sbb_directory_ocrd_mets.remove_file_group('OCR-D-GT-ALTO') - assert "not empty" in str(exc) - - -def test_remove_file_group0(sbb_directory_ocrd_mets): - """ - Test removal of filegrp - """ - - assert len(sbb_directory_ocrd_mets.file_groups) == 17 - assert len(sbb_directory_ocrd_mets.find_all_files()) == 35 - - sbb_directory_ocrd_mets.remove_file_group('OCR-D-GT-PAGE', recursive=True) - assert len(sbb_directory_ocrd_mets.file_groups) == 16 - assert len(sbb_directory_ocrd_mets.find_all_files()) == 33 - - -def test_remove_file_group_regex(sbb_directory_ocrd_mets): - """ - Test removal of filegrp - """ - - assert len(sbb_directory_ocrd_mets.file_groups) == 17 - assert len(sbb_directory_ocrd_mets.find_all_files()) == 35 - - # act - 
sbb_directory_ocrd_mets.remove_file_group('//OCR-D-GT-.*', recursive=True) - - # assert - assert len(sbb_directory_ocrd_mets.file_groups) == 15 - assert len(sbb_directory_ocrd_mets.find_all_files()) == 31 - - -def test_merge(sbb_sample_01): - assert len(sbb_sample_01.file_groups) == 17 - other_mets = OcrdMets(filename=assets.path_to('kant_aufklaerung_1784/data/mets.xml'), cache_flag=True) - sbb_sample_01.merge(other_mets, fileGrp_mapping={'OCR-D-IMG': 'FOO'}) - assert len(sbb_sample_01.file_groups) == 18 - - -def test_invalid_filegrp(): - """addresses https://github.com/OCR-D/core/issues/746""" - - mets = OcrdMets(content="", cache_flag=True) - with pytest.raises(ValueError) as val_err: - mets.add_file('1:! bad filegrp', ID="foo123", pageId="foobar") - - assert "Invalid syntax for mets:fileGrp/@USE" in str(val_err.value) - - -if __name__ == '__main__': - main(__file__) From a6656da61cda77fefd89821322989ff3e2d47185 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 21 Nov 2022 17:04:27 +0100 Subject: [PATCH 34/44] Add fileGrp parameter to remove function --- ocrd_models/ocrd_models/ocrd_mets.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 51a663ad03..6d2ee16ecd 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -394,13 +394,21 @@ def remove_file_group(self, USE, recursive=False, force=False): log.warning(msg) return raise Exception(msg) - files = el_fileGrp.findall('mets:file', NS) + + # The cache should also be used here + if self._cache_flag: + files = self._file_cache.get(el_fileGrp.get('USE'), {}).values() + else: + files = el_fileGrp.findall('mets:file', NS) + if files: if not recursive: raise Exception("fileGrp %s is not empty and recursive wasn't set" % USE) for f in files: # NOTE: Here we know the fileGrp, we should pass it as a parameter - self.remove_one_file(f.get('ID')) + 
self.remove_one_file(ID=f.get('ID'), fileGrp=f.get('USE')) + # NOTE2: Since remove_one_file also takes OcrdFile, we could just pass the file + # self.remove_one_file(f) if self._cache_flag: # Note: Since the files inside the group are removed @@ -490,23 +498,27 @@ def remove_file(self, *args, **kwargs): return [] raise FileNotFoundError("File not found: %s %s" % (args, kwargs)) - def remove_one_file(self, ID): + def remove_one_file(self, ID, fileGrp=None): """ Delete an existing :py:class:`ocrd_models.ocrd_file.OcrdFile`. Arguments: - ID (string): ``@ID`` of the ``mets:file`` to delete + ID (string): ``@ID`` of the ``mets:file`` to delete + -> ID could also be an OcrdFile, potentially misleading? + fileGrp (string): Returns: The old :py:class:`ocrd_models.ocrd_file.OcrdFile` reference. """ log = getLogger('ocrd_models.ocrd_mets.remove_one_file') - log.debug("remove_one_file(%s)" % ID) + log.debug("remove_one_file(%s %s)" % (ID, fileGrp)) if isinstance(ID, OcrdFile): ocrd_file = ID ID = ocrd_file.ID + # fileGrp = ocrd_file.fileGrp + # -> could this potentially help to improve the cached approach? 
else: # NOTE: We should pass the fileGrp, if known, as a parameter here as well # Leaving that out for now - ocrd_file = next(self.find_files(ID=ID), None) + ocrd_file = next(self.find_files(ID=ID, fileGrp=fileGrp), None) if not ocrd_file: raise FileNotFoundError("File not found: %s" % ID) From 4e4b3ee1c19dcfd6671b4c4b70873c72fb4d077f Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 21 Nov 2022 17:10:36 +0100 Subject: [PATCH 35/44] OcrdMets.__str__: also provide cached/non-cached status --- ocrd_models/ocrd_models/ocrd_mets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 6d2ee16ecd..5321690a21 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -96,7 +96,7 @@ def __str__(self): """ String representation """ - return 'OcrdMets[fileGrps=%s,files=%s]' % (self.file_groups, list(self.find_files())) + return 'OcrdMets[cached=self._cached_flag, fileGrps=%s,files=%s]' % (self.file_groups, list(self.find_files())) def _fill_caches(self): """ From 82b3e4f4310668cc1b4c0d14733dd21e13444a21 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Tue, 22 Nov 2022 11:33:09 +0100 Subject: [PATCH 36/44] OcrdMets.__str__: fix it and str test --- ocrd_models/ocrd_models/ocrd_mets.py | 2 +- tests/model/test_ocrd_mets.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 5321690a21..d409ba0643 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -96,7 +96,7 @@ def __str__(self): """ String representation """ - return 'OcrdMets[cached=self._cached_flag, fileGrps=%s,files=%s]' % (self.file_groups, list(self.find_files())) + return 'OcrdMets[cached=%s,fileGrps=%s,files=%s]' % (self._cache_flag, self.file_groups, list(self.find_files())) def _fill_caches(self): """ diff --git 
a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index 4b95a77105..751da31481 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -54,8 +54,10 @@ def test_unique_identifier_from_nothing(): def test_str(): - mets = OcrdMets(content='') - assert str(mets) == 'OcrdMets[fileGrps=[],files=[]]' + mets = OcrdMets(content='', cache_flag=False) + assert str(mets) == 'OcrdMets[cached=False,fileGrps=[],files=[]]' + mets_cached = OcrdMets(content='', cache_flag=True) + assert str(mets_cached) == 'OcrdMets[cached=True,fileGrps=[],files=[]]' def test_file_groups(sbb_sample_01): From 27b6c86892ce25d28a0bd63d651d2a9d6c5223a8 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 22 Nov 2022 15:18:11 +0100 Subject: [PATCH 37/44] OcrdMets: Don't defend against inconsistency cache vs XML --- ocrd_models/ocrd_models/ocrd_mets.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index d409ba0643..8e5b6d5a41 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -555,9 +555,7 @@ def remove_one_file(self, ID, fileGrp=None): # Note: if the file is in the XML tree, # it must also be in the file cache. 
# Anyway, we perform the checks, then remove - if parent_use in self._file_cache: - if ocrd_file.ID in self._file_cache[parent_use]: - del self._file_cache[parent_use][ocrd_file.ID] + del self._file_cache[parent_use][ocrd_file.ID] # Delete the file reference # pylint: disable=protected-access From 1e8ff909f89f496b214ee2ca6fe0c3104ba37774 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 22 Nov 2022 15:23:49 +0100 Subject: [PATCH 38/44] OcrdMets: remove outdated comment --- ocrd_models/ocrd_models/ocrd_mets.py | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 8e5b6d5a41..dfc0ab5a70 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -407,9 +407,7 @@ def remove_file_group(self, USE, recursive=False, force=False): for f in files: # NOTE: Here we know the fileGrp, we should pass it as a parameter self.remove_one_file(ID=f.get('ID'), fileGrp=f.get('USE')) - # NOTE2: Since remove_one_file also takes OcrdFile, we could just pass the file - # self.remove_one_file(f) - + if self._cache_flag: # Note: Since the files inside the group are removed # with the 'remove_one_file' method above, @@ -443,19 +441,8 @@ def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force raise ValueError("Invalid syntax for mets:fileGrp/@USE %s (not an xs:ID)" % fileGrp) log = getLogger('ocrd_models.ocrd_mets.add_file') - """ - # Note: we do not benefit enough from having - # a separate cache for fileGrp elements - - if self._cache_flag: - if fileGrp in self._fileGrp_cache: - el_fileGrp = self._fileGrp_cache[fileGrp] - """ - el_fileGrp = self.add_file_group(fileGrp) if not ignore: - # Since we are sure that fileGrp parameter is set, - # we could send that parameter to find_files for direct search mets_file = next(self.find_files(ID=ID, fileGrp=fileGrp), None) if mets_file: if mets_file.fileGrp == 
fileGrp and \ @@ -516,8 +503,6 @@ def remove_one_file(self, ID, fileGrp=None): # fileGrp = ocrd_file.fileGrp # -> could this potentially help to improve the cached approach? else: - # NOTE: We should pass the fileGrp, if known, as a parameter here as well - # Leaving that out for now ocrd_file = next(self.find_files(ID=ID, fileGrp=fileGrp), None) if not ocrd_file: @@ -552,9 +537,6 @@ def remove_one_file(self, ID, fileGrp=None): # Delete the file reference from the cache if self._cache_flag: parent_use = ocrd_file._el.getparent().get('USE') - # Note: if the file is in the XML tree, - # it must also be in the file cache. - # Anyway, we perform the checks, then remove del self._file_cache[parent_use][ocrd_file.ID] # Delete the file reference @@ -584,8 +566,6 @@ def get_physical_pages(self, for_fileIds=None): return self.physical_pages ret = [None] * len(for_fileIds) - # Note: This entire function potentially could be further simplified - # TODO: Simplify if self._cache_flag: for pageId in self._fptr_cache.keys(): for fptr in self._fptr_cache[pageId].keys(): From ffcd89f3700e44130160259b448f9ce082db7a81 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 22 Nov 2022 15:37:34 +0100 Subject: [PATCH 39/44] OcrdMets.set_physical_page_for_file: pageId is always a str --- ocrd_models/ocrd_models/ocrd_mets.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index dfc0ab5a70..ba5a2c4631 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -591,13 +591,8 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N order (string): ``@ORDER`` to use orderlabel (string): ``@ORDERLABEL`` to use """ - # print(pageId, ocrd_file) + # delete any page mapping for this file.ID - - # NOTE: The pageId coming from 'test_merge(sbb_sample_01)' is an Element not a string - if not isinstance(pageId, str): - pageId = 
pageId.get('ID') - candidates = [] if self._cache_flag: for page_id in self._fptr_cache.keys(): From 4da45f6b3095fb5f819cea2f4db696064740225a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 22 Nov 2022 15:38:49 +0100 Subject: [PATCH 40/44] OcrdMets: Don't defend against inconsistency cache vs XML --- ocrd_models/ocrd_models/ocrd_mets.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index ba5a2c4631..c4c3c34e73 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -621,8 +621,7 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N el_pagediv = None if self._cache_flag: - if pageId in self._page_cache.keys(): - el_pagediv = self._page_cache[pageId] + el_pagediv = self._page_cache[pageId] else: el_pagediv = el_seqdiv.find('mets:div[@ID="%s"]' % pageId, NS) From 7724191f2bb50f9652104b108fc7307e72695da3 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 22 Nov 2022 16:07:37 +0100 Subject: [PATCH 41/44] docstring for OcrdMets.remove_one_file --- ocrd_models/ocrd_models/ocrd_mets.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index ba5a2c4631..180a68dc3e 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -405,8 +405,7 @@ def remove_file_group(self, USE, recursive=False, force=False): if not recursive: raise Exception("fileGrp %s is not empty and recursive wasn't set" % USE) for f in files: - # NOTE: Here we know the fileGrp, we should pass it as a parameter - self.remove_one_file(ID=f.get('ID'), fileGrp=f.get('USE')) + self.remove_one_file(ID=f.get('ID'), fileGrp=f.getparent().get('USE')) if self._cache_flag: # Note: Since the files inside the group are removed @@ -414,7 +413,7 @@ def remove_file_group(self, USE, 
recursive=False, force=False): # we should not take care of that again. # We just remove the fileGrp. del self._file_cache[el_fileGrp.get('USE')] - + el_fileGrp.getparent().remove(el_fileGrp) def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force=False, local_filename=None, ignore=False, **kwargs): @@ -489,9 +488,8 @@ def remove_one_file(self, ID, fileGrp=None): """ Delete an existing :py:class:`ocrd_models.ocrd_file.OcrdFile`. Arguments: - ID (string): ``@ID`` of the ``mets:file`` to delete - -> ID could also be an OcrdFile, potentially misleading? - fileGrp (string): + ID (string|OcrdFile): ``@ID`` of the ``mets:file`` to delete Can also be an :py:class:`ocrd_models.ocrd_file.OcrdFile` to avoid search via ``ID``. + fileGrp (string): ``@USE`` of the ``mets:fileGrp`` containing the ``mets:file``. Used only for optimization. Returns: The old :py:class:`ocrd_models.ocrd_file.OcrdFile` reference. """ @@ -500,13 +498,11 @@ def remove_one_file(self, ID, fileGrp=None): if isinstance(ID, OcrdFile): ocrd_file = ID ID = ocrd_file.ID - # fileGrp = ocrd_file.fileGrp - # -> could this potentially help to improve the cached approach? 
else: ocrd_file = next(self.find_files(ID=ID, fileGrp=fileGrp), None) if not ocrd_file: - raise FileNotFoundError("File not found: %s" % ID) + raise FileNotFoundError("File not found: %s (fileGr=%s)" % (ID, fileGrp)) # Delete the physical page ref fptrs = [] From 2fad30b6a5ccfec60fd3996cd76f7b191bd4367c Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 22 Nov 2022 16:12:10 +0100 Subject: [PATCH 42/44] revert 4da45f6b3 (el_pagediv can be legitimately None here ) --- ocrd_models/ocrd_models/ocrd_mets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index acf6aeb075..b5a572dcfc 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -617,7 +617,8 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N el_pagediv = None if self._cache_flag: - el_pagediv = self._page_cache[pageId] + if pageId in self._page_cache: + el_pagediv = self._page_cache[pageId] else: el_pagediv = el_seqdiv.find('mets:div[@ID="%s"]' % pageId, NS) From 3c5ac1e1cdc5eda7c7284126ce86567a61ac5a4d Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 22 Nov 2022 17:13:40 +0100 Subject: [PATCH 43/44] enable caching by setting OCRD_METS_CACHING=true env var --- ocrd_models/ocrd_models/ocrd_mets.py | 12 +++++++++++- tests/model/test_ocrd_mets.py | 20 ++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index b5a572dcfc..777c9ee53f 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -4,6 +4,7 @@ from datetime import datetime import re import typing +from os import environ from lxml import etree as ET from copy import deepcopy @@ -58,7 +59,16 @@ def __init__(self, **kwargs): """ """ super(OcrdMets, self).__init__(**kwargs) - + + # XXX If the environment variable OCRD_METS_CACHING 
is set to "true", + # then enable caching, if "false", disable caching, overriding the + # kwarg to the constructor + if 'OCRD_METS_CACHING' in environ: + cache_override = environ['OCRD_METS_CACHING'] in ('true', '1') + getLogger('ocrd_models.ocrd_mets').debug('METS Caching %s because OCRD_METS_CACHING is %s', + 'enabled' if cache_override else 'disabled', environ['OCRD_METS_CACHING']) + self._cache_flag = cache_override + # If cache is enabled if self._cache_flag: diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index 751da31481..46ec597ef0 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -3,6 +3,7 @@ from datetime import datetime from os.path import join +from os import environ from contextlib import contextmanager import shutil from logging import StreamHandler @@ -355,6 +356,25 @@ def test_invalid_filegrp(): assert "Invalid syntax for mets:fileGrp/@USE" in str(val_err.value) +@contextmanager +def temp_env_var(k, v): + v_before = environ.get(k, None) + environ[k] = v + yield + if v_before is not None: + environ[k] = v_before + else: + del environ[k] + +def test_envvar(): + assert OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=True)._cache_flag + assert not OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=False)._cache_flag + with temp_env_var('OCRD_METS_CACHING', 'true'): + assert OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=True)._cache_flag + assert OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=False)._cache_flag + with temp_env_var('OCRD_METS_CACHING', 'false'): + assert not OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=True)._cache_flag + assert not OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=False)._cache_flag if __name__ == '__main__': main(__file__) From 
f21a33acf4ddd3c5ff6d7e8093cb227997357232 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 23 Nov 2022 10:58:43 +0100 Subject: [PATCH 44/44] readme: add a stub section on configuration --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 7a7ae316df..d23c8382c6 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ * [Command line tools](#command-line-tools) * [`ocrd` CLI](#ocrd-cli) * [`ocrd-dummy` CLI](#ocrd-dummy-cli) +* [Configuration](#configuration) * [Packages](#packages) * [ocrd_utils](#ocrd_utils) * [ocrd_models](#ocrd_models) @@ -82,6 +83,14 @@ supported flags, options and arguments. A minimal [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that copies from `-I/-input-file-grp` to `-O/-output-file-grp` +## Configuration + +Almost all behaviour of the OCR-D/core software is configured via CLI options and flags, which can be listed with the `--help` flag that all CLI support. + +Some parts of the software are configured via environement variables: + +* `OCRD_METS_CACHING`: If set to `true`, access to the METS file is cached, speeding in-memory search and modification. + ## Packages ### ocrd_utils