diff --git a/.circleci/config.yml b/.circleci/config.yml index 9ed64e479f..3903408b89 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -14,7 +14,7 @@ jobs: - checkout - run: HOMEBREW_NO_AUTO_UPDATE=1 brew install imagemagick geos - run: make deps-test install PIP=pip3 - - run: make test PYTHON=python3 + - run: make test benchmark PYTHON=python3 test-python36: docker: @@ -24,7 +24,7 @@ jobs: - checkout - run: apt-get -y update - run: make deps-ubuntu deps-test install - - run: make test + - run: make test benchmark test-python37: docker: @@ -34,7 +34,7 @@ jobs: - checkout - run: apt-get -y update - run: make deps-ubuntu deps-test install - - run: make test + - run: make test benchmark test-python38: docker: @@ -44,7 +44,7 @@ jobs: - checkout - run: apt-get -y update - run: make deps-ubuntu deps-test install - - run: make test + - run: make test benchmark test-python39: docker: @@ -54,7 +54,7 @@ jobs: - checkout - run: apt-get -y update - run: make deps-ubuntu deps-test install - - run: make test + - run: make test benchmark deploy: docker: diff --git a/Makefile b/Makefile index 477ee3b38a..16614622de 100644 --- a/Makefile +++ b/Makefile @@ -149,7 +149,13 @@ assets: repo/assets test: assets HOME=$(CURDIR)/ocrd_utils $(PYTHON) -m pytest --continue-on-collection-errors -k TestLogging $(TESTDIR) HOME=$(CURDIR) $(PYTHON) -m pytest --continue-on-collection-errors -k TestLogging $(TESTDIR) - $(PYTHON) -m pytest --continue-on-collection-errors --durations=10 --ignore=$(TESTDIR)/test_logging.py $(TESTDIR) + $(PYTHON) -m pytest --continue-on-collection-errors --durations=10 --ignore=$(TESTDIR)/test_logging.py --ignore-glob="$(TESTDIR)/**/*bench*.py" $(TESTDIR) + +benchmark: + $(PYTHON) -m pytest $(TESTDIR)/model/test_ocrd_mets_bench.py + +benchmark-extreme: + $(PYTHON) -m pytest $(TESTDIR)/model/*bench*.py test-profile: $(PYTHON) -m cProfile -o profile $$(which pytest) diff --git a/README.md b/README.md index 7a7ae316df..d23c8382c6 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ * [Command line tools](#command-line-tools) * [`ocrd` CLI](#ocrd-cli) * [`ocrd-dummy` CLI](#ocrd-dummy-cli) +* [Configuration](#configuration) * [Packages](#packages) * [ocrd_utils](#ocrd_utils) * [ocrd_models](#ocrd_models) @@ -82,6 +83,14 @@ supported flags, options and arguments. A minimal [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that copies from `-I/-input-file-grp` to `-O/-output-file-grp` +## Configuration + +Almost all behaviour of the OCR-D/core software is configured via CLI options and flags, which can be listed with the `--help` flag that every CLI supports. + +Some parts of the software are configured via environment variables: + +* `OCRD_METS_CACHING`: If set to `true`, access to the METS file is cached, speeding up in-memory search and modification.
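For illustration, a minimal sketch (not part of this diff) of driving the new flag from Python; the path `mets.xml` is hypothetical:

```python
# Force METS caching on for any OcrdMets constructed afterwards -- the
# constructor override in ocrd_mets.py (below) reads this variable.
import os
os.environ['OCRD_METS_CACHING'] = 'true'

from ocrd_models import OcrdMets

mets = OcrdMets(filename='mets.xml')  # hypothetical path
assert mets._cache_flag  # caching was forced on by the environment
```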
+ ## Packages ### ocrd_utils diff --git a/ocrd_models/ocrd_models/ocrd_file.py b/ocrd_models/ocrd_models/ocrd_file.py index 4637202bdc..8b02176596 100644 --- a/ocrd_models/ocrd_models/ocrd_file.py +++ b/ocrd_models/ocrd_models/ocrd_file.py @@ -138,7 +138,6 @@ def pageId(self, pageId): raise Exception("OcrdFile %s has no member 'mets' pointing to parent OcrdMets" % self) self.mets.set_physical_page_for_file(pageId, self) - @property def loctype(self): """ diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 8571254e56..777c9ee53f 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -3,7 +3,10 @@ """ from datetime import datetime import re +import typing +from os import environ from lxml import etree as ET +from copy import deepcopy from ocrd_utils import ( is_local_filename, @@ -41,7 +44,7 @@ class OcrdMets(OcrdXmlDocument): """ @staticmethod - def empty_mets(now=None): + def empty_mets(now=None, cache_flag=False): """ Create an empty METS file from bundled template. """ @@ -50,37 +53,134 @@ def empty_mets(now=None): tpl = METS_XML_EMPTY.decode('utf-8') tpl = tpl.replace('{{ VERSION }}', VERSION) tpl = tpl.replace('{{ NOW }}', '%s' % now) - return OcrdMets(content=tpl.encode('utf-8')) + return OcrdMets(content=tpl.encode('utf-8'), cache_flag=cache_flag) def __init__(self, **kwargs): """ - """ super(OcrdMets, self).__init__(**kwargs) + # XXX If the environment variable OCRD_METS_CACHING is set to "true", + # then enable caching, if "false", disable caching, overriding the + # kwarg to the constructor + if 'OCRD_METS_CACHING' in environ: + cache_override = environ['OCRD_METS_CACHING'] in ('true', '1') + getLogger('ocrd_models.ocrd_mets').debug('METS Caching %s because OCRD_METS_CACHING is %s', + 'enabled' if cache_override else 'disabled', environ['OCRD_METS_CACHING']) + self._cache_flag = cache_override + + # If cache is enabled + if self._cache_flag: + + # Cache for the files (mets:file) - two nested dictionaries + # The outer dictionary's Key: 'fileGrp.USE' + # The outer dictionary's Value: Inner dictionary + # The inner dictionary's Key: 'file.ID' + # The inner dictionary's Value: a 'file' object at some memory location + self._file_cache = {} + + # Cache for the pages (mets:div) + # The dictionary's Key: 'div.ID' + # The dictionary's Value: a 'div' object at some memory location + self._page_cache = {} + + # Cache for the file pointers (mets:fptr) - two nested dictionaries + # The outer dictionary's Key: 'div.ID' + # The outer dictionary's Value: Inner dictionary + # The inner dictionary's Key: 'fptr.FILEID' + # The inner dictionary's Value: a 'fptr' object at some memory location + self._fptr_cache = {} + + # Note, if the empty_mets() function is used to instantiate OcrdMets + # Then the cache is empty even after this operation + self._fill_caches() + + def __exit__(self): + """ + + """ + if self._cache_flag: + self._clear_caches() + def __str__(self): """ String representation """ - return 'OcrdMets[fileGrps=%s,files=%s]' % (self.file_groups, list(self.find_files())) + return 'OcrdMets[cached=%s,fileGrps=%s,files=%s]' % (self._cache_flag, self.file_groups, list(self.find_files())) + + def _fill_caches(self): + """ + Fills the caches with fileGrps and FileIDs + """ + + tree_root = self._tree.getroot() + + # Fill with files + el_fileSec = tree_root.find("mets:fileSec", NS) + if el_fileSec is None: + return + + log = getLogger('ocrd_models.ocrd_mets._fill_caches-files') + + for el_fileGrp in 
el_fileSec.findall('mets:fileGrp', NS): + fileGrp_use = el_fileGrp.get('USE') + + # Assign an empty dictionary that will hold the files of the added fileGrp + self._file_cache[fileGrp_use] = {} + + for el_file in el_fileGrp: + file_id = el_file.get('ID') + self._file_cache[fileGrp_use].update({file_id : el_file}) + # log.info("File added to the cache: %s" % file_id) + + # Fill with pages + el_div_list = tree_root.findall(".//mets:div[@TYPE='page']", NS) + if len(el_div_list) == 0: + return + log = getLogger('ocrd_models.ocrd_mets._fill_caches-pages') + + for el_div in el_div_list: + div_id = el_div.get('ID') + log.debug("DIV_ID: %s", div_id) + + self._page_cache[div_id] = el_div + + # Assign an empty dictionary that will hold the fptr of the added page (div) + self._fptr_cache[div_id] = {} + + # log.info("Page_id added to the cache: %s" % div_id) + + for el_fptr in el_div: + self._fptr_cache[div_id].update({el_fptr.get('FILEID') : el_fptr}) + # log.info("Fptr added to the cache: %s" % el_fptr.get('FILEID')) + + # log.info("Len of page_cache: %s" % len(self._page_cache)) + # log.info("Len of fptr_cache: %s" % len(self._fptr_cache)) + + def _clear_caches(self): + """ + Deallocates the caches + """ + + self._file_cache = None + self._page_cache = None + self._fptr_cache = None @property def unique_identifier(self): """ Get the unique identifier by looking through ``mods:identifier`` - See `specs `_ for details. """ for t in IDENTIFIER_PRIORITY: found = self._tree.getroot().find('.//mods:identifier[@type="%s"]' % t, NS) if found is not None: return found.text - + @unique_identifier.setter def unique_identifier(self, purl): """ Set the unique identifier by looking through ``mods:identifier`` - See `specs `_ for details. """ id_el = None @@ -119,12 +219,16 @@ def file_groups(self): """ List the `@USE` of all `mets:fileGrp` entries. """ + + # The file cache is keyed by the fileGrp @USE values, so the keys can be returned directly + if self._cache_flag: + return list(self._file_cache.keys()) + return [el.get('USE') for el in self._tree.getroot().findall('.//mets:fileGrp', NS)] def find_all_files(self, *args, **kwargs): """ Like :py:meth:`find_files` but return a list of all results. - Equivalent to ``list(self.find_files(...))`` """ return list(self.find_files(*args, **kwargs)) @@ -133,21 +237,16 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None, local_only=False): """ Search ``mets:file`` entries in this METS document and yield results. - - The :py:attr:`ID`, :py:attr:`pageId`, :py:attr:`fileGrp`, :py:attr:`url` and :py:attr:`mimetype` parameters can each be either a literal string, or a regular expression if the string starts with ``//`` (double slash). - If it is a regex, the leading ``//`` is removed and candidates are matched against the regex with `re.fullmatch`. If it is a literal string, comparison is done with string equality. - The :py:attr:`pageId` parameter supports the numeric range operator ``..``. For example, to find all files in pages ``PHYS_0001`` to ``PHYS_0003``, ``PHYS_0001..PHYS_0003`` will be expanded to ``PHYS_0001,PHYS_0002,PHYS_0003``.
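Literal, regex and range tokens can also be combined into a comma-separated list, e.g. ``//PHYS_0005,PHYS_0001..PHYS_0002``; each token is evaluated independently (see the new test cases below).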
- Keyword Args: ID (string) : ``@ID`` of the ``mets:file`` fileGrp (string) : ``@USE`` of the ``mets:fileGrp`` to list files of @@ -155,26 +254,31 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None url (string) : ``@xlink:href`` (URL or path) of ``mets:Flocat`` of ``mets:file`` mimetype (string) : ``@MIMETYPE`` of ``mets:file`` local_only (boolean) : Whether to restrict results to local files in the filesystem - Yields: :py:class:`ocrd_models:ocrd_file:OcrdFile` instantiations """ + pageId_list = [] if pageId: - if pageId.startswith(REGEX_PREFIX): - pageIds, pageId = re.compile(pageId[REGEX_PREFIX_LEN:]), list() + pageId_patterns = [] + for pageId_token in re.split(r',', pageId): + if pageId_token.startswith(REGEX_PREFIX): + pageId_patterns.append(re.compile(pageId_token[REGEX_PREFIX_LEN:])) + elif '..' in pageId_token: + pageId_patterns += generate_range(*pageId_token.split('..', 1)) + else: + pageId_patterns += [pageId_token] + if self._cache_flag: + for page_id in self._page_cache.keys(): + if page_id in pageId_patterns or \ + any([isinstance(p, typing.Pattern) and p.fullmatch(page_id) for p in pageId_patterns]): + pageId_list += self._fptr_cache[page_id] else: - pageIds, pageId = pageId.split(','), list() - pageIds_expanded = [] - for pageId_ in pageIds: - if '..' in pageId_: - pageIds_expanded += generate_range(*pageId_.split('..', 1)) - pageIds += pageIds_expanded - for page in self._tree.getroot().xpath( - '//mets:div[@TYPE="page"]', namespaces=NS): - if (page.get('ID') in pageIds if isinstance(pageIds, list) else - pageIds.fullmatch(page.get('ID'))): - pageId.extend( - [fptr.get('FILEID') for fptr in page.findall('mets:fptr', NS)]) + for page in self._tree.getroot().xpath( + '//mets:div[@TYPE="page"]', namespaces=NS): + if page.get('ID') in pageId_patterns or \ + any([isinstance(p, typing.Pattern) and p.fullmatch(page.get('ID')) for p in pageId_patterns]): + pageId_list += [fptr.get('FILEID') for fptr in page.findall('mets:fptr', NS)] + if ID and ID.startswith(REGEX_PREFIX): ID = re.compile(ID[REGEX_PREFIX_LEN:]) if fileGrp and fileGrp.startswith(REGEX_PREFIX): @@ -183,17 +287,30 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None mimetype = re.compile(mimetype[REGEX_PREFIX_LEN:]) if url and url.startswith(REGEX_PREFIX): url = re.compile(url[REGEX_PREFIX_LEN:]) - for cand in self._tree.getroot().xpath('//mets:file', namespaces=NS): + + candidates = [] + if self._cache_flag: + if fileGrp: + if isinstance(fileGrp, str): + candidates += self._file_cache.get(fileGrp, {}).values() + else: + candidates = [x for fileGrp_needle, el_file_list in self._file_cache.items() if fileGrp.match(fileGrp_needle) for x in el_file_list.values()] + else: + candidates = [el_file for id_to_file in self._file_cache.values() for el_file in id_to_file.values()] + else: + candidates = self._tree.getroot().xpath('//mets:file', namespaces=NS) + + for cand in candidates: if ID: if isinstance(ID, str): if not ID == cand.get('ID'): continue else: if not ID.fullmatch(cand.get('ID')): continue - if pageId is not None and cand.get('ID') not in pageId: + if pageId is not None and cand.get('ID') not in pageId_list: continue - if fileGrp: + if not self._cache_flag and fileGrp: if isinstance(fileGrp, str): if cand.getparent().get('USE') != fileGrp: continue else: @@ -215,6 +332,8 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None else: if not url.fullmatch(cand_url): continue + # Note: instantiating OcrdFile only to find out afterwards that local_only rules it out is wasteful; + # checking local_only and url before instantiation would be better. f = OcrdFile(cand, mets=self) # If only local resources should be returned and f is not a file path: skip the file @@ -225,7 +344,6 @@ def add_file_group(self, fileGrp): """ Add a new ``mets:fileGrp``. - Arguments: fileGrp (string): ``@USE`` of the new ``mets:fileGrp``. """ @@ -238,6 +356,11 @@ def add_file_group(self, fileGrp): if el_fileGrp is None: el_fileGrp = ET.SubElement(el_fileSec, TAG_METS_FILEGRP) el_fileGrp.set('USE', fileGrp) + + if self._cache_flag: + # Assign an empty dictionary that will hold the files of the added fileGrp + self._file_cache[fileGrp] = {} + return el_fileGrp def rename_file_group(self, old, new): @@ -248,11 +371,13 @@ def rename_file_group(self, old, new): if el_fileGrp is None: raise FileNotFoundError("No such fileGrp '%s'" % old) el_fileGrp.set('USE', new) + + if self._cache_flag: + self._file_cache[new] = self._file_cache.pop(old) def remove_file_group(self, USE, recursive=False, force=False): """ Remove a ``mets:fileGrp`` (single fixed ``@USE`` or multiple regex ``@USE``) - Arguments: USE (string): ``@USE`` of the ``mets:fileGrp`` to delete. Can be a regex if prefixed with ``//`` recursive (boolean): Whether to recursively delete each ``mets:file`` in the group @@ -279,18 +404,31 @@ def remove_file_group(self, USE, recursive=False, force=False): log.warning(msg) return raise Exception(msg) - files = el_fileGrp.findall('mets:file', NS) + + # The cache should also be used here + if self._cache_flag: + # take a static copy: remove_one_file() below mutates the cached dict + files = list(self._file_cache.get(el_fileGrp.get('USE'), {}).values()) + else: + files = el_fileGrp.findall('mets:file', NS) + if files: if not recursive: raise Exception("fileGrp %s is not empty and recursive wasn't set" % USE) for f in files: - self.remove_one_file(f.get('ID')) + self.remove_one_file(ID=f.get('ID'), fileGrp=f.getparent().get('USE')) + + if self._cache_flag: + # Note: the files inside the group have already been removed from the cache + # by remove_one_file() above, so only the fileGrp entry itself + # remains to be deleted here. + del self._file_cache[el_fileGrp.get('USE')] + el_fileGrp.getparent().remove(el_fileGrp) def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force=False, local_filename=None, ignore=False, **kwargs): """ Instantiate and add a new :py:class:`ocrd_models.ocrd_file.OcrdFile`.
- Arguments: fileGrp (string): ``@USE`` of ``mets:fileGrp`` to add to Keyword Args: @@ -311,20 +449,30 @@ def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force if not REGEX_FILE_ID.fullmatch(fileGrp): raise ValueError("Invalid syntax for mets:fileGrp/@USE %s (not an xs:ID)" % fileGrp) log = getLogger('ocrd_models.ocrd_mets.add_file') + el_fileGrp = self.add_file_group(fileGrp) if not ignore: - mets_file = next(self.find_files(ID=ID), None) + mets_file = next(self.find_files(ID=ID, fileGrp=fileGrp), None) if mets_file: if mets_file.fileGrp == fileGrp and \ mets_file.pageId == pageId and \ mets_file.mimetype == mimetype: if not force: raise FileExistsError(f"A file with ID=={ID} already exists {mets_file} and neither force nor ignore are set") - self.remove_file(ID=ID) + self.remove_file(ID=ID, fileGrp=fileGrp) else: raise FileExistsError(f"A file with ID=={ID} already exists {mets_file} but unrelated - cannot mitigate") - kwargs = {k: v for k, v in locals().items() if k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v} - mets_file = OcrdFile(ET.SubElement(el_fileGrp, TAG_METS_FILE), mets=self, **kwargs) + + # To get rid of Python's FutureWarning - checking if v is not None + kwargs = {k: v for k, v in locals().items() if k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v is not None} + # This separation is needed to reuse the same el_mets_file element in the caching if block + el_mets_file = ET.SubElement(el_fileGrp, TAG_METS_FILE) + # The caching of the physical page is done in the OcrdFile constructor + mets_file = OcrdFile(el_mets_file, mets=self, **kwargs) + + if self._cache_flag: + # Add the file to the file cache + self._file_cache[fileGrp].update({ID: el_mets_file}) return mets_file @@ -346,36 +494,56 @@ def remove_file(self, *args, **kwargs): return [] raise FileNotFoundError("File not found: %s %s" % (args, kwargs)) - def remove_one_file(self, ID): + def remove_one_file(self, ID, fileGrp=None): """ Delete an existing :py:class:`ocrd_models.ocrd_file.OcrdFile`. - Arguments: - ID (string): ``@ID`` of the ``mets:file`` to delete - + ID (string|OcrdFile): ``@ID`` of the ``mets:file`` to delete Can also be an :py:class:`ocrd_models.ocrd_file.OcrdFile` to avoid search via ``ID``. + fileGrp (string): ``@USE`` of the ``mets:fileGrp`` containing the ``mets:file``. Used only for optimization. Returns: The old :py:class:`ocrd_models.ocrd_file.OcrdFile` reference. 
""" log = getLogger('ocrd_models.ocrd_mets.remove_one_file') - log.debug("remove_one_file(%s)" % ID) + log.debug("remove_one_file(%s %s)" % (ID, fileGrp)) if isinstance(ID, OcrdFile): ocrd_file = ID ID = ocrd_file.ID else: - ocrd_file = next(self.find_files(ID=ID), None) + ocrd_file = next(self.find_files(ID=ID, fileGrp=fileGrp), None) if not ocrd_file: - raise FileNotFoundError("File not found: %s" % ID) + raise FileNotFoundError("File not found: %s (fileGr=%s)" % (ID, fileGrp)) # Delete the physical page ref - for fptr in self._tree.getroot().findall('.//mets:fptr[@FILEID="%s"]' % ID, namespaces=NS): + fptrs = [] + if self._cache_flag: + for page in self._fptr_cache.keys(): + if ID in self._fptr_cache[page]: + fptrs.append(self._fptr_cache[page][ID]) + else: + fptrs = self._tree.getroot().findall('.//mets:fptr[@FILEID="%s"]' % ID, namespaces=NS) + + # Delete the physical page ref + for fptr in fptrs: log.debug("Delete fptr element %s for page '%s'", fptr, ID) page_div = fptr.getparent() page_div.remove(fptr) + # Remove the fptr from the cache as well + if self._cache_flag: + del self._fptr_cache[page_div.get('ID')][ID] # delete empty pages if not page_div.getchildren(): log.debug("Delete empty page %s", page_div) page_div.getparent().remove(page_div) + # Delete the empty pages from caches as well + if self._cache_flag: + del self._page_cache[page_div.get('ID')] + del self._fptr_cache[page_div.get('ID')] + + # Delete the file reference from the cache + if self._cache_flag: + parent_use = ocrd_file._el.getparent().get('USE') + del self._file_cache[parent_use][ocrd_file.ID] # Delete the file reference # pylint: disable=protected-access @@ -388,6 +556,9 @@ def physical_pages(self): """ List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``) """ + if self._cache_flag: + return self._page_cache.values() + return self._tree.getroot().xpath( 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/@ID', namespaces=NS) @@ -400,19 +571,25 @@ def get_physical_pages(self, for_fileIds=None): if for_fileIds is None: return self.physical_pages ret = [None] * len(for_fileIds) - for page in self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', - namespaces=NS): - for fptr in page.findall('mets:fptr', NS): - if fptr.get('FILEID') in for_fileIds: - ret[for_fileIds.index(fptr.get('FILEID'))] = page.get('ID') + + if self._cache_flag: + for pageId in self._fptr_cache.keys(): + for fptr in self._fptr_cache[pageId].keys(): + if fptr in for_fileIds: + ret[for_fileIds.index(fptr)] = pageId + else: + for page in self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS): + for fptr in page.findall('mets:fptr', NS): + if fptr.get('FILEID') in for_fileIds: + ret[for_fileIds.index(fptr.get('FILEID'))] = page.get('ID') return ret def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=None): """ Set the physical page ID (``@ID`` of the physical ``mets:structMap`` ``mets:div`` entry) corresponding to the ``mets:file`` :py:attr:`ocrd_file`, creating all structures if necessary. 
- Arguments: pageId (string): ``@ID`` of the physical ``mets:structMap`` entry to use ocrd_file (object): existing :py:class:`ocrd_models.ocrd_file.OcrdFile` object @@ -420,11 +597,22 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N order (string): ``@ORDER`` to use orderlabel (string): ``@ORDERLABEL`` to use """ - # print(pageId, ocrd_file) + # delete any page mapping for this file.ID - for el_fptr in self._tree.getroot().findall( + candidates = [] + if self._cache_flag: + for page_id in self._fptr_cache.keys(): + if ocrd_file.ID in self._fptr_cache[page_id].keys(): + if self._fptr_cache[page_id][ocrd_file.ID] is not None: + candidates.append(self._fptr_cache[page_id][ocrd_file.ID]) + else: + candidates = self._tree.getroot().findall( 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % - ocrd_file.ID, namespaces=NS): + ocrd_file.ID, namespaces=NS) + + for el_fptr in candidates: + if self._cache_flag: + del self._fptr_cache[el_fptr.getparent().get('ID')][ocrd_file.ID] el_fptr.getparent().remove(el_fptr) # find/construct as necessary @@ -436,7 +624,14 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N if el_seqdiv is None: el_seqdiv = ET.SubElement(el_structmap, TAG_METS_DIV) el_seqdiv.set('TYPE', 'physSequence') - el_pagediv = el_seqdiv.find('mets:div[@ID="%s"]' % pageId, NS) + + el_pagediv = None + if self._cache_flag: + if pageId in self._page_cache: + el_pagediv = self._page_cache[pageId] + else: + el_pagediv = el_seqdiv.find('mets:div[@ID="%s"]' % pageId, NS) + if el_pagediv is None: el_pagediv = ET.SubElement(el_seqdiv, TAG_METS_DIV) el_pagediv.set('TYPE', 'page') @@ -445,29 +640,56 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N el_pagediv.set('ORDER', order) if orderlabel: el_pagediv.set('ORDERLABEL', orderlabel) + if self._cache_flag: + # Create a new entry in the page cache + self._page_cache[pageId] = el_pagediv + # Create a new entry in the fptr cache and + # assign an empty dictionary to hold the fileids + self._fptr_cache[pageId] = {} + el_fptr = ET.SubElement(el_pagediv, TAG_METS_FPTR) el_fptr.set('FILEID', ocrd_file.ID) + if self._cache_flag: + # Assign the ocrd fileID to the pageId in the cache + self._fptr_cache[el_pagediv.get('ID')].update({ocrd_file.ID : el_fptr}) + def get_physical_page_for_file(self, ocrd_file): """ Get the physical page ID (``@ID`` of the physical ``mets:structMap`` ``mets:div`` entry) corresponding to the ``mets:file`` :py:attr:`ocrd_file`. """ - ret = self._tree.getroot().xpath( - '/mets:mets/mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][./mets:fptr[@FILEID="%s"]]/@ID' % - ocrd_file.ID, namespaces=NS) - if ret: + ret = [] + if self._cache_flag: + for pageId in self._fptr_cache.keys(): + if ocrd_file.ID in self._fptr_cache[pageId].keys(): + ret.append(self._page_cache[pageId].get('ID')) + else: + ret = self._tree.getroot().xpath( + '/mets:mets/mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][./mets:fptr[@FILEID="%s"]]/@ID' % + ocrd_file.ID, namespaces=NS) + + # To get rid of the python's FutureWarning + if len(ret): return ret[0] def remove_physical_page(self, ID): """ Delete page (physical ``mets:structMap`` ``mets:div`` entry ``@ID``) :py:attr:`ID`. 
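When caching is enabled, the page is also removed from the page and fptr caches.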
""" - mets_div = self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][@ID="%s"]' % ID, - namespaces=NS) - if mets_div: + mets_div = None + if self._cache_flag: + if ID in self._page_cache.keys(): + mets_div = [self._page_cache[ID]] + else: + mets_div = self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][@ID="%s"]' % ID, + namespaces=NS) + if mets_div is not None: mets_div[0].getparent().remove(mets_div[0]) + if self._cache_flag: + del self._page_cache[ID] + del self._fptr_cache[ID] def remove_physical_page_fptr(self, fileId): """ @@ -475,22 +697,33 @@ def remove_physical_page_fptr(self, fileId): Returns: List of pageIds that mets:fptrs were deleted from """ - mets_fptrs = self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId, - namespaces=NS) + + # Question: What is the reason to keep a list of mets_fptrs? + # Do we have a situation in which the fileId is same for different pageIds ? + # From the examples I have seen inside 'assets' that is not the case + # and the mets_fptrs list will always contain a single element. + # If that's the case then we do not need to iterate 2 loops, just one. + mets_fptrs = [] + if self._cache_flag: + for page_id in self._fptr_cache.keys(): + if fileId in self._fptr_cache[page_id].keys(): + mets_fptrs.append(self._fptr_cache[page_id][fileId]) + else: + mets_fptrs = self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId, namespaces=NS) ret = [] for mets_fptr in mets_fptrs: mets_div = mets_fptr.getparent() ret.append(mets_div.get('ID')) + if self._cache_flag: + del self._fptr_cache[mets_div.get('ID')][mets_fptr.get('FILEID')] mets_div.remove(mets_fptr) return ret def merge(self, other_mets, force=False, fileGrp_mapping=None, fileId_mapping=None, pageId_mapping=None, after_add_cb=None, **kwargs): """ Add all files from other_mets. - Accepts the same kwargs as :py:func:`find_files` - Keyword Args: force (boolean): Whether to :py:meth:`add_file`s with force (overwriting existing ``mets:file``s) fileGrp_mapping (dict): Map :py:attr:`other_mets` fileGrp to fileGrp in this METS diff --git a/ocrd_models/ocrd_models/ocrd_xml_base.py b/ocrd_models/ocrd_models/ocrd_xml_base.py index 2235a8b57d..7faefbad99 100644 --- a/ocrd_models/ocrd_models/ocrd_xml_base.py +++ b/ocrd_models/ocrd_models/ocrd_xml_base.py @@ -16,11 +16,12 @@ class OcrdXmlDocument(): Base class for XML documents loaded from either content or filename. 
""" - def __init__(self, filename=None, content=None): + def __init__(self, filename=None, content=None, cache_flag=False): """ Args: filename (string): content (string): + cache_flag (bool): """ # print(self, filename, content) if filename is None and content is None: @@ -34,6 +35,9 @@ def __init__(self, filename=None, content=None): raise Exception('File does not exist: %s' % filename) self._tree.parse(filename) + # Cache enabled - True/False + self._cache_flag = cache_flag + def to_xml(self, xmllint=False): """ Serialize all properties as pretty-printed XML diff --git a/ocrd_utils/ocrd_utils/str.py b/ocrd_utils/ocrd_utils/str.py index 2944bb1d33..7211699f0c 100644 --- a/ocrd_utils/ocrd_utils/str.py +++ b/ocrd_utils/ocrd_utils/str.py @@ -195,10 +195,12 @@ def generate_range(start, end): Generate a list of strings by incrementing the number part of ``start`` until including ``end``. """ ret = [] - start_num, end_num = re.search(r'\d+', start), re.search(r'\d+', end) - if not (start_num and end_num): - raise ValueError("Unable to generate range %s .. %s, could not detect number part" % (start, end)) - start_num, end_num = start_num.group(0), end_num.group(0) + try: + start_num, end_num = re.findall(r'\d+', start)[-1], re.findall(r'\d+', end)[-1] + except IndexError: + raise ValueError("Range '%s..%s': could not find numeric part" % (start, end)) + if start_num == end_num: + raise ValueError("Range '%s..%s': evaluates to the same number") for i in range(int(start_num), int(end_num) + 1): ret.append(start.replace(start_num, str(i).zfill(len(start_num)))) return ret diff --git a/requirements_test.txt b/requirements_test.txt index db01cd8dd5..8b7997b1a0 100644 --- a/requirements_test.txt +++ b/requirements_test.txt @@ -1,6 +1,7 @@ autopep8 pytest >= 4.0.0 generateDS == 2.35.20 +pytest-benchmark >= 3.2.3 coverage >= 4.5.2 sphinx sphinx_click diff --git a/tests/model/mets_bench_extreme.py b/tests/model/mets_bench_extreme.py new file mode 100644 index 0000000000..63b30e31db --- /dev/null +++ b/tests/model/mets_bench_extreme.py @@ -0,0 +1,277 @@ +# -*- coding: utf-8 -*- + +from contextlib import contextmanager +from time import time + +from pytest import main, fixture, mark + +from ocrd import Resolver +from ocrd_utils import MIME_TO_EXT, getLogger +from ocrd_models import OcrdMets + +logger = getLogger('ocrd.benchmark.mets') + +GRPS_REG = ['SEG-REG', 'SEG-REPAIR', 'SEG-REG-DESKEW', 'SEG-REG-DESKEW-CLIP', 'SEG-LINE', 'SEG-REPAIR-LINE', 'SEG-LINE-RESEG-DEWARP'] +GRPS_IMG = ['FULL', 'PRESENTATION', 'BIN', 'CROP', 'BIN2', 'BIN-DENOISE', 'BIN-DENOISE-DESKEW', 'OCR'] + +REGIONS_PER_PAGE = 2 +LINES_PER_REGION = 2 +FILES_PER_PAGE = len(GRPS_IMG) * LINES_PER_REGION + len(GRPS_REG) * REGIONS_PER_PAGE + +# Caching is disabled by default +def _build_mets(number_of_pages, force=False, cache_flag=False): + mets = OcrdMets.empty_mets(cache_flag=cache_flag) + mets._number_of_pages = number_of_pages + + for n in ['%04d' % (n + 1) for n in range(number_of_pages)]: + _add_file = lambda n, fileGrp, mimetype, ID=None: mets.add_file( + fileGrp, + mimetype=mimetype, + pageId='PHYS_%s' % n, + ID=ID if ID else '%s_%s_%s' % (fileGrp, n, MIME_TO_EXT.get(mimetype)[1:].upper()), + url='%s/%s%s' % (fileGrp, ID if ID else '%s_%s_%s' % (fileGrp, n, MIME_TO_EXT.get(mimetype)[1:].upper()), MIME_TO_EXT.get(mimetype)) + ) + for grp in GRPS_IMG: + # LINES_PER_REGION = 2 + _add_file(n, grp, 'image/tiff') + _add_file(n, grp, 'application/vnd.prima.page+xml') + for grp in GRPS_REG: + # REGIONS_PER_PAGE = 2 + for region_n in 
range(REGIONS_PER_PAGE): + _add_file(n, grp, 'image/png', '%s_%s_region%s' % (grp, n, region_n)) + + return mets + +def assert_len(expected_len, mets, kwargs): + test_list = mets.find_all_files(**kwargs) + assert expected_len == len(test_list) + +def benchmark_find_files(number_of_pages, mets): + benchmark_find_files_filegrp(number_of_pages, mets) + benchmark_find_files_fileid(number_of_pages, mets) + benchmark_find_files_physical_page(number_of_pages, mets) + # This is not really useful to measure. + # We iterate all files in both cached and non-cached in the same routine + # When no specific search parameters are provided + # benchmark_find_files_all(number_of_pages, mets) + +def benchmark_find_files_filegrp(number_of_pages, mets): + # Best case - first fileGrp + assert_len((number_of_pages * REGIONS_PER_PAGE), mets, dict(fileGrp='SEG-REG')) + # Worst case - does not exist + assert_len(0, mets, dict(fileGrp='SEG-REG-NOTEXIST')) + +def benchmark_find_files_fileid(number_of_pages, mets): + # Best case - first file ID + assert_len(1, mets, dict(ID='FULL_0001_TIF', fileGrp='FULL')) + # Worst case - does not exist + assert_len(0, mets, dict(ID='FULL_0001_TIF-NOTEXISTS', fileGrp='FULL-NOTEXIST')) + +def benchmark_find_files_physical_page(number_of_pages, mets): + # Best case - first physical page + assert_len(FILES_PER_PAGE, mets, dict(pageId='PHYS_0001')) + # Worst case - does not exist + assert_len(0, mets, dict(pageId='PHYS_0001-NOTEXISTS')) + +# Get all files, i.e., pass an empty search parameter -> dict() +def benchmark_find_files_all(number_of_pages, mets): + assert_len((number_of_pages * FILES_PER_PAGE), mets, dict()) + + + + +# ---- BENCHMARKING for 50-500-1000-2000-5000 pages ---- # + +# ----- 50 pages -> build, search, build (cached), search (cached) ----- # +mets_50 = None +@mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b50(benchmark): + @benchmark + def result(): + global mets_50 + mets_50 = _build_mets(50, force=True) + +@mark.benchmark(group="search", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s50(benchmark): + @benchmark + def ret(): + global mets_50 + benchmark_find_files(50, mets_50) +del mets_50 + +mets_c_50 = None +@mark.benchmark(group="build_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b50_c(benchmark): + @benchmark + def result(): + global mets_c_50 + mets_c_50 = _build_mets(50, force=True, cache_flag=True) + +@mark.benchmark(group="search_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s50_c(benchmark): + @benchmark + def ret(): + global mets_c_50 + benchmark_find_files(50, mets_c_50) +del mets_c_50 +# ----------------------------------------------------------------------- # + + + +# ----- 500 pages -> build, search, build (cached), search (cached) ----- # +mets_500 = None +@mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b500(benchmark): + @benchmark + def result(): + global mets_500 + mets_500 = _build_mets(500, force=True) + +@mark.benchmark(group="search", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s500(benchmark): + @benchmark + def ret(): + global mets_500 + benchmark_find_files(500, mets_500) +del mets_500 + + +mets_c_500 = None +@mark.benchmark(group="build_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b500_c(benchmark): + @benchmark + def result(): + global mets_c_500 + mets_c_500 = _build_mets(500, 
force=True, cache_flag=True) + +@mark.benchmark(group="search_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s500_c(benchmark): + @benchmark + def ret(): + global mets_c_500 + benchmark_find_files(500, mets_c_500) +del mets_c_500 + +# ----------------------------------------------------------------------- # + + + +# ----- 1000 pages -> build, search, build (cached), search (cached) ----- # +mets_1000 = None +@mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b1000(benchmark): + @benchmark + def result(): + global mets_1000 + mets_1000 = _build_mets(1000, force=True) + +@mark.benchmark(group="search", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s1000(benchmark): + @benchmark + def ret(): + global mets_1000 + benchmark_find_files(1000, mets_1000) +del mets_1000 + +mets_c_1000 = None +@mark.benchmark(group="build_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b1000_c(benchmark): + @benchmark + def result(): + global mets_c_1000 + mets_c_1000 = _build_mets(1000, force=True, cache_flag=True) + +@mark.benchmark(group="search_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s1000_c(benchmark): + @benchmark + def ret(): + global mets_c_1000 + benchmark_find_files(1000, mets_c_1000) +del mets_c_1000 + +# ------------------------------------------------------------------------ # + + + +# ----- 2000 pages -> build, search, build (cached), search (cached) ----- # +mets_2000 = None +@mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b2000(benchmark): + @benchmark + def result(): + global mets_2000 + mets_2000 = _build_mets(2000, force=True) + +@mark.benchmark(group="search", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s2000(benchmark): + @benchmark + def ret(): + global mets_2000 + benchmark_find_files(2000, mets_2000) +del mets_2000 + +mets_c_2000 = None +@mark.benchmark(group="build_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b2000_c(benchmark): + @benchmark + def result(): + global mets_c_2000 + mets_c_2000 = _build_mets(2000, force=True, cache_flag=True) + +@mark.benchmark(group="search_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s2000_c(benchmark): + @benchmark + def ret(): + global mets_c_2000 + benchmark_find_files(2000, mets_c_2000) +del mets_c_2000 + +# ------------------------------------------------------------------------ # + + + +# ----- 5000 pages -> build, search, build (cached), search (cached) ----- # +mets_5000 = None +@mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b5000(benchmark): + @benchmark + def result(): + global mets_5000 + mets_5000 = _build_mets(5000, force=True) + +@mark.benchmark(group="search", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s5000(benchmark): + @benchmark + def ret(): + global mets_5000 + benchmark_find_files(5000, mets_5000) +del mets_5000 + +mets_c_5000 = None +@mark.benchmark(group="build_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b5000_c(benchmark): + @benchmark + def result(): + global mets_c_5000 + mets_c_5000 = _build_mets(5000, force=True, cache_flag=True) + +@mark.benchmark(group="search_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s5000_c(benchmark): + @benchmark + def ret(): + global 
mets_c_5000 + benchmark_find_files(5000, mets_c_5000) +del mets_c_5000 + +# ------------------------------------------------------------------------ # + +if __name__ == '__main__': + args = [''] + # args.append('--benchmark-max-time=10') + # args.append('--benchmark-min-time=0.1') + # args.append('--benchmark-warmup=False') + # args.append('--benchmark-disable-gc') + args.append('--benchmark-verbose') + args.append('--benchmark-min-rounds=1') + args.append('--tb=short') + main(args) diff --git a/tests/model/mets_bench_extreme_additional.py b/tests/model/mets_bench_extreme_additional.py new file mode 100644 index 0000000000..e699454e2b --- /dev/null +++ b/tests/model/mets_bench_extreme_additional.py @@ -0,0 +1,123 @@ +# -*- coding: utf-8 -*- + +from contextlib import contextmanager +from time import time + +from pytest import main, fixture, mark + +from ocrd import Resolver +from ocrd_utils import MIME_TO_EXT, getLogger +from ocrd_models import OcrdMets + +logger = getLogger('ocrd.benchmark.mets') + +GRPS_REG = ['SEG-REG', 'SEG-REPAIR', 'SEG-REG-DESKEW', 'SEG-REG-DESKEW-CLIP', 'SEG-LINE', 'SEG-REPAIR-LINE', 'SEG-LINE-RESEG-DEWARP'] +GRPS_IMG = ['FULL', 'PRESENTATION', 'BIN', 'CROP', 'BIN2', 'BIN-DENOISE', 'BIN-DENOISE-DESKEW', 'OCR'] + +# 1500 files per page +REGIONS_PER_PAGE = 100 +LINES_PER_REGION = 100 +FILES_PER_PAGE = len(GRPS_IMG) * LINES_PER_REGION + len(GRPS_REG) * REGIONS_PER_PAGE + +# Caching is disabled by default +def _build_mets(number_of_pages, force=False, cache_flag=False): + mets = OcrdMets.empty_mets(cache_flag=cache_flag) + mets._number_of_pages = number_of_pages + + for n in ['%04d' % (n + 1) for n in range(number_of_pages)]: + _add_file = lambda n, fileGrp, mimetype, ID=None: mets.add_file( + fileGrp, + mimetype=mimetype, + pageId='PHYS_%s' % n, + ID=ID if ID else '%s_%s_%s' % (fileGrp, n, MIME_TO_EXT.get(mimetype)[1:].upper()), + url='%s/%s%s' % (fileGrp, ID if ID else '%s_%s_%s' % (fileGrp, n, MIME_TO_EXT.get(mimetype)[1:].upper()), MIME_TO_EXT.get(mimetype)) + ) + for grp in GRPS_IMG: + _add_file(n, grp, 'image/tiff') + _add_file(n, grp, 'application/vnd.prima.page+xml') + for grp in GRPS_REG: + for region_n in range(REGIONS_PER_PAGE): + _add_file(n, grp, 'image/png', '%s_%s_region%s' % (grp, n, region_n)) + + return mets + +def assert_len(expected_len, mets, kwargs): + test_list = mets.find_all_files(**kwargs) + assert expected_len == len(test_list) + +def benchmark_find_files(number_of_pages, mets): + benchmark_find_files_filegrp(number_of_pages, mets) + benchmark_find_files_fileid(number_of_pages, mets) + benchmark_find_files_physical_page(number_of_pages, mets) + # This is not really useful to measure. 
+ # We iterate all files in both cached and non-cached in the same routine + # When no specific search parameters are provided + # benchmark_find_files_all(number_of_pages, mets) + +def benchmark_find_files_filegrp(number_of_pages, mets): + # Best case - first fileGrp + assert_len((number_of_pages * REGIONS_PER_PAGE), mets, dict(fileGrp='SEG-REG')) + # Worst case - does not exist + assert_len(0, mets, dict(fileGrp='SEG-REG-NOTEXIST')) + +def benchmark_find_files_fileid(number_of_pages, mets): + # Best case - first file ID + assert_len(1, mets, dict(ID='FULL_0001_TIF', fileGrp='FULL')) + # Worst case - does not exist + assert_len(0, mets, dict(ID='FULL_0001_TIF-NOTEXISTS', fileGrp='FULL-NOTEXIST')) + +def benchmark_find_files_physical_page(number_of_pages, mets): + # Best case - first physical page + assert_len(FILES_PER_PAGE, mets, dict(pageId='PHYS_0001')) + # Worst case - does not exist + assert_len(0, mets, dict(pageId='PHYS_0001-NOTEXISTS')) + +# Get all files, i.e., pass an empty search parameter -> dict() +def benchmark_find_files_all(number_of_pages, mets): + assert_len((number_of_pages * FILES_PER_PAGE), mets, dict()) + +# ----- 500 pages -> build, search, build (cached), search (cached) ----- # +mets_500 = None +@mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b500(benchmark): + @benchmark + def result(): + global mets_500 + mets_500 = _build_mets(500, force=True) + +@mark.benchmark(group="search", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s500(benchmark): + @benchmark + def ret(): + global mets_500 + benchmark_find_files(500, mets_500) +del mets_500 + +mets_c_500 = None +@mark.benchmark(group="build_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_b500_c(benchmark): + @benchmark + def result(): + global mets_c_500 + mets_c_500 = _build_mets(500, force=True, cache_flag=True) + +@mark.benchmark(group="search_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False) +def test_s500_c(benchmark): + @benchmark + def ret(): + global mets_c_500 + benchmark_find_files(500, mets_c_500) +del mets_c_500 + +# ------------------------------------------------------------------------ # + +if __name__ == '__main__': + args = [''] + # args.append('--benchmark-max-time=10') + # args.append('--benchmark-min-time=0.1') + # args.append('--benchmark-warmup=False') + # args.append('--benchmark-disable-gc') + args.append('--benchmark-verbose') + args.append('--benchmark-min-rounds=1') + args.append('--tb=short') + main(args) diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index e91ab66142..46ec597ef0 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -3,6 +3,7 @@ from datetime import datetime from os.path import join +from os import environ from contextlib import contextmanager import shutil from logging import StreamHandler @@ -26,11 +27,13 @@ import pytest +CACHING_ENABLED = [False, True] -@pytest.fixture(name='sbb_sample_01') -def _fixture(): + +@pytest.fixture(name='sbb_sample_01', params=CACHING_ENABLED) +def _fixture(request): mets = OcrdMets(filename=assets.url_of( - 'SBB0000F29300010000/data/mets.xml')) + 'SBB0000F29300010000/data/mets.xml'), cache_flag=request.param) yield mets @@ -52,8 +55,10 @@ def test_unique_identifier_from_nothing(): def test_str(): - mets = OcrdMets(content='') - assert str(mets) == 'OcrdMets[fileGrps=[],files=[]]' + mets = OcrdMets(content='', cache_flag=False) + assert str(mets) == 
'OcrdMets[cached=False,fileGrps=[],files=[]]' + mets_cached = OcrdMets(content='', cache_flag=True) + assert str(mets_cached) == 'OcrdMets[cached=True,fileGrps=[],files=[]]' def test_file_groups(sbb_sample_01): @@ -74,7 +79,8 @@ def test_find_all_files(sbb_sample_01): assert len(sbb_sample_01.find_all_files(url='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"' assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002' - + assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_0001 and PHYS_0005 (two regexes)' + assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' def test_find_all_files_local_only(sbb_sample_01): assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001', @@ -149,10 +155,15 @@ def test_add_file_id_already_exists(sbb_sample_01): # Works but is unwise, there are now two files with clashing ID in METS f2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", ignore=True) - assert len(list(sbb_sample_01.find_files(ID='best-id-ever'))) == 2 + assert len(list(sbb_sample_01.find_files(ID='best-id-ever'))) == (1 if sbb_sample_01._cache_flag else 2) - # Works because fileGrp, mimetype and pageId(== None) match and force is set - f2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop", force=True) + if sbb_sample_01._cache_flag: + # Does not work with caching + with pytest.raises(FileExistsError): + sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop", force=True) + else: + # Works because fileGrp, mimetype and pageId(== None) match and force is set + f2 = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop", force=True) # Previous step removed duplicate mets:file assert len(list(sbb_sample_01.find_files(ID='best-id-ever'))) == 1 @@ -177,7 +188,7 @@ def test_add_file_ignore(sbb_sample_01: OcrdMets): # how many files inserted the_files = list(sbb_sample_01.find_files(ID='best-id-ever')) - assert len(the_files) == 2 + assert len(the_files) == (1 if sbb_sample_01._cache_flag else 2) def test_add_file_id_invalid(sbb_sample_01): @@ -345,6 +356,25 @@ def test_invalid_filegrp(): assert "Invalid syntax for mets:fileGrp/@USE" in str(val_err.value) +@contextmanager +def temp_env_var(k, v): + v_before = environ.get(k, None) + environ[k] = v + yield + if v_before is not None: + environ[k] = v_before + else: + del environ[k] + +def test_envvar(): + assert OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=True)._cache_flag + assert not OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=False)._cache_flag + with temp_env_var('OCRD_METS_CACHING', 'true'): + assert OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=True)._cache_flag + assert OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=False)._cache_flag + with temp_env_var('OCRD_METS_CACHING', 'false'): + assert not OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=True)._cache_flag + assert not OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=False)._cache_flag if __name__ == '__main__': main(__file__) diff --git 
a/tests/model/test_ocrd_mets_bench.py b/tests/model/test_ocrd_mets_bench.py new file mode 100644 index 0000000000..ace6387336 --- /dev/null +++ b/tests/model/test_ocrd_mets_bench.py @@ -0,0 +1,257 @@ +# -*- coding: utf-8 -*- + +from contextlib import contextmanager +from time import time + +from pytest import main, fixture, mark + +from ocrd import Resolver +from ocrd_utils import MIME_TO_EXT, getLogger +from ocrd_models import OcrdMets + +import pprint + + +# LOG = getLogger('ocrd.benchmark.mets') + +GRPS_REG = ['SEG-REG', 'SEG-REPAIR', 'SEG-REG-DESKEW', 'SEG-REG-DESKEW-CLIP', 'SEG-LINE', 'SEG-REPAIR-LINE', 'SEG-LINE-RESEG-DEWARP'] +GRPS_IMG = ['FULL', 'PRESENTATION', 'BIN', 'CROP', 'BIN2', 'BIN-DENOISE', 'BIN-DENOISE-DESKEW', 'OCR'] + +REGIONS_PER_PAGE = 10 +LINES_PER_REGION = 2 +FILES_PER_PAGE = len(GRPS_IMG) * LINES_PER_REGION + len(GRPS_REG) * REGIONS_PER_PAGE + +# Caching is disabled by default +def _build_mets(number_of_pages, force=False, cache_flag=False): + mets = OcrdMets.empty_mets(cache_flag=cache_flag) + mets._number_of_pages = number_of_pages + + for n in ['%04d' % (n + 1) for n in range(number_of_pages)]: + _add_file = lambda n, fileGrp, mimetype, ID=None: mets.add_file( + fileGrp, + mimetype=mimetype, + pageId='PHYS_%s' % n, + ID=ID if ID else '%s_%s_%s' % (fileGrp, n, MIME_TO_EXT.get(mimetype)[1:].upper()), + url='%s/%s%s' % (fileGrp, ID if ID else '%s_%s_%s' % (fileGrp, n, MIME_TO_EXT.get(mimetype)[1:].upper()), MIME_TO_EXT.get(mimetype)) + ) + for grp in GRPS_IMG: + # LINES_PER_REGION = 2 + _add_file(n, grp, 'image/tiff') + _add_file(n, grp, 'application/vnd.prima.page+xml') + for grp in GRPS_REG: + # REGIONS_PER_PAGE = 10 + for region_n in range(REGIONS_PER_PAGE): + _add_file(n, grp, 'image/png', '%s_%s_region%s' % (grp, n, region_n)) + + return mets + +def assert_len(expected_len, mets, kwargs): + test_list = mets.find_all_files(**kwargs) + assert expected_len == len(test_list) + +def benchmark_find_files(number_of_pages, mets): + benchmark_find_files_filegrp(number_of_pages, mets) + benchmark_find_files_fileid(number_of_pages, mets) + benchmark_find_files_all(number_of_pages, mets) + +def benchmark_find_files_filegrp(number_of_pages, mets): + # Best case - first fileGrp + assert_len((number_of_pages * REGIONS_PER_PAGE), mets, dict(fileGrp='SEG-REG')) + # Worst case - does not exist + assert_len(0, mets, dict(fileGrp='SEG-REG-NOTEXIST')) + +def benchmark_find_files_fileid(number_of_pages, mets): + # Best case - first file ID + assert_len(1, mets, dict(ID='FULL_0001_TIF')) + # Worst case - does not exist + assert_len(0, mets, dict(ID='FULL_0001_TIF-NOTEXISTS')) + +def benchmark_find_files_physical_page(number_of_pages, mets): + # Best case - first physical page + assert_len(1, mets, dict(pageId='PHYS_0001')) + # Worst case - does not exist + assert_len(0, mets, dict(pageId='PHYS_0001-NOTEXISTS')) + +# Get all files, i.e., pass an empty search parameter -> dict() +def benchmark_find_files_all(number_of_pages, mets): + assert_len((number_of_pages * FILES_PER_PAGE), mets, dict()) + + +# ----- METS files global variables ----- # +mets_5 = None +mets_10 = None +mets_20 = None +mets_50 = None + +# ----- Build mets files with 5-10-20-50-200 pages ----- # +@mark.benchmark(group="build") +def test_b5(benchmark): + @benchmark + def result(): + global mets_5 + mets_5 = _build_mets(5, force=True) + +@mark.benchmark(group="build") +def test_b10(benchmark): + @benchmark + def result(): + global mets_10 + mets_10 = _build_mets(10, force=True) + +@mark.benchmark(group="build") +def 
test_b20(benchmark): + @benchmark + def result(): + global mets_20 + mets_20 = _build_mets(20, force=True) + +@mark.benchmark(group="build") +def test_b50(benchmark): + @benchmark + def result(): + global mets_50 + mets_50 = _build_mets(50, force=True) + + +# ----- Search for files with 5-10-20-50-200 pages ----- # +@mark.benchmark(group="search") +def test_s5(benchmark): + @benchmark + def ret(): + global mets_5 + benchmark_find_files(5, mets_5) + +@mark.benchmark(group="search") +def test_s10(benchmark): + @benchmark + def ret(): + global mets_10 + benchmark_find_files(10, mets_10) + +@mark.benchmark(group="search") +def test_s20(benchmark): + @benchmark + def ret(): + global mets_20 + benchmark_find_files(20, mets_20) + +@mark.benchmark(group="search") +def test_s50(benchmark): + @benchmark + def ret(): + global mets_50 + benchmark_find_files(50, mets_50) + + +del mets_5 +del mets_10 +del mets_20 +del mets_50 + + +# ----- METS files (cached) global variables ----- # +mets_c_5 = None +mets_c_10 = None +mets_c_20 = None +mets_c_50 = None + +# ----- Build mets files (cached) with 5-10-20-50-200 pages ----- # +@mark.benchmark(group="build") +def test_b5_c(benchmark): + @benchmark + def result(): + global mets_c_5 + mets_c_5 = _build_mets(5, force=True, cache_flag=True) + +@mark.benchmark(group="build") +def test_b10_c(benchmark): + @benchmark + def result(): + global mets_c_10 + mets_c_10 = _build_mets(10, force=True, cache_flag=True) + +@mark.benchmark(group="build") +def test_b20_c(benchmark): + @benchmark + def result(): + global mets_c_20 + mets_c_20 = _build_mets(20, force=True, cache_flag=True) + +@mark.benchmark(group="build") +def test_b50_c(benchmark): + @benchmark + def result(): + global mets_c_50 + mets_c_50 = _build_mets(50, force=True, cache_flag=True) + + +# ----- Search for files (cached) with 5-10-20-50-200 pages ----- # +@mark.benchmark(group="search") +def test_s5_c(benchmark): + @benchmark + def ret(): + global mets_c_5 + benchmark_find_files(5, mets_c_5) + +@mark.benchmark(group="search") +def test_s10_c(benchmark): + @benchmark + def ret(): + global mets_c_10 + benchmark_find_files(10, mets_c_10) + +@mark.benchmark(group="search") +def test_s20_c(benchmark): + @benchmark + def ret(): + global mets_c_20 + benchmark_find_files(20, mets_c_20) + +@mark.benchmark(group="search") +def test_s50_c(benchmark): + @benchmark + def ret(): + global mets_c_50 + benchmark_find_files(50, mets_c_50) + +del mets_c_5 +del mets_c_10 +del mets_c_20 +del mets_c_50 + +def manual_t(): + mets = _build_mets(2, cache_flag=False) + mets_cached = _build_mets(2, cache_flag=True) + + # print("METS>--------------------------------------------------------------------") + # print(mets) + # print("-------------------------------------------------------------------------") + # print("METS_cached>-------------------------------------------------------------") + # print(mets_cached) + + print("-----Regular-Bench------------------------------------------------------------") + benchmark_find_files(2, mets) + print("-----Cached-Bench-------------------------------------------------------------") + benchmark_find_files(2, mets_cached) + + print("-----Regular------------------------------------------------------------------") + print("len=%d" % len(mets.find_all_files(fileGrp='SEG-REG'))) + print(mets.find_all_files(fileGrp='SEG-REG')) + + print("-----Cached-------------------------------------------------------------------") + print("len=%d" % len(mets_cached.find_all_files(fileGrp='SEG-REG'))) + 
print(mets_cached.find_all_files(fileGrp='SEG-REG')) + +if __name__ == '__main__': + args = [''] + # args.append('--benchmark-max-time=10') + # args.append('--benchmark-min-time=0.1') + # args.append('--benchmark-warmup=True') + # args.append('--benchmark-disable-gc') + args.append('--benchmark-verbose') + args.append('--benchmark-min-rounds=1') + args.append('--tb=short') + main(args) + + # This function was used to manually test things + # manual_t() diff --git a/tests/test_utils.py b/tests/test_utils.py index 467724bd40..b04a9a2722 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -315,8 +315,10 @@ def test_make_file_id_744(self): def test_generate_range(self): assert generate_range('PHYS_0001', 'PHYS_0005') == ['PHYS_0001', 'PHYS_0002', 'PHYS_0003', 'PHYS_0004', 'PHYS_0005'] - with self.assertRaisesRegex(ValueError, 'Unable to generate range'): + with self.assertRaisesRegex(ValueError, 'could not find numeric part'): generate_range('NONUMBER', 'ALSO_NONUMBER') + with self.assertRaisesRegex(ValueError, 'evaluates to the same number'): + generate_range('PHYS_0001_123', 'PHYS_0010_123') def test_safe_filename(self): assert safe_filename('Hello world,!') == 'Hello_world_'
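As a closing illustration (editorial, not part of the diff): a minimal sketch of the cached workflow these changes enable. The fileGrp name, file IDs and URLs below are invented for the example:

```python
from ocrd_models import OcrdMets

# Build a small METS in memory with the new caching enabled.
mets = OcrdMets.empty_mets(cache_flag=True)
for n in (1, 2):
    mets.add_file('OCR-D-IMG', mimetype='image/tiff',
                  ID='OCR-D-IMG_%04d' % n, pageId='PHYS_%04d' % n,
                  url='OCR-D-IMG/OCR-D-IMG_%04d.tif' % n)

# These lookups are now answered from the file/page/fptr caches
# instead of XPath queries over the element tree:
assert len(mets.find_all_files(fileGrp='OCR-D-IMG')) == 2
assert len(mets.find_all_files(ID='OCR-D-IMG_0001', fileGrp='OCR-D-IMG')) == 1
assert len(mets.find_all_files(pageId='PHYS_0001..PHYS_0002')) == 2
assert len(mets.find_all_files(pageId='//PHYS_000(1|2)')) == 2
```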