From 862ece1c15d65881c3b8b88cb28946eaea11b471 Mon Sep 17 00:00:00 2001
From: Konstantin Baierer
Date: Tue, 29 Nov 2022 18:25:05 +0100
Subject: [PATCH 1/2] OcrdMets.remove_file_group with caching: shallow copy DictValues

---
 ocrd_models/ocrd_models/ocrd_mets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py
index 777c9ee53f..55027a2283 100644
--- a/ocrd_models/ocrd_models/ocrd_mets.py
+++ b/ocrd_models/ocrd_models/ocrd_mets.py
@@ -414,7 +414,7 @@ def remove_file_group(self, USE, recursive=False, force=False):
         if files:
             if not recursive:
                 raise Exception("fileGrp %s is not empty and recursive wasn't set" % USE)
-            for f in files:
+            for f in list(files):
                 self.remove_one_file(ID=f.get('ID'), fileGrp=f.getparent().get('USE'))
 
         if self._cache_flag:

From d0b551327f043608f8b1fd64fb08a494e5b42600 Mon Sep 17 00:00:00 2001
From: Konstantin Baierer
Date: Tue, 29 Nov 2022 18:29:25 +0100
Subject: [PATCH 2/2] OcrdMets: fix (return) types when caching is on, #957

---
 ocrd_models/ocrd_models/ocrd_mets.py |  4 ++--
 tests/model/test_ocrd_mets.py        | 23 ++++++++++++-----------
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py
index 55027a2283..3bf8df39ae 100644
--- a/ocrd_models/ocrd_models/ocrd_mets.py
+++ b/ocrd_models/ocrd_models/ocrd_mets.py
@@ -557,7 +557,7 @@ def physical_pages(self):
         List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``)
         """
         if self._cache_flag:
-            return self._page_cache.values()
+            return list(self._page_cache.keys())
 
         return self._tree.getroot().xpath(
             'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/@ID',
@@ -685,7 +685,7 @@ def remove_physical_page(self, ID):
         mets_div = self._tree.getroot().xpath(
             'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][@ID="%s"]' % ID,
             namespaces=NS)
-        if mets_div is not None:
+        if mets_div:
             mets_div[0].getparent().remove(mets_div[0])
         if self._cache_flag:
             del self._page_cache[ID]
diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py
index 46ec597ef0..7e5ab1851f 100644
--- a/tests/model/test_ocrd_mets.py
+++ b/tests/model/test_ocrd_mets.py
@@ -37,6 +37,15 @@ def _fixture(request):
     yield mets
 
 
+@pytest.fixture(name='sbb_directory_ocrd_mets', params=CACHING_ENABLED)
+def _fixture_sbb(tmp_path, request):
+    src_path = assets.path_to('SBB0000F29300010000/data')
+    dst_path = tmp_path / 'SBB_directory'
+    shutil.copytree(src_path, dst_path)
+    mets_path = str(join(dst_path, 'mets.xml'))
+    yield OcrdMets(filename=mets_path, cache_flag=request.param)
+
+
 def test_unique_identifier():
     mets = OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'))
     assert mets.unique_identifier == 'http://resolver.staatsbibliothek-berlin.de/SBB0000F29300010000', 'Right identifier'
@@ -89,7 +98,8 @@ def test_find_all_files_local_only(sbb_sample_01):
 
 def test_physical_pages(sbb_sample_01):
     assert len(sbb_sample_01.physical_pages) == 3, '3 physical pages'
-
+    assert isinstance(sbb_sample_01.physical_pages, list)
+    assert isinstance(sbb_sample_01.physical_pages[0], str)
 
 def test_physical_pages_from_empty_mets():
     mets = OcrdMets(content="")
@@ -98,15 +108,6 @@ def test_physical_pages_from_empty_mets():
     assert len(mets.physical_pages) == 1, '1 physical page'
 
 
-@pytest.fixture(name='sbb_directory_ocrd_mets')
-def _fixture_sbb(tmp_path):
-    src_path = assets.path_to('SBB0000F29300010000/data')
-    dst_path = tmp_path / 'SBB_directory'
-    shutil.copytree(src_path, dst_path)
-    mets_path = str(join(dst_path, 'mets.xml'))
-    yield OcrdMets(filename=mets_path)
-
-
 def test_physical_pages_for_fileids(sbb_directory_ocrd_mets):
     assert sbb_directory_ocrd_mets.get_physical_pages(
         for_fileIds=['FILE_0002_IMAGE']) == ['PHYS_0002']
@@ -265,6 +266,7 @@ def test_remove_page(sbb_directory_ocrd_mets):
 def test_remove_physical_page_fptr(sbb_directory_ocrd_mets):
     assert sbb_directory_ocrd_mets.get_physical_pages(for_fileIds=['FILE_0002_IMAGE']), ['PHYS_0002']
     sbb_directory_ocrd_mets.remove_physical_page_fptr('FILE_0002_IMAGE')
+    sbb_directory_ocrd_mets.remove_physical_page_fptr('FILE_0002_IMAGE')
     assert sbb_directory_ocrd_mets.get_physical_pages(for_fileIds=['FILE_0002_IMAGE']), [None]
 
 
@@ -346,7 +348,6 @@ def test_merge(sbb_sample_01):
     sbb_sample_01.merge(other_mets, fileGrp_mapping={'OCR-D-IMG': 'FOO'})
     assert len(sbb_sample_01.file_groups) == 18
 
-
 def test_invalid_filegrp():
     """addresses https://github.com/OCR-D/core/issues/746"""
 