diff --git a/energyml-utils/.flake8 b/energyml-utils/.flake8 index f5c763f..07de32c 100644 --- a/energyml-utils/.flake8 +++ b/energyml-utils/.flake8 @@ -1,6 +1,6 @@ [flake8] # Ignore specific error codes (comma-separated list) -ignore = E501, E722 #, W503, F403 +ignore = E501, E722, W503, F403, E203, E202 # Max line length (default is 79, can be changed) max-line-length = 120 diff --git a/energyml-utils/example/epc_rels_management_example.py b/energyml-utils/example/epc_rels_management_example.py new file mode 100644 index 0000000..d177c2b --- /dev/null +++ b/energyml-utils/example/epc_rels_management_example.py @@ -0,0 +1,174 @@ +""" +Example: Managing .rels files in EPC files using EpcStreamReader + +This example demonstrates the new .rels management capabilities: +1. Removing objects without breaking .rels files +2. Cleaning orphaned relationships +3. Rebuilding all .rels files from scratch +""" + +import sys +from pathlib import Path + +# Add src directory to path +src_path = Path(__file__).parent.parent / "src" +sys.path.insert(0, str(src_path)) + +from energyml.utils.epc_stream import EpcStreamReader + + +def example_workflow(epc_path: str): + """ + Complete workflow example for .rels management. + """ + print(f"Opening EPC file: {epc_path}") + reader = EpcStreamReader(epc_path) + print(f"Loaded {len(reader)} objects\n") + + # ============================================================ + # Scenario 1: Remove objects without breaking .rels + # ============================================================ + print("=" * 70) + print("SCENARIO 1: Remove objects (keeps .rels intact)") + print("=" * 70) + + # Get some objects to remove + objects_to_remove = list(reader._metadata.keys())[-3:] + print(f"\nRemoving {len(objects_to_remove)} objects:") + + for obj_id in objects_to_remove: + print(f" - {obj_id}") + reader.remove_object(obj_id) + + print(f"\nRemaining objects: {len(reader)}") + print("Note: .rels files still reference removed objects (orphaned relationships)") + + # ============================================================ + # Scenario 2: Clean orphaned relationships + # ============================================================ + print("\n" + "=" * 70) + print("SCENARIO 2: Clean orphaned relationships") + print("=" * 70) + + print("\nCalling clean_rels()...") + clean_stats = reader.clean_rels() + + print("\nCleaning statistics:") + print(f" • .rels files scanned: {clean_stats['rels_files_scanned']}") + print(f" • Orphaned relationships removed: {clean_stats['relationships_removed']}") + print(f" • Empty .rels files deleted: {clean_stats['rels_files_removed']}") + + print("\n✓ Orphaned relationships cleaned!") + + # ============================================================ + # Scenario 3: Rebuild all .rels from scratch + # ============================================================ + print("\n" + "=" * 70) + print("SCENARIO 3: Rebuild all .rels from scratch") + print("=" * 70) + + print("\nCalling rebuild_all_rels()...") + rebuild_stats = reader.rebuild_all_rels(clean_first=True) + + print("\nRebuild statistics:") + print(f" • Objects processed: {rebuild_stats['objects_processed']}") + print(f" • .rels files created: {rebuild_stats['rels_files_created']}") + print(f" • SOURCE relationships: {rebuild_stats['source_relationships']}") + print(f" • DESTINATION relationships: {rebuild_stats['destination_relationships']}") + print( + f" • Total relationships: {rebuild_stats['source_relationships'] + rebuild_stats['destination_relationships']}" + ) + + print("\n✓ All .rels files 
rebuilt!") + + # ============================================================ + # Best Practices + # ============================================================ + print("\n" + "=" * 70) + print("BEST PRACTICES") + print("=" * 70) + + print( + """ + 1. After removing multiple objects: + → Call clean_rels() to remove orphaned relationships + + 2. After modifying many objects or complex operations: + → Call rebuild_all_rels() to ensure consistency + + 3. Regular maintenance: + → Periodically call clean_rels() to keep .rels files tidy + + 4. When in doubt: + → Use rebuild_all_rels() to guarantee correct relationships + """ + ) + + +def quick_clean_example(epc_path: str): + """ + Quick example: Just clean the .rels files. + """ + print("\n" + "=" * 70) + print("QUICK EXAMPLE: Clean .rels in one line") + print("=" * 70) + + reader = EpcStreamReader(epc_path) + stats = reader.clean_rels() + + print(f"\n✓ Cleaned! Removed {stats['relationships_removed']} orphaned relationships") + + +def quick_rebuild_example(epc_path: str): + """ + Quick example: Rebuild all .rels files. + """ + print("\n" + "=" * 70) + print("QUICK EXAMPLE: Rebuild all .rels in one line") + print("=" * 70) + + reader = EpcStreamReader(epc_path) + stats = reader.rebuild_all_rels() + + print( + f"\n✓ Rebuilt! Created {stats['rels_files_created']} .rels files with {stats['source_relationships'] + stats['destination_relationships']} relationships" + ) + + +if __name__ == "__main__": + # Use the test EPC file + test_epc = "wip/BRGM_AVRE_all_march_25.epc" + + if not Path(test_epc).exists(): + print(f"EPC file not found: {test_epc}") + print("Please provide a valid EPC file path") + sys.exit(1) + + # Make a temporary copy for the example + import tempfile + import shutil + + with tempfile.NamedTemporaryFile(delete=False, suffix=".epc") as tmp: + tmp_path = tmp.name + + try: + shutil.copy(test_epc, tmp_path) + + # Run the complete workflow + example_workflow(tmp_path) + + # Show quick examples + shutil.copy(test_epc, tmp_path) + quick_clean_example(tmp_path) + + shutil.copy(test_epc, tmp_path) + quick_rebuild_example(tmp_path) + + print("\n" + "=" * 70) + print("Examples completed successfully!") + print("=" * 70) + + finally: + # Cleanup + if Path(tmp_path).exists(): + Path(tmp_path).unlink() diff --git a/energyml-utils/example/main.py b/energyml-utils/example/main.py index a69274e..6301e7c 100644 --- a/energyml-utils/example/main.py +++ b/energyml-utils/example/main.py @@ -1,9 +1,13 @@ # Copyright (c) 2023-2024 Geosiris. 
# SPDX-License-Identifier: Apache-2.0 -import json +import sys +from pathlib import Path import re from dataclasses import fields +src_path = Path(__file__).parent.parent / "src" +sys.path.insert(0, str(src_path)) + from energyml.eml.v2_3.commonv2 import * from energyml.eml.v2_3.commonv2 import AbstractObject from energyml.resqml.v2_0_1.resqmlv2 import DoubleHdf5Array @@ -17,19 +21,19 @@ ) # from src.energyml.utils.data.hdf import * -from src.energyml.utils.data.helper import get_projected_uom, is_z_reversed -from src.energyml.utils.epc import * -from src.energyml.utils.introspection import * -from src.energyml.utils.manager import * -from src.energyml.utils.serialization import * -from src.energyml.utils.validation import ( +from energyml.utils.data.helper import get_projected_uom, is_z_reversed +from energyml.utils.epc import * +from energyml.utils.introspection import * +from energyml.utils.manager import * +from energyml.utils.serialization import * +from energyml.utils.validation import ( patterns_validation, dor_validation, validate_epc, correct_dor, ) -from src.energyml.utils.xml import * -from src.energyml.utils.data.datasets_io import HDF5FileReader, get_path_in_external_with_path +from energyml.utils.xml import * +from energyml.utils.data.datasets_io import HDF5FileReader, get_path_in_external_with_path fi_cit = Citation( title="An interpretation", @@ -494,5 +498,22 @@ def test_dor_conversion(): ) # print(get_obj_uri(tr201, "coucou")) - print(get_usable_class(tr)) - print(get_usable_class(tr201)) + logging.basicConfig(level=logging.DEBUG) + + emi = create_energyml_object("resqml20.ObjEarthModelInterpretation") + print(type(emi)) + print(serialize_xml(emi)) + + from energyml.resqml.v2_0_1 import resqmlv2 + + emi = resqmlv2.ObjEarthModelInterpretation() + print(type(emi)) + print(serialize_xml(emi)) + + emi = read_energyml_xml_file("C:/Users/Cryptaro/Downloads/emi.xml") + print(type(emi)) + print(serialize_xml(emi)) + + emi = create_energyml_object("resqml20.EarthModelInterpretation") + print(type(emi)) + print(serialize_xml(emi)) diff --git a/energyml-utils/example/main_hdf.py b/energyml-utils/example/main_hdf.py new file mode 100644 index 0000000..ac23ed4 --- /dev/null +++ b/energyml-utils/example/main_hdf.py @@ -0,0 +1,37 @@ +# Copyright (c) 2023-2024 Geosiris. 
+# SPDX-License-Identifier: Apache-2.0 +import sys +from pathlib import Path + +# Add src directory to path +src_path = Path(__file__).parent.parent / "src" +sys.path.insert(0, str(src_path)) + +from energyml.utils.data.datasets_io import get_path_in_external_with_path +from energyml.utils.introspection import get_obj_uri + + +if __name__ == "__main__": + from energyml.utils.epc import Epc + + # Create an EPC file + epc = Epc.read_file("wip/BRGM_AVRE_all_march_25.epc") + + print("\n".join(map(lambda o: str(get_obj_uri(o)), epc.energyml_objects))) + + print(epc.get_h5_file_paths("eml:///resqml22.PolylineSetRepresentation(e75db94d-a251-4f31-8a24-23b9573fbf39)")) + + print( + get_path_in_external_with_path( + epc.get_object_by_identifier( + "eml:///resqml22.PolylineSetRepresentation(e75db94d-a251-4f31-8a24-23b9573fbf39)" + ) + ) + ) + + print( + epc.read_h5_dataset( + "eml:///resqml22.PolylineSetRepresentation(e75db94d-a251-4f31-8a24-23b9573fbf39)", + "/RESQML/e75db94d-a251-4f31-8a24-23b9573fbf39/points_patch0", + ) + ) diff --git a/energyml-utils/example/main_stream.py b/energyml-utils/example/main_stream.py new file mode 100644 index 0000000..b1a712a --- /dev/null +++ b/energyml-utils/example/main_stream.py @@ -0,0 +1,214 @@ +# Copyright (c) 2023-2024 Geosiris. +# SPDX-License-Identifier: Apache-2.0 +import json +import sys +from pathlib import Path +import logging + +import numpy as np + + +src_path = Path(__file__).parent.parent / "src" +sys.path.insert(0, str(src_path)) + +from energyml.utils.introspection import get_obj_uri +from energyml.utils.constants import EpcExportVersion +from energyml.utils.epc_stream import read_epc_stream +from energyml.utils.epc import ( + Epc, + create_energyml_object, + as_dor, + create_h5_external_relationship, + gen_energyml_object_path, +) +from energyml.utils.serialization import serialize_json + + +def test_epc_stream_main(): + logging.basicConfig(level=logging.DEBUG) + + from energyml.resqml.v2_2.resqmlv2 import TriangulatedSetRepresentation, ContactElement + from energyml.eml.v2_3.commonv2 import DataObjectReference + + # Use the test EPC file + test_epc = "wip/my_stream_file.epc" + + if Path(test_epc).exists(): + # delete this file to start fresh + Path(test_epc).unlink() + + epc_stream = read_epc_stream(test_epc, export_version=EpcExportVersion.EXPANDED) + print(f"EPC Stream has {len(epc_stream)} objects:") + + assert len(epc_stream) == 0 + print("✓ EPC Stream is empty as expected.") + print(json.dumps(epc_stream.dumps_epc_content_and_files_lists(), indent=2)) + # Now we will create some objects + + trset: TriangulatedSetRepresentation = create_energyml_object("resqml22.TriangulatedSetRepresentation") + bfi = create_energyml_object("resqml22.BoundaryFeatureInterpretation") + bfi.object_version = "1.0" + bf = create_energyml_object("resqml22.BoundaryFeature") + + trset.represented_object = as_dor(bfi) + bfi.interpreted_feature = as_dor(bf) + + # print(get_dor_obj_info(trset.represented_object)) + # print(get_dor_obj_info(as_dor(bfi, "eml20.DataObjectReference"))) + print(gen_energyml_object_path(trset.represented_object)) + + print("\nCreated objects:") + print(serialize_json(trset)) + print(serialize_json(bfi)) + print(serialize_json(bf)) + + print("=" * 70) + + print("=) Adding TriangulatedSetRepresentation to EPC Stream...") + epc_stream.add_object(trset) + print("Epc dumps after adding TriangulatedSetRepresentation:") + print(json.dumps(epc_stream.dumps_epc_content_and_files_lists(), indent=2)) + + print("=) Adding BoundaryFeatureInterpretation 
to EPC Stream...") + epc_stream.add_object(bfi) + print("Epc dumps after adding BoundaryFeatureInterpretation:") + print(json.dumps(epc_stream.dumps_epc_content_and_files_lists(), indent=2)) + + print("=) Adding BoundaryFeature to EPC Stream...") + epc_stream.add_object(bf) + print("Epc dumps after adding BoundaryFeature:") + print(json.dumps(epc_stream.dumps_epc_content_and_files_lists(), indent=2)) + + print("=) Removing BoundaryFeature to EPC Stream...") + epc_stream.remove_object(get_obj_uri(bf)) + print("Epc dumps after removing BoundaryFeature:") + print(json.dumps(epc_stream.dumps_epc_content_and_files_lists(), indent=2)) + + print("=" * 70, " ARRAYS") + print("HDF5 file paths for TriangulatedSetRepresentation (before adding external rels):") + print(epc_stream.get_h5_file_paths(get_obj_uri(trset))) + + # Now adding rels to external HDF5 file + external_hdf5_path = "wip/external_data.h5" + epc_stream.add_rels_for_object( + trset, + relationships=[create_h5_external_relationship(h5_path=external_hdf5_path)], + ) + epc_stream.add_rels_for_object( + trset, + relationships=[create_h5_external_relationship(h5_path=external_hdf5_path + "_bis.h5")], + ) + + print(epc_stream.get_obj_rels(trset)) + + print("=" * 70, " ARRAYS") + print("HDF5 file paths for TriangulatedSetRepresentation (after adding external rels):") + print(epc_stream.get_h5_file_paths(get_obj_uri(trset))) + + written = epc_stream.write_array(trset, "/MyDataset", array=np.arange(12).reshape((3, 4))) + print(f"Array write successful: {written}") + print("Reading back the written arrays:") + array_read = epc_stream.read_array(trset, "/MyDataset") + print(array_read) + + +def test_epc_im_main(): + logging.basicConfig(level=logging.DEBUG) + + from energyml.resqml.v2_2.resqmlv2 import TriangulatedSetRepresentation, ContactElement + from energyml.eml.v2_3.commonv2 import DataObjectReference + + # Use the test EPC file + test_epc = "wip/my_stream_file.epc" + + if Path(test_epc).exists(): + # delete this file to start fresh + Path(test_epc).unlink() + + epc_im = Epc(epc_file_path=test_epc, export_version=EpcExportVersion.EXPANDED) + print(f"EPC Stream has {len(epc_im)} objects:") + + assert len(epc_im) == 0 + print("✓ EPC Stream is empty as expected.") + print(json.dumps(epc_im.dumps_epc_content_and_files_lists(), indent=2)) + # Now we will create some objects + + trset: TriangulatedSetRepresentation = create_energyml_object("resqml22.TriangulatedSetRepresentation") + bfi = create_energyml_object("resqml22.BoundaryFeatureInterpretation") + bfi.object_version = "1.0" + bf = create_energyml_object("resqml22.BoundaryFeature") + + trset.represented_object = as_dor(bfi) + bfi.interpreted_feature = as_dor(bf) + + # print(get_dor_obj_info(trset.represented_object)) + # print(get_dor_obj_info(as_dor(bfi, "eml20.DataObjectReference"))) + print(gen_energyml_object_path(trset.represented_object)) + + print("\nCreated objects:") + print(serialize_json(trset)) + print(serialize_json(bfi)) + print(serialize_json(bf)) + + print("=" * 70) + + print("=) Adding TriangulatedSetRepresentation to EPC Stream...") + epc_im.add_object(trset) + print("Epc dumps after adding TriangulatedSetRepresentation:") + print(json.dumps(epc_im.dumps_epc_content_and_files_lists(), indent=2)) + + print("=) Adding BoundaryFeatureInterpretation to EPC Stream...") + epc_im.add_object(bfi) + print("Epc dumps after adding BoundaryFeatureInterpretation:") + print(json.dumps(epc_im.dumps_epc_content_and_files_lists(), indent=2)) + + print("=) Adding BoundaryFeature to EPC 
Stream...") + epc_im.add_object(bf) + print("Epc dumps after adding BoundaryFeature:") + print(json.dumps(epc_im.dumps_epc_content_and_files_lists(), indent=2)) + + print("=) Removing BoundaryFeature to EPC Stream...") + epc_im.remove_object(get_obj_uri(bf)) + print("Epc dumps after removing BoundaryFeature:") + print(json.dumps(epc_im.dumps_epc_content_and_files_lists(), indent=2)) + + print("=" * 70, " ARRAYS") + print("HDF5 file paths for TriangulatedSetRepresentation (before adding external rels):") + print(epc_im.get_h5_file_paths(get_obj_uri(trset))) + + # Now adding rels to external HDF5 file + external_hdf5_path = "wip/external_data.h5" + epc_im.add_rels_for_object( + trset, + relationships=[create_h5_external_relationship(h5_path=external_hdf5_path)], + ) + epc_im.add_rels_for_object( + trset, + relationships=[create_h5_external_relationship(h5_path=external_hdf5_path + "_bis.h5")], + ) + + print(epc_im.get_obj_rels(trset)) + + print("=" * 70, " ARRAYS") + print("HDF5 file paths for TriangulatedSetRepresentation (after adding external rels):") + print(epc_im.get_h5_file_paths(get_obj_uri(trset))) + + written = epc_im.write_array(trset, "/MyDataset", array=np.arange(12).reshape((3, 4))) + print(f"Array write successful: {written}") + print("Reading back the written arrays:") + array_read = epc_im.read_array(trset, "/MyDataset") + print(array_read) + + +if __name__ == "__main__": + + print("Testing EPC Stream main...") + test_epc_stream_main() + + print("\n✓ EPC Stream main test completed.") + + print("\n" + "=" * 70) + print("Testing in memory EPC...") + test_epc_im_main() + + print("FIN") diff --git a/energyml-utils/example/tools.py b/energyml-utils/example/tools.py index 819063c..3c889ba 100644 --- a/energyml-utils/example/tools.py +++ b/energyml-utils/example/tools.py @@ -5,6 +5,12 @@ import os import pathlib from typing import Optional, List, Dict, Any +import sys +from pathlib import Path + +# Add src directory to path +src_path = Path(__file__).parent.parent / "src" +sys.path.insert(0, str(src_path)) from energyml.utils.validation import validate_epc @@ -359,7 +365,7 @@ def extract_representation_in_3d_file(): uuid_list=args.uuid, output_folder_path=args.output, file_format=args.file_format, - use_crs_displacement=args.crs, + use_crs_displacement=not args.no_crs, ) diff --git a/energyml-utils/pyproject.toml b/energyml-utils/pyproject.toml index b455c60..a3ff9a8 100644 --- a/energyml-utils/pyproject.toml +++ b/energyml-utils/pyproject.toml @@ -64,13 +64,14 @@ energyml-opc = "^1.12.0" h5py = { version = "^3.7.0", optional = false } pyarrow = { version = "^14.0.1", optional = false } numpy = { version = "^1.16.6", optional = false } +flake8 = "^7.3.0" [tool.poetry.group.dev.dependencies] pandas = { version = "^1.1.0", optional = false } coverage = {extras = ["toml"], version = "^6.2"} pytest = "^8.1.1" pytest-cov = "^4.1.0" -flake8 = "^4.0.0" +flake8 = "^7.3.0" black = "^22.3.0" pylint = "^2.7.2" click = ">=8.1.3, <=8.1.3" # upper version than 8.0.2 fail with black diff --git a/energyml-utils/src/energyml/utils/constants.py b/energyml-utils/src/energyml/utils/constants.py index e8ff266..f2e13d8 100644 --- a/energyml-utils/src/energyml/utils/constants.py +++ b/energyml-utils/src/energyml/utils/constants.py @@ -307,7 +307,7 @@ def parse_content_type(ct: str) -> Optional[re.Match[str]]: """Parse content type using optimized compiled regex""" try: return OptimizedRegex.CONTENT_TYPE.search(ct) - except (TypeError, AttributeError) as e: + except (TypeError, AttributeError): return 
None @@ -315,7 +315,7 @@ def parse_qualified_type(qt: str) -> Optional[re.Match[str]]: """Parse qualified type using optimized compiled regex""" try: return OptimizedRegex.QUALIFIED_TYPE.search(qt) - except (TypeError, AttributeError) as e: + except (TypeError, AttributeError): return None @@ -526,10 +526,11 @@ def _get_property_kind_dict_path_as_str(file_type: str = "xml") -> str: try: import energyml.utils.rc as RC except ImportError: - try: - import src.energyml.utils.rc as RC - except ImportError: - import utils.rc as RC + # try: + import src.energyml.utils.rc as RC + + # except ImportError: + # import utils.rc as RC return files(RC).joinpath(f"PropertyKindDictionary_v2.3.{file_type.lower()}").read_text(encoding="utf-8") except (ImportError, FileNotFoundError, AttributeError) as e: diff --git a/energyml-utils/src/energyml/utils/data/datasets_io.py b/energyml-utils/src/energyml/utils/data/datasets_io.py index 88e8f3b..3325eeb 100644 --- a/energyml-utils/src/energyml/utils/data/datasets_io.py +++ b/energyml-utils/src/energyml/utils/data/datasets_io.py @@ -19,7 +19,6 @@ from energyml.utils.exception import MissingExtraInstallation from energyml.utils.introspection import ( get_obj_uri, - get_obj_uuid, search_attribute_matching_name_with_path, get_object_attribute, search_attribute_matching_name, @@ -31,25 +30,25 @@ import h5py __H5PY_MODULE_EXISTS__ = True -except Exception as e: +except Exception: + h5py = None __H5PY_MODULE_EXISTS__ = False try: import csv __CSV_MODULE_EXISTS__ = True -except Exception as e: +except Exception: __CSV_MODULE_EXISTS__ = False try: import pandas as pd import pyarrow as pa import pyarrow.parquet as pq - from pandas import DataFrame # import pyarrow.feather as feather __PARQUET_MODULE_EXISTS__ = True -except Exception as e: +except Exception: __PARQUET_MODULE_EXISTS__ = False # HDF5 @@ -62,10 +61,10 @@ def h5_list_datasets(h5_file_path: Union[BytesIO, str]) -> List[str]: :return: List of dataset names in the HDF5 file """ res = [] - with h5py.File(h5_file_path, "r") as f: + with h5py.File(h5_file_path, "r") as f: # type: ignore # Function to print the names of all datasets def list_datasets(name, obj): - if isinstance(obj, h5py.Dataset): # Check if the object is a dataset + if isinstance(obj, h5py.Dataset): # Check if the object is a dataset # type: ignore res.append(name) # Visit all items in the HDF5 file and apply the list function @@ -73,14 +72,14 @@ def list_datasets(name, obj): return res @dataclass - class HDF5FileReader(DatasetReader): - def read_array(self, source: Union[BytesIO, str], path_in_external_file: str) -> Optional[List[Any]]: - with h5py.File(source, "r") as f: + class HDF5FileReader(DatasetReader): # noqa: F401 + def read_array(self, source: Union[BytesIO, str], path_in_external_file: str) -> Optional[np.ndarray]: + with h5py.File(source, "r") as f: # type: ignore d_group = f[path_in_external_file] - return d_group[()].tolist() + return d_group[()] # type: ignore - def get_array_dimension(self, source: Union[BytesIO, str], path_in_external_file: str) -> Optional[List[Any]]: - with h5py.File(source, "r") as f: + def get_array_dimension(self, source: Union[BytesIO, str], path_in_external_file: str) -> Optional[List[int]]: + with h5py.File(source, "r") as f: # type: ignore return list(f[path_in_external_file].shape) def extract_h5_datasets( @@ -99,8 +98,8 @@ def extract_h5_datasets( if h5_datasets_paths is None: h5_datasets_paths = h5_list_datasets(input_h5) if len(h5_datasets_paths) > 0: - with h5py.File(output_h5, "a") as f_dest: - with 
h5py.File(input_h5, "r") as f_src: + with h5py.File(output_h5, "a") as f_dest: # type: ignore + with h5py.File(input_h5, "r") as f_src: # type: ignore for dataset in h5_datasets_paths: f_dest.create_dataset(dataset, data=f_src[dataset]) @@ -117,7 +116,10 @@ def write_array( if isinstance(array, list): array = np.asarray(array) print("writing array", target) - with h5py.File(target, "a") as f: + if dtype is not None and not isinstance(dtype, np.dtype): + dtype = np.dtype(dtype) + + with h5py.File(target, "a") as f: # type: ignore # print(array.dtype, h5py.string_dtype(), array.dtype == 'O') # print("\t", dtype or (h5py.string_dtype() if array.dtype == '0' else array.dtype)) if isinstance(array, np.ndarray) and array.dtype == "O": @@ -129,10 +131,10 @@ else: class HDF5FileReader: - def read_array(self, source: Union[BytesIO, str], path_in_external_file: str) -> Optional[List[Any]]: + def read_array(self, source: Union[BytesIO, str], path_in_external_file: str) -> Optional[np.ndarray]: raise MissingExtraInstallation(extra_name="hdf5") - def get_array_dimension(self, source: Union[BytesIO, str], path_in_external_file: str) -> Optional[List[Any]]: + def get_array_dimension(self, source: Union[BytesIO, str], path_in_external_file: str) -> Optional[np.ndarray]: raise MissingExtraInstallation(extra_name="hdf5") def extract_h5_datasets( @@ -243,7 +245,7 @@ def read_array( c = source.readline() while c.startswith("#"): s_pos = source.tell() - comments += c + comments += str(c) c = source.readline() source.seek(s_pos) @@ -254,8 +256,8 @@ if len(comments) > 0: _delim = re.search(r'Default\s+delimiter:\s*"(?P<delim>[^"])"', comments, re.IGNORECASE) - logging.debug("delim", _delim, _delim.group("delim")) if _delim is not None: + logging.debug("delim %s %s", _delim, _delim.group("delim")) _delim = _delim.group("delim") logging.debug(_delim, "<==") if len(_delim) > 0: @@ -299,7 +301,7 @@ def read_array( array = csv.reader(source, delimiter=delimiter, **fmtparams) if path_in_external_file is not None and array is not None: idx = int(path_in_external_file) - return [row[idx] for row in list(filter(lambda l: len(l) > 0, list(array)))] + return [row[idx] for row in list(filter(lambda line: len(line) > 0, list(array)))] else: return list(array) @@ -358,7 +360,7 @@ def read_array( idx = int(path_in_external_file) # for row in list(array): # print(len(row)) - return [row[idx] for row in list(filter(lambda l: len(l) > 0, list(array)))] + return [row[idx] for row in list(filter(lambda line: len(line) > 0, list(array)))] else: return list(array) diff --git a/energyml-utils/src/energyml/utils/data/helper.py b/energyml-utils/src/energyml/utils/data/helper.py index f0a9aa1..febba46 100644 --- a/energyml-utils/src/energyml/utils/data/helper.py +++ b/energyml-utils/src/energyml/utils/data/helper.py @@ -5,6 +5,8 @@ import sys from typing import Any, Optional, Callable, List, Union +import numpy as np + from .datasets_io import read_external_dataset_array from ..constants import flatten_concatenation from ..epc import get_obj_identifier @@ -20,6 +22,7 @@ get_object_attribute_rgx, ) from ..workspace import EnergymlWorkspace +from .datasets_io import get_path_in_external_with_path _ARRAY_NAMES_ = [ "BooleanArrayFromDiscretePropertyArray", @@ -194,7 +197,9 @@ def sum_lists(l1: List, l2: List): :param l2: :return: """ - return [l1[i] + l2[i] for i in range(min(len(l1), len(l2)))] + max(l1, l2, key=len)[min(len(l1), len(l2)) :] + return [l1[i] + l2[i] for i in range(min(len(l1), len(l2)))] + max(l1, l2, 
key=len)[ + min(len(l1), len(l2)) : # noqa: E203 + ] def get_crs_obj( @@ -290,7 +295,7 @@ def read_external_array( path_in_root: Optional[str] = None, workspace: Optional[EnergymlWorkspace] = None, sub_indices: List[int] = None, -) -> List[Any]: +) -> Union[List[Any], np.ndarray]: """ Read an external array (BooleanExternalArray, BooleanHdf5Array, DoubleHdf5Array, IntegerHdf5Array, StringExternalArray ...) :param energyml_array: @@ -301,11 +306,25 @@ def read_external_array( """ array = None if workspace is not None: - array = workspace.read_external_array( - energyml_array=energyml_array, + # array = workspace.read_external_array( + # energyml_array=energyml_array, + # root_obj=root_obj, + # path_in_root=path_in_root, + # ) + crs = get_crs_obj( + context_obj=root_obj, root_obj=root_obj, path_in_root=path_in_root, + workspace=workspace, ) + pief_list = get_path_in_external_with_path(obj=energyml_array) + # empty array + array = None + for pief_path_in_obj, pief in pief_list: + arr = workspace.read_array(proxy=crs or root_obj, path_in_external=pief) + if arr is not None: + array = arr if array is None else np.concatenate((array, arr)) + else: array = read_external_dataset_array( energyml_array=energyml_array, @@ -375,7 +394,7 @@ def read_constant_array( root_obj: Optional[Any] = None, path_in_root: Optional[str] = None, workspace: Optional[EnergymlWorkspace] = None, - sub_indices: List[int] = None, + sub_indices: Optional[List[int]] = None, ) -> List[Any]: """ Read a constant array ( BooleanConstantArray, DoubleConstantArray, FloatingPointConstantArray, IntegerConstantArray ...) @@ -486,10 +505,10 @@ def read_int_double_lattice_array( :param sub_indices: :return: """ - start_value = get_object_attribute_no_verif(energyml_array, "start_value") + # start_value = get_object_attribute_no_verif(energyml_array, "start_value") offset = get_object_attribute_no_verif(energyml_array, "offset") - result = [] + # result = [] # if len(offset) == 1: # pass @@ -660,7 +679,7 @@ def read_point3d_lattice_array( root_obj=root_obj, workspace=workspace, ) - except ObjectNotFoundNotError as e: + except ObjectNotFoundNotError: logging.error("No CRS found, not able to check zIncreasingDownward") zincreasing_downward = is_z_reversed(crs) diff --git a/energyml-utils/src/energyml/utils/data/mesh.py b/energyml-utils/src/energyml/utils/data/mesh.py index c3ad660..3ee9409 100644 --- a/energyml-utils/src/energyml/utils/data/mesh.py +++ b/energyml-utils/src/energyml/utils/data/mesh.py @@ -6,6 +6,7 @@ import os import re import sys +import numpy as np from dataclasses import dataclass, field from enum import Enum from io import BytesIO @@ -21,6 +22,7 @@ is_z_reversed, ) from ..epc import Epc, get_obj_identifier, gen_energyml_object_path +from ..epc_stream import EpcStreamReader from ..exception import ObjectNotFoundNotError from ..introspection import ( search_attribute_matching_name, @@ -497,7 +499,7 @@ def read_grid2d_representation( root_obj=energyml_object, workspace=workspace, ) - except ObjectNotFoundNotError as e: + except ObjectNotFoundNotError: pass points, indices = gen_surface_grid_geometry( @@ -588,29 +590,37 @@ def read_triangulated_set_representation( root_obj=energyml_object, workspace=workspace, ) - except ObjectNotFoundNotError as e: + except ObjectNotFoundNotError: pass point_list: List[Point] = [] for point_path, point_obj in search_attribute_matching_name_with_path(patch, "Geometry.Points"): - point_list = point_list + read_array( + _array = read_array( energyml_array=point_obj, 
root_obj=energyml_object, path_in_root=patch_path + "." + point_path, workspace=workspace, ) + if isinstance(_array, np.ndarray): + _array = _array.tolist() + + point_list = point_list + _array triangles_list: List[List[int]] = [] for ( triangles_path, triangles_obj, ) in search_attribute_matching_name_with_path(patch, "Triangles"): - triangles_list = triangles_list + read_array( + _array = read_array( energyml_array=triangles_obj, root_obj=energyml_object, path_in_root=patch_path + "." + triangles_path, workspace=workspace, ) + if isinstance(_array, np.ndarray): + _array = _array.tolist() + triangles_list = triangles_list + _array + triangles_list = list(map(lambda tr: [ti - point_offset for ti in tr], triangles_list)) if sub_indices is not None and len(sub_indices) > 0: new_triangles_list = [] @@ -1068,7 +1078,7 @@ def write_geojson_feature( out.write(b"{") # start geometry # "type": f"{geo_type_prefix}{geo_type.name}", out.write(f'"type": "{geo_type.name}", '.encode()) - out.write(f'"coordinates": '.encode()) + out.write('"coordinates": '.encode()) mins, maxs = _write_geojson_shape( out=out, geo_type=geo_type, @@ -1317,7 +1327,7 @@ def export_multiple_data( use_crs_displacement: bool = True, logger: Optional[Any] = None, ): - epc = Epc.read_file(epc_path) + epc = EpcStreamReader(epc_path) # with open(epc_path.replace(".epc", ".h5"), "rb") as fh: # buf = BytesIO(fh.read()) diff --git a/energyml-utils/src/energyml/utils/data/model.py b/energyml-utils/src/energyml/utils/data/model.py index 70c9aec..e798ce8 100644 --- a/energyml-utils/src/energyml/utils/data/model.py +++ b/energyml-utils/src/energyml/utils/data/model.py @@ -2,22 +2,24 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass from io import BytesIO -from typing import Optional, List, Any, Union +from typing import Optional, List, Union + +import numpy as np @dataclass class DatasetReader: - def read_array(self, source: Union[BytesIO, str], path_in_external_file: str) -> Optional[List[Any]]: + def read_array(self, source: Union[BytesIO, str], path_in_external_file: str) -> Optional[np.ndarray]: return None - def get_array_dimension(self, source: Union[BytesIO, str], path_in_external_file: str) -> Optional[List[Any]]: + def get_array_dimension(self, source: Union[BytesIO, str], path_in_external_file: str) -> Optional[List[int]]: return None -@dataclass -class ETPReader(DatasetReader): - def read_array(self, obj_uri: str, path_in_external_file: str) -> Optional[List[Any]]: - return None +# @dataclass +# class ETPReader(DatasetReader): +# def read_array(self, obj_uri: str, path_in_external_file: str) -> Optional[np.ndarray]: +# return None - def get_array_dimension(self, source: str, path_in_external_file: str) -> Optional[List[Any]]: - return None +# def get_array_dimension(self, source: str, path_in_external_file: str) -> Optional[np.ndarray]: +# return None diff --git a/energyml-utils/src/energyml/utils/epc.py b/energyml-utils/src/energyml/utils/epc.py index 5de714b..28e7c1b 100644 --- a/energyml-utils/src/energyml/utils/epc.py +++ b/energyml-utils/src/energyml/utils/epc.py @@ -8,6 +8,7 @@ import json import logging import os +from pathlib import Path import random import re import traceback @@ -29,13 +30,13 @@ Keywords1, TargetMode, ) -from .uri import parse_uri +import numpy as np +from .uri import Uri, parse_uri from xsdata.formats.dataclass.models.generics import DerivedElement from .constants import ( RELS_CONTENT_TYPE, RELS_FOLDER_NAME, - RGX_DOMAIN_VERSION, EpcExportVersion, RawFile, 
EPCRelsRelationshipType, @@ -47,12 +48,16 @@ OptimizedRegex, ) from .data.datasets_io import ( + HDF5FileReader, + HDF5FileWriter, read_external_dataset_array, ) from .exception import UnparsableFile from .introspection import ( get_class_from_content_type, + get_dor_obj_info, get_obj_type, + get_obj_uri, get_obj_usable_class, is_dor, search_attribute_matching_type, @@ -72,7 +77,6 @@ set_attribute_value, get_object_attribute, get_qualified_type_from_class, - get_class_fields, ) from .manager import get_class_pkg, get_class_pkg_version from .serialization import ( @@ -121,7 +125,7 @@ class Epc(EnergymlWorkspace): default_factory=list, ) - """ + """ Additional rels for objects. Key is the object (same as in @energyml_objects) and value is a list of Relationship. This can be used to link an HDF5 to an ExternalPartReference in resqml 2.0.1 Key is a value returned by @get_obj_identifier """ @@ -248,6 +252,10 @@ def export_file(self, path: Optional[str] = None) -> None: """ if path is None: path = self.epc_file_path + + # Ensure directory exists + if path is not None: + Path(path).parent.mkdir(parents=True, exist_ok=True) epc_io = self.export_io() with open(path, "wb") as f: f.write(epc_io.getbuffer()) @@ -317,6 +325,21 @@ def export_io(self) -> BytesIO: return zip_buffer + def get_obj_rels(self, obj: Any) -> Optional[Relationships]: + """ + Get the Relationships object for a given energyml object + :param obj: + :return: + """ + rels_path = gen_rels_path( + energyml_object=obj, + export_version=self.export_version, + ) + all_rels = self.compute_rels() + if rels_path in all_rels: + return all_rels[rels_path] + return None + def compute_rels(self) -> Dict[str, Relationships]: """ Returns a dict containing, for each object, the rels xml file path as key and the Relationships object as value @@ -382,7 +405,7 @@ def compute_rels(self) -> return obj_rels - def rels_to_h5_file(self, obj: any, h5_path: str) -> Relationship: + def rels_to_h5_file(self, obj: Any, h5_path: str) -> Relationship: """ Creates in the epc file, a Relationship (in the object .rels file) to link a h5 external file. Usually this function is used to link an ExternalPartReference to a h5 file. 
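# A minimal usage sketch for the rels/HDF5 helpers introduced around this hunk
# (illustrative only, not part of the patch; the file paths below are hypothetical).
from energyml.utils.epc import Epc

epc = Epc.read_file("wip/example.epc")
obj = epc.energyml_objects[0]
# Link an external HDF5 file to the object (adds an EXTERNAL_RESOURCE relationship)
rel = epc.rels_to_h5_file(obj, "wip/example_data.h5")
print(epc.get_h5_file_paths(obj))  # -> ["wip/example_data.h5"]
print(epc.get_obj_rels(obj))  # the object's Relationships, or None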
@@ -395,15 +418,40 @@ def rels_to_h5_file(self, obj: any, h5_path: str) -> Relationship: if obj_ident not in self.additional_rels: self.additional_rels[obj_ident] = [] - rel = Relationship( - target=h5_path, - type_value=EPCRelsRelationshipType.EXTERNAL_RESOURCE.get_type(), - id="Hdf5File", - target_mode=TargetMode.EXTERNAL.value, - ) + nb_current_file = len(self.get_h5_file_paths(obj)) + + rel = create_h5_external_relationship(h5_path=h5_path, current_idx=nb_current_file) self.additional_rels[obj_ident].append(rel) return rel + def get_h5_file_paths(self, obj: Any) -> List[str]: + """ + Get all HDF5 file paths referenced in the EPC file (from rels to external resources) + :return: list of HDF5 file paths + """ + is_uri = (isinstance(obj, str) and parse_uri(obj) is not None) or isinstance(obj, Uri) + if is_uri: + obj = self.get_object_by_identifier(obj) + + h5_paths = set() + + if isinstance(obj, str): + obj = self.get_object_by_identifier(obj) + for rels in self.additional_rels.get(get_obj_identifier(obj), []): + if rels.type_value == EPCRelsRelationshipType.EXTERNAL_RESOURCE.get_type(): + h5_paths.add(rels.target) + + if len(h5_paths) == 0: + # search if an h5 file has the same name as the epc file + epc_folder = self.get_epc_file_folder() + if epc_folder is not None and self.epc_file_path is not None: + epc_file_name = os.path.basename(self.epc_file_path) + epc_file_base, _ = os.path.splitext(epc_file_name) + possible_h5_path = os.path.join(epc_folder, epc_file_base + ".h5") + if os.path.exists(possible_h5_path): + h5_paths.add(possible_h5_path) + return list(h5_paths) + # -- Functions inherited from EnergymlWorkspace def get_object_as_dor(self, identifier: str, dor_qualified_type) -> Optional[Any]: @@ -426,20 +474,66 @@ def get_object_by_uuid(self, uuid: str) -> List[Any]: """ return list(filter(lambda o: get_obj_uuid(o) == uuid, self.energyml_objects)) - def get_object_by_identifier(self, identifier: str) -> Optional[Any]: + def get_object_by_identifier(self, identifier: Union[str, Uri]) -> Optional[Any]: """ Search an object by its identifier. 
- :param identifier: given by the function :func:`get_obj_identifier` + :param identifier: given by the function :func:`get_obj_identifier`, or a URI (or its str representation) :return: """ + is_uri = isinstance(identifier, Uri) or parse_uri(identifier) is not None + id_str = str(identifier) for o in self.energyml_objects: - if get_obj_identifier(o) == identifier: + if (get_obj_identifier(o) if not is_uri else str(get_obj_uri(o))) == id_str: return o return None def get_object(self, uuid: str, object_version: Optional[str]) -> Optional[Any]: return self.get_object_by_identifier(f"{uuid}.{object_version or ''}") + def add_object(self, obj: Any) -> bool: + """ + Add an energyml object to the EPC file + :param obj: + :return: + """ + self.energyml_objects.append(obj) + return True + + def remove_object(self, identifier: Union[str, Uri]) -> None: + """ + Remove an energyml object from the EPC file by its identifier + :param identifier: + :return: + """ + obj = self.get_object_by_identifier(identifier) + if obj is not None: + self.energyml_objects.remove(obj) + + def __len__(self) -> int: + return len(self.energyml_objects) + + def add_rels_for_object( + self, + obj: Any, + relationships: List[Relationship], + ) -> None: + """ + Add relationships to an object in the EPC file + :param obj: + :param relationships: + :return: + """ + + if isinstance(obj, str) or isinstance(obj, Uri): + obj = self.get_object_by_identifier(obj) + obj_ident = get_obj_identifier(obj) + else: + obj_ident = get_obj_identifier(obj) + if obj_ident not in self.additional_rels: + self.additional_rels[obj_ident] = [] + + self.additional_rels[obj_ident] = self.additional_rels[obj_ident] + relationships + def get_epc_file_folder(self) -> Optional[str]: if self.epc_file_path is not None and len(self.epc_file_path) > 0: folders_and_name = re.split(r"[\\/]", self.epc_file_path) @@ -456,6 +550,14 @@ def read_external_array( path_in_root: Optional[str] = None, use_epc_io_h5: bool = True, ) -> List[Any]: + """Read an external array from HDF5 files linked to the EPC file. + :param energyml_array: the energyml array object (e.g. 
FloatingPointExternalArray) + :param root_obj: the root object containing the energyml_array + :param path_in_root: the path in the root object to the energyml_array + :param use_epc_io_h5: if True, also use the in-memory HDF5 files stored in epc.h5_io_files + + :return: the array read from the external datasets + """ sources = [] if self is not None and use_epc_io_h5 and self.h5_io_files is not None and len(self.h5_io_files): sources = sources + self.h5_io_files @@ -468,6 +570,67 @@ def read_external_array( epc=self, ) + def read_array(self, proxy: Union[str, Uri, Any], path_in_external: str) -> Optional[np.ndarray]: + obj = proxy + if isinstance(proxy, str) or isinstance(proxy, Uri): + obj = self.get_object_by_identifier(proxy) + + h5_path = self.get_h5_file_paths(obj) + h5_reader = HDF5FileReader() + + if h5_path is None or len(h5_path) == 0: + for h5p in self.external_files_path: + try: + return h5_reader.read_array(source=h5p, path_in_external_file=path_in_external) + except Exception: + pass + # logging.error(f"Failed to read HDF5 dataset from {h5p}: {e}") + else: + for h5p in h5_path: + try: + return h5_reader.read_array(source=h5p, path_in_external_file=path_in_external) + except Exception: + pass + # logging.error(f"Failed to read HDF5 dataset from {h5p}: {e}") + return None + + def write_array( + self, proxy: Union[str, Uri, Any], path_in_external: str, array: Any, in_memory: bool = False + ) -> bool: + """ + Write a dataset in the HDF5 file linked to the proxy object. + :param proxy: the object or its identifier + :param path_in_external: the path in the external file + :param array: the data to write + :param in_memory: if True, write in the in-memory HDF5 files (epc.h5_io_files) + + :return: True if successful + """ + obj = proxy + if isinstance(proxy, str) or isinstance(proxy, Uri): + obj = self.get_object_by_identifier(proxy) + + h5_path = self.get_h5_file_paths(obj) + h5_writer = HDF5FileWriter() + + if in_memory or h5_path is None or len(h5_path) == 0: + # iterate with a distinct name so h5_path (the resolved list) is not clobbered + for h5p in self.external_files_path: + try: + h5_writer.write_array(target=h5p, path_in_external_file=path_in_external, array=array) + return True + except Exception: + pass + # logging.error(f"Failed to write HDF5 dataset to {h5p}: {e}") + + for h5p in h5_path: + try: + h5_writer.write_array(target=h5p, path_in_external_file=path_in_external, array=array) + return True + except Exception: + pass + # logging.error(f"Failed to write HDF5 dataset to {h5p}: {e}") + return False + # Class methods @classmethod @@ -524,11 +687,10 @@ def read_stream(cls, epc_file_io: BytesIO): # returns an Epc instance ov_obj = ov_obj.value path_to_obj[ov_path] = ov_obj obj_list.append(ov_obj) - except Exception as e: + except Exception: logging.error(traceback.format_exc()) logging.error( - f"Epc.@read_stream failed to parse file {ov_path} for content-type: {ov_ct} => {get_class_from_content_type(ov_ct)}\n\n", - get_class_from_content_type(ov_ct), + f"Epc.@read_stream failed to parse file {ov_path} for content-type: {ov_ct} => {str(get_class_from_content_type(ov_ct))}\n\n", ) try: logging.debug(epc_file.read(ov_path)) @@ -551,7 +713,7 @@ def read_stream(cls, epc_file_io: BytesIO): # returns an Epc instance content=BytesIO(epc_file.read(f_info.filename)), ) ) - except IOError as e: + except IOError: logging.error(traceback.format_exc()) elif f_info.filename != "_rels/.rels": # CoreProperties rels file # RELS FILES READING START @@ -608,6 +770,18 @@ def read_stream(cls, epc_file_io: BytesIO): # returns an Epc instance 
return None + def dumps_epc_content_and_files_lists(self) -> str: + """ + Dumps the EPC content and files lists for debugging purposes. + :return: A string representation of the EPC content and files lists. + """ + content_list = [ + f"{get_obj_identifier(obj)} ({get_qualified_type_from_class(type(obj))})" for obj in self.energyml_objects + ] + raw_files_list = [raw_file.path for raw_file in self.raw_files] + + return "EPC Content:\n" + "\n".join(content_list) + "\n\nRaw Files:\n" + "\n".join(raw_files_list) + # ______ __ ____ __ _ # / ____/___ ___ _________ ___ ______ ___ / / / __/_ ______ _____/ /_(_)___ ____ _____ @@ -883,18 +1057,19 @@ def gen_energyml_object_path( energyml_object = read_energyml_xml_str(energyml_object) obj_type = get_object_type_for_file_path_from_class(energyml_object.__class__) + # logging.debug("is_dor: ", str(is_dor(energyml_object)), "object type : " + str(obj_type)) - pkg = get_class_pkg(energyml_object) - pkg_version = get_class_pkg_version(energyml_object) - object_version = get_obj_version(energyml_object) - uuid = get_obj_uuid(energyml_object) - - # if object_version is None: - # object_version = "0" + if is_dor(energyml_object): + uuid, pkg, pkg_version, obj_cls, object_version = get_dor_obj_info(energyml_object) + obj_type = get_object_type_for_file_path_from_class(obj_cls) + else: + pkg = get_class_pkg(energyml_object) + pkg_version = get_class_pkg_version(energyml_object) + object_version = get_obj_version(energyml_object) + uuid = get_obj_uuid(energyml_object) if export_version == EpcExportVersion.EXPANDED: return f"namespace_{pkg}{pkg_version.replace('.', '')}/{(('version_' + object_version + '/') if object_version is not None and len(object_version) > 0 else '')}{obj_type}_{uuid}.xml" - # return f"namespace_{pkg}{pkg_version.replace('.', '')}/{uuid}{(('/version_' + object_version) if object_version is not None else '')}/{obj_type}_{uuid}.xml" else: return obj_type + "_" + uuid + ".xml" @@ -929,6 +1104,9 @@ def gen_rels_path( return f"{obj_folder}{RELS_FOLDER_NAME}/{obj_file_name}.rels" +# def gen_rels_path_from_dor(dor: Any, export_version: EpcExportVersion = EpcExportVersion.CLASSIC) -> str: + + def get_epc_content_type_path( export_version: EpcExportVersion = EpcExportVersion.CLASSIC, ) -> str: @@ -938,3 +1116,17 @@ def get_epc_content_type_path( :return: """ return "[Content_Types].xml" + + +def create_h5_external_relationship(h5_path: str, current_idx: int = 0) -> Relationship: + """ + Create a Relationship object to link an external HDF5 file. + :param h5_path: + :return: + """ + return Relationship( + target=h5_path, + type_value=EPCRelsRelationshipType.EXTERNAL_RESOURCE.get_type(), + id=f"Hdf5File{current_idx + 1 if current_idx > 0 else ''}", + target_mode=TargetMode.EXTERNAL, + ) diff --git a/energyml-utils/src/energyml/utils/epc_stream.py b/energyml-utils/src/energyml/utils/epc_stream.py index 811a7d1..721f9d6 100644 --- a/energyml-utils/src/energyml/utils/epc_stream.py +++ b/energyml-utils/src/energyml/utils/epc_stream.py @@ -8,28 +8,35 @@ content into memory at once. 
""" +import tempfile +import shutil import logging import os import zipfile from contextlib import contextmanager -from dataclasses import dataclass, field -from io import BytesIO +from dataclasses import dataclass from pathlib import Path -from typing import Dict, List, Optional, Any, Iterator, Set, Union, Tuple +from typing import Dict, List, Optional, Any, Iterator, Union, Tuple from weakref import WeakValueDictionary -from energyml.opc.opc import Types, Override, CoreProperties -from .constants import OptimizedRegex, EpcExportVersion -from .epc import Epc, gen_energyml_object_path -from .exception import UnparsableFile +from energyml.opc.opc import Types, Override, CoreProperties, Relationships, Relationship +from energyml.utils.data.datasets_io import HDF5FileReader, HDF5FileWriter +from energyml.utils.uri import Uri, parse_uri +from energyml.utils.workspace import EnergymlWorkspace +import numpy as np +from .constants import EPCRelsRelationshipType, OptimizedRegex, EpcExportVersion +from .epc import Epc, gen_energyml_object_path, gen_rels_path, get_epc_content_type_path from .introspection import ( get_class_from_content_type, + get_obj_content_type, get_obj_identifier, get_obj_uuid, - get_obj_version, get_object_type_for_file_path_from_class, + get_direct_dor_list, + get_obj_type, + get_obj_usable_class, ) -from .serialization import read_energyml_xml_bytes +from .serialization import read_energyml_xml_bytes, serialize_xml from .xml import is_energyml_content_type @@ -72,7 +79,7 @@ def memory_efficiency(self) -> float: return (1 - (self.loaded_objects / self.total_objects)) * 100 if self.total_objects > 0 else 100.0 -class EpcStreamReader: +class EpcStreamReader(EnergymlWorkspace): """ Memory-efficient EPC file reader with lazy loading and smart caching. @@ -101,6 +108,8 @@ def __init__( cache_size: int = 100, validate_on_load: bool = True, preload_metadata: bool = True, + export_version: EpcExportVersion = EpcExportVersion.CLASSIC, + force_h5_path: Optional[str] = None, ): """ Initialize the EPC stream reader. @@ -110,18 +119,38 @@ def __init__( cache_size: Maximum number of objects to keep in memory cache validate_on_load: Whether to validate objects when loading preload_metadata: Whether to preload all object metadata + export_version: EPC packaging version (CLASSIC or EXPANDED) + force_h5_path: Optional forced HDF5 file path for external resources. If set, all arrays will be read/written from/to this path. """ self.epc_file_path = Path(epc_file_path) self.cache_size = cache_size self.validate_on_load = validate_on_load + self.force_h5_path = force_h5_path + + is_new_file = False # Validate file exists and is readable if not self.epc_file_path.exists(): - raise FileNotFoundError(f"EPC file not found: {epc_file_path}") + logging.info(f"EPC file not found: {epc_file_path}. Creating a new empty EPC file.") + self._create_empty_epc() + is_new_file = True + # raise FileNotFoundError(f"EPC file not found: {epc_file_path}") if not zipfile.is_zipfile(self.epc_file_path): raise ValueError(f"File is not a valid ZIP/EPC file: {epc_file_path}") + # Check if the ZIP file has the required EPC structure + if not is_new_file: + try: + with zipfile.ZipFile(self.epc_file_path, "r") as zf: + content_types_path = get_epc_content_type_path() + if content_types_path not in zf.namelist(): + logging.info(f"EPC file is missing required structure. Initializing empty EPC file.") + self._create_empty_epc() + is_new_file = True + except Exception as e: + logging.warning(f"Failed to check EPC structure: {e}. 
Reinitializing.") + # Object metadata storage self._metadata: Dict[str, EpcObjectMetadata] = {} # identifier -> metadata self._uuid_index: Dict[str, List[str]] = {} # uuid -> list of identifiers @@ -139,14 +168,33 @@ def __init__( self._zip_file: Optional[zipfile.ZipFile] = None # EPC export version detection - self.export_version: EpcExportVersion = EpcExportVersion.CLASSIC # Default + self.export_version: EpcExportVersion = export_version or EpcExportVersion.CLASSIC # Default + + # Additional rels management + self.additional_rels: Dict[str, List[Relationship]] = {} # Initialize by loading metadata - if preload_metadata: + if not is_new_file and preload_metadata: self._load_metadata() # Detect EPC version after loading metadata self.export_version = self._detect_epc_version() + def _create_empty_epc(self) -> None: + """Create an empty EPC file structure.""" + # Ensure directory exists + self.epc_file_path.parent.mkdir(parents=True, exist_ok=True) + + with zipfile.ZipFile(self.epc_file_path, "w") as zf: + # Create [Content_Types].xml + content_types = Types() + content_types_xml = serialize_xml(content_types) + zf.writestr(get_epc_content_type_path(), content_types_xml) + + # Create _rels/.rels + rels = Relationships() + rels_xml = serialize_xml(rels) + zf.writestr("_rels/.rels", rels_xml) + def _load_metadata(self) -> None: """Load object metadata from [Content_Types].xml without loading actual objects.""" try: @@ -181,7 +229,7 @@ def _get_zip_file(self) -> Iterator[zipfile.ZipFile]: def _read_content_types(self, zf: zipfile.ZipFile) -> Types: """Read and parse [Content_Types].xml file.""" - content_types_path = "[Content_Types].xml" + content_types_path = get_epc_content_type_path() try: content_data = zf.read(content_types_path) @@ -256,13 +304,17 @@ def _extract_object_info_fast( version = None version_patterns = [ r'object[Vv]ersion["\']?\s*[:=]\s*["\']([^"\']+)', - r'version["\']?\s*[:=]\s*["\']([^"\']+)', ] for pattern in version_patterns: - version_match = OptimizedRegex.SCHEMA_VERSION.search(chunk_str) + import re + + version_match = re.search(pattern, chunk_str) if version_match: version = version_match.group(1) + # Ensure version is a string + if not isinstance(version, str): + version = str(version) break # Extract object type from content type @@ -335,7 +387,7 @@ def _detect_epc_version(self) -> EpcExportVersion: logging.warning(f"Failed to detect EPC version, defaulting to CLASSIC: {e}") return EpcExportVersion.CLASSIC - def get_object_by_identifier(self, identifier: str) -> Optional[Any]: + def get_object_by_identifier(self, identifier: Union[str, Uri]) -> Optional[Any]: """ Get object by its identifier with smart caching. @@ -345,9 +397,15 @@ def get_object_by_identifier(self, identifier: str) -> Optional[Any]: Returns: The requested object or None if not found """ + is_uri = isinstance(identifier, Uri) or parse_uri(identifier) is not None + if is_uri: + uri = parse_uri(identifier) if isinstance(identifier, str) else identifier + assert uri is not None and uri.uuid is not None + identifier = uri.uuid + "." 
+ (uri.version or "") + # Check cache first if identifier in self._object_cache: - self._update_access_order(identifier) + self._update_access_order(identifier) # type: ignore self.stats.cache_hits += 1 return self._object_cache[identifier] @@ -367,8 +425,14 @@ def get_object_by_identifier(self, identifier: str) -> Optional[Any]: return obj - def _load_object(self, identifier: str) -> Optional[Any]: + def _load_object(self, identifier: Union[str, Uri]) -> Optional[Any]: """Load object from EPC file.""" + is_uri = isinstance(identifier, Uri) or parse_uri(identifier) is not None + if is_uri: + uri = parse_uri(identifier) if isinstance(identifier, str) else identifier + assert uri is not None and uri.uuid is not None + identifier = uri.uuid + "." + (uri.version or "") + assert isinstance(identifier, str) metadata = self._metadata.get(identifier) if not metadata: return None @@ -399,8 +463,16 @@ def _validate_object(self, obj: Any, metadata: EpcObjectMetadata) -> None: except Exception as e: logging.debug(f"Validation failed for {metadata.identifier}: {e}") - def _add_to_cache(self, identifier: str, obj: Any) -> None: + def _add_to_cache(self, identifier: Union[str, Uri], obj: Any) -> None: """Add object to cache with LRU eviction.""" + is_uri = isinstance(identifier, Uri) or parse_uri(identifier) is not None + if is_uri: + uri = parse_uri(identifier) if isinstance(identifier, str) else identifier + assert uri is not None and uri.uuid is not None + identifier = uri.uuid + "." + (uri.version or "") + + assert isinstance(identifier, str) + # Remove from access order if already present if identifier in self._access_order: self._access_order.remove(identifier) @@ -527,6 +599,116 @@ def to_epc(self, load_all: bool = False) -> Epc: return epc + def get_obj_rels(self, obj: Union[str, Uri, Any]) -> List[Relationship]: + """ + Get all relationships for a given object. 
+ :param obj: the object or its identifier/URI + :return: list of Relationship objects + """ + rels = [] + + # read rels from EPC file + if isinstance(obj, (str, Uri)): + obj = self.get_object_by_identifier(obj) + with zipfile.ZipFile(self.epc_file_path, "r") as zf: + rels_path = gen_rels_path(obj, self.export_version) + try: + rels_data = zf.read(rels_path) + self.stats.bytes_read += len(rels_data) + relationships = read_energyml_xml_bytes(rels_data, Relationships) + rels.extend(relationships.relationship) + except KeyError: + # No rels file found for this object + pass + + return rels + + def get_h5_file_paths(self, obj: Union[str, Uri, Any]) -> List[str]: + """ + Get all HDF5 file paths referenced in the EPC file (from rels to external resources) + :param obj: the object or its identifier/URI + :return: list of HDF5 file paths + """ + if self.force_h5_path is not None: + return [self.force_h5_path] + h5_paths = set() + + if isinstance(obj, (str, Uri)): + obj = self.get_object_by_identifier(obj) + + for rels in self.additional_rels.get(get_obj_identifier(obj), []): + if rels.type_value == EPCRelsRelationshipType.EXTERNAL_RESOURCE.get_type(): + h5_paths.add(rels.target) + + if len(h5_paths) == 0: + # search if an h5 file has the same name as the epc file + epc_folder = os.path.dirname(self.epc_file_path) + if epc_folder is not None and self.epc_file_path is not None: + epc_file_name = os.path.basename(self.epc_file_path) + epc_file_base, _ = os.path.splitext(epc_file_name) + possible_h5_path = os.path.join(epc_folder, epc_file_base + ".h5") + if os.path.exists(possible_h5_path): + h5_paths.add(possible_h5_path) + return list(h5_paths) + + def read_array(self, proxy: Union[str, Uri, Any], path_in_external: str) -> Optional[np.ndarray]: + """ + Read a dataset from the HDF5 file linked to the proxy object. + :param proxy: the object or its identifier + :param path_in_external: the path in the external HDF5 file + :return: the dataset as a numpy array + """ + # Resolve proxy to object + if isinstance(proxy, (str, Uri)): + obj = self.get_object_by_identifier(proxy) + else: + obj = proxy + + h5_path = self.get_h5_file_paths(obj) + + h5_reader = HDF5FileReader() + + if h5_path is None or len(h5_path) == 0: + raise ValueError("No HDF5 file paths found for the given proxy object.") + else: + for h5p in h5_path: + # TODO: handle different types of files + try: + return h5_reader.read_array(source=h5p, path_in_external_file=path_in_external) + except Exception: + pass + # logging.error(f"Failed to read HDF5 dataset from {h5p}: {e}") + + def write_array(self, proxy: Union[str, Uri, Any], path_in_external: str, array: np.ndarray) -> bool: + """ + Write a dataset to the HDF5 file linked to the proxy object. 
+    def write_array(self, proxy: Union[str, Uri, Any], path_in_external: str, array: np.ndarray) -> bool:
+        """
+        Write a dataset to the HDF5 file linked to the proxy object.
+        :param proxy: the object or its identifier
+        :param path_in_external: the path in the external HDF5 file
+        :param array: the numpy array to write
+
+        :return: True if successful
+        """
+        # Resolve proxy to object
+        if isinstance(proxy, (str, Uri)):
+            obj = self.get_object_by_identifier(proxy)
+        else:
+            obj = proxy
+
+        h5_paths = self.get_h5_file_paths(obj)
+
+        h5_writer = HDF5FileWriter()
+
+        if h5_paths is None or len(h5_paths) == 0:
+            raise ValueError("No HDF5 file paths found for the given proxy object.")
+        for h5p in h5_paths:
+            try:
+                h5_writer.write_array(target=h5p, path_in_external_file=path_in_external, array=array)
+                return True
+            except Exception as e:
+                logging.error(f"Failed to write HDF5 dataset to {h5p}: {e}")
+        return False

    def validate_all_objects(self, fast_mode: bool = True) -> Dict[str, List[str]]:
        """
        Validate all objects in the EPC file.
@@ -591,20 +773,20 @@ def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit with cleanup."""
        self.clear_cache()

-    def add_object(self, obj: Any, file_path: Optional[str] = None) -> str:
+    def add_object(self, obj: Any, file_path: Optional[str] = None, replace_if_exists: bool = True) -> str:
        """
        Add a new object to the EPC file and update caches.

        Args:
            obj: The EnergyML object to add
-            object_type: The type of the object (e.g., 'BoundaryFeature')
            file_path: Optional custom file path, auto-generated if not provided
+            replace_if_exists: If True, replace the object if it already exists. If False, raise ValueError.

        Returns:
            The identifier of the added object

        Raises:
-            ValueError: If object is invalid or already exists
+            ValueError: If object is invalid or already exists (when replace_if_exists=False)
            RuntimeError: If file operations fail
        """
        identifier = None
@@ -619,10 +801,21 @@ def add_object(self, obj: Any, file_path: Optional[str] = None) -> str:
            raise ValueError("Object must have a valid UUID")

        version = identifier[len(uuid) + 1 :] if identifier and "." in identifier else None
+        # Ensure version is treated as a string, not an integer
+        if version is not None and not isinstance(version, str):
+            version = str(version)
+
        object_type = get_object_type_for_file_path_from_class(obj)

        if identifier in self._metadata:
-            raise ValueError(f"Object with identifier {identifier} already exists. use update_object() instead.")
+            if replace_if_exists:
+                # Remove the existing object first
+                logging.info(f"Replacing existing object {identifier}")
+                self.remove_object(identifier)
+            else:
+                raise ValueError(
+                    f"Object with identifier {identifier} already exists. Use update_object() or set replace_if_exists=True."
+                )

        # Generate file path if not provided
        file_path = gen_energyml_object_path(obj, self.export_version)

        print(f"Generated file path: {file_path} for export version: {self.export_version}")

        # Determine content type based on object type
-        content_type = self._get_content_type_for_object_type(object_type)
+        content_type = get_obj_content_type(obj)

        # Create metadata
        metadata = EpcObjectMetadata(
@@ -674,7 +867,7 @@ def add_object(self, obj: Any, file_path: Optional[str] = None) -> str:
            self._rollback_add_object(identifier)
            raise RuntimeError(f"Failed to add object to EPC: {e}")

-    def remove_object(self, identifier: str) -> bool:
+    def remove_object(self, identifier: Union[str, Uri]) -> bool:
        """
        Remove an object (or all versions of an object) from the EPC file and update caches.
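[Usage sketch, not part of the diff: the new replace semantics of add_object() combined with a write/read round trip. `obj` is assumed to be an already-built EnergyML object; file names are hypothetical.]

# Sketch only: EpcStreamReader defines __exit__ above, so it is assumed
# to be usable as a context manager.
import numpy as np
from energyml.utils.epc_stream import EpcStreamReader

with EpcStreamReader("model.epc") as reader:
    # Default behaviour now replaces a same-identifier object instead of raising
    identifier = reader.add_object(obj, replace_if_exists=True)

    data = np.arange(12, dtype=np.float64).reshape(3, 4)
    if reader.write_array(identifier, "/RESQML/points", data):
        assert reader.read_array(identifier, "/RESQML/points").shape == (3, 4)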
@@ -690,6 +883,13 @@ def remove_object(self, identifier: str) -> bool:
            RuntimeError: If file operations fail
        """
        try:
+            is_uri = isinstance(identifier, Uri) or parse_uri(identifier) is not None
+            if is_uri:
+                uri = parse_uri(identifier) if isinstance(identifier, str) else identifier
+                assert uri is not None and uri.uuid is not None
+                identifier = uri.uuid + "." + (uri.version or "")
+            assert isinstance(identifier, str)
+
            if identifier not in self._metadata:
                # Check if identifier is a UUID only (should remove all versions)
                if identifier in self._uuid_index:
@@ -720,7 +920,11 @@ def _remove_single_object(self, identifier: str) -> bool:

        metadata = self._metadata[identifier]

-        # Remove from cache first
+        # IMPORTANT: Remove from file FIRST (before clearing cache/metadata)
+        # because _remove_object_from_file needs to load the object to access its DORs
+        self._remove_object_from_file(metadata)
+
+        # Now remove from cache
        if identifier in self._object_cache:
            del self._object_cache[identifier]
@@ -743,12 +947,9 @@ def _remove_single_object(self, identifier: str) -> bool:
            if not self._type_index[object_type]:
                del self._type_index[object_type]

-        # Remove from metadata
+        # Remove from metadata (do this last)
        del self._metadata[identifier]

-        # Remove from file
-        self._remove_object_from_file(metadata)
-
        # Update stats
        self.stats.total_objects -= 1
        if self.stats.loaded_objects > 0:
@@ -788,27 +989,305 @@ def update_object(self, obj: Any) -> str:
            logging.error(f"Failed to update object {identifier}: {e}")
            raise RuntimeError(f"Failed to update object in EPC: {e}")

-    def _get_content_type_for_object_type(self, object_type: str) -> str:
-        """Get appropriate content type for object type."""
-        # Map common object types to content types
-        content_type_map = {
-            "BoundaryFeature": "application/x-resqml+xml;version=2.2;type=BoundaryFeature",
-            "PropertyKind": "application/x-eml+xml;version=2.3;type=PropertyKind",
-            "LocalDepth3dCrs": "application/x-resqml+xml;version=2.2;type=LocalDepth3dCrs",
-            "PolylineSetRepresentation": "application/x-resqml+xml;version=2.2;type=PolylineSetRepresentation",
-            "PointSetRepresentation": "application/x-resqml+xml;version=2.2;type=PointSetRepresentation",
-        }
+    def add_rels_for_object(self, identifier: Union[str, Uri, Any], relationships: List[Relationship]) -> None:
+        """
+        Add additional relationships for a specific object.
+
+        Args:
+            identifier: The identifier of the object, can be str, Uri, or the object itself
+            relationships: List of Relationship objects to add
+        """
+        is_uri = isinstance(identifier, Uri) or (isinstance(identifier, str) and parse_uri(identifier) is not None)
+        object_instance = None
+        if is_uri:
+            uri = parse_uri(identifier) if isinstance(identifier, str) else identifier
+            assert uri is not None and uri.uuid is not None
+            identifier = uri.uuid + "." + (uri.version or "")
+            object_instance = self.get_object_by_identifier(identifier)
+        elif not isinstance(identifier, str):
+            identifier = get_obj_identifier(identifier)
+            object_instance = self.get_object_by_identifier(identifier)
+        else:
+            object_instance = self.get_object_by_identifier(identifier)
+
+        assert isinstance(identifier, str)
+
+        if identifier not in self.additional_rels:
+            self.additional_rels[identifier] = []
+
+        self.additional_rels[identifier].extend(relationships)
+        if len(self.additional_rels[identifier]) > 0:
+            # Create temporary file for updated EPC
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".epc") as temp_file:
+                temp_path = temp_file.name
+            # Update the .rels file for this object by updating the rels file in the EPC
+            with (
+                zipfile.ZipFile(self.epc_file_path, "r") as source_zip,
+                zipfile.ZipFile(temp_path, "w") as target_zip,
+            ):
+                # copy all files except the rels file to be updated
+                for item in source_zip.infolist():
+                    if item.filename != gen_rels_path(object_instance, self.export_version):
+                        buffer = source_zip.read(item.filename)
+                        target_zip.writestr(item, buffer)
+
+                self._update_existing_rels_files(
+                    Relationships(relationship=relationships),
+                    gen_rels_path(object_instance, self.export_version),
+                    source_zip,
+                    target_zip,
+                )
+            shutil.move(temp_path, self.epc_file_path)

-        return content_type_map.get(object_type, f"application/x-resqml+xml;version=2.2;type={object_type}")
+    def _compute_object_rels(self, obj: Any, obj_identifier: str) -> List[Relationship]:
+        """
+        Compute relationships for a given object (SOURCE relationships).
+        This object references other objects through DORs.

-    def _add_object_to_file(self, obj: Any, metadata: EpcObjectMetadata) -> None:
-        """Add object to the EPC file by updating the ZIP archive."""
-        import tempfile
-        import shutil
+        Args:
+            obj: The EnergyML object
+            obj_identifier: The identifier of the object

-        # Serialize object to XML
-        from .serialization import serialize_xml
+        Returns:
+            List of Relationship objects for this object's .rels file
+        """
+        rels = []
+
+        # Get all DORs (Data Object References) in this object
+        direct_dors = get_direct_dor_list(obj)
+
+        for dor in direct_dors:
+            try:
+                target_identifier = get_obj_identifier(dor)
+                target_metadata = self._metadata.get(target_identifier)
+                # Target the referenced object's part (consistent with rebuild_all_rels),
+                # falling back to its generated path when it is not in this EPC
+                target_path = (
+                    target_metadata.file_path
+                    if target_metadata is not None
+                    else gen_energyml_object_path(dor, self.export_version)
+                )
+
+                # Create SOURCE relationship (this object -> target object)
+                rel = Relationship(
+                    target=target_path,
+                    type_value=EPCRelsRelationshipType.SOURCE_OBJECT.get_type(),
+                    id=f"_{obj_identifier}_{get_obj_type(get_obj_usable_class(dor))}_{target_identifier}",
+                )
+                rels.append(rel)
+            except Exception as e:
+                logging.warning(f"Failed to create relationship for DOR in {obj_identifier}: {e}")
+
+        return rels
+    def _get_objects_referencing(self, target_identifier: str) -> List[Tuple[str, Any]]:
+        """
+        Find all objects that reference the target object.
+
+        Args:
+            target_identifier: The identifier of the target object
+
+        Returns:
+            List of tuples (identifier, object) of objects that reference the target
+        """
+        referencing_objects = []
+
+        # We need to check all objects in the EPC to find those that reference our target
+        for identifier in self._metadata:
+            # Load the object to check its DORs
+            obj = self.get_object_by_identifier(identifier)
+            if obj is not None:
+                # Check if this object references our target
+                direct_dors = get_direct_dor_list(obj)
+                for dor in direct_dors:
+                    try:
+                        dor_identifier = get_obj_identifier(dor)
+                        if dor_identifier == target_identifier:
+                            referencing_objects.append((identifier, obj))
+                            break  # Found a reference, no need to check other DORs in this object
+                    except Exception:
+                        continue
+
+        return referencing_objects
+
+    def _update_existing_rels_files(
+        self, rels: Relationships, rel_path: str, source_zip: zipfile.ZipFile, target_zip: zipfile.ZipFile
+    ) -> None:
+        """Merge new relationships with existing .rels, reading from source and writing to target ZIP.
+
+        Args:
+            rels: New Relationships to add
+            rel_path: Path to the .rels file
+            source_zip: ZIP to read existing rels from
+            target_zip: ZIP to write updated rels to
+        """
+        # print("@ Updating rels file:", rel_path)
+        existing_relationships = []
+        try:
+            if rel_path in source_zip.namelist():
+                rels_data = source_zip.read(rel_path)
+                existing_rels = read_energyml_xml_bytes(rels_data, Relationships)
+                if existing_rels and existing_rels.relationship:
+                    existing_relationships = list(existing_rels.relationship)
+        except Exception as e:
+            logging.debug(f"Could not read existing rels for {rel_path}: {e}")
+
+        for new_rel in rels.relationship:
+            rel_exists = any(
+                r.target == new_rel.target and r.type_value == new_rel.type_value for r in existing_relationships
+            )
+            cpt = 0
+            new_rel_id = new_rel.id
+            while any(r.id == new_rel_id for r in existing_relationships):
+                new_rel_id = f"{new_rel.id}_{cpt}"
+                cpt += 1
+            if new_rel_id != new_rel.id:
+                new_rel.id = new_rel_id
+            if not rel_exists:
+                existing_relationships.append(new_rel)
+
+        if existing_relationships:
+            updated_rels = Relationships(relationship=existing_relationships)
+            updated_rels_xml = serialize_xml(updated_rels)
+            target_zip.writestr(rel_path, updated_rels_xml)
+
+    def _update_rels_files(
+        self,
+        obj: Any,
+        metadata: EpcObjectMetadata,
+        source_zip: zipfile.ZipFile,
+        target_zip: zipfile.ZipFile,
+    ) -> List[str]:
+        """
+        Update all necessary .rels files when adding/updating an object.
+
+        This includes:
+        1. The object's own .rels file (for objects it references)
+        2. The .rels files of objects that now reference this object (DESTINATION relationships)
+
+        Args:
+            obj: The object being added/updated
+            metadata: Metadata for the object
+            source_zip: Source ZIP file to read existing rels from
+            target_zip: Target ZIP file to write updated rels to
+
+        Returns:
+            List of updated .rels file paths
+        """
+        obj_identifier = metadata.identifier
+        updated_rels_paths = []
+        if not obj_identifier:
+            logging.warning("Object identifier is None, skipping rels update")
+            return updated_rels_paths
+
+        # 1. Create/update the object's own .rels file
+        obj_rels_path = gen_rels_path(obj, self.export_version)
+        obj_relationships = self._compute_object_rels(obj, obj_identifier)
+
+        if obj_relationships:
+            self._update_existing_rels_files(
+                Relationships(relationship=obj_relationships), obj_rels_path, source_zip, target_zip
+            )
+            updated_rels_paths.append(obj_rels_path)
+
+        # 2. Update .rels files of objects referenced by this object
+        # These objects need DESTINATION relationships pointing to our object
+        direct_dors = get_direct_dor_list(obj)
+
+        logging.debug(f"Updating rels for object {obj_identifier}, found {len(direct_dors)} direct DORs")
+
+        for dor in direct_dors:
+            try:
+                target_rels_path = gen_rels_path(dor, self.export_version)
+                target_identifier = get_obj_identifier(dor)
+
+                # Add DESTINATION relationship from target to our object
+                dest_rel = Relationship(
+                    target=metadata.file_path,
+                    type_value=EPCRelsRelationshipType.DESTINATION_OBJECT.get_type(),
+                    id=f"_{target_identifier}_{get_obj_type(get_obj_usable_class(obj))}_{obj_identifier}",
+                )
+
+                self._update_existing_rels_files(
+                    Relationships(relationship=[dest_rel]), target_rels_path, source_zip, target_zip
+                )
+                updated_rels_paths.append(target_rels_path)
+
+            except Exception as e:
+                logging.warning(f"Failed to update rels for referenced object: {e}")
+        return updated_rels_paths
+
+    def _remove_rels_files(
+        self, obj: Any, metadata: EpcObjectMetadata, source_zip: zipfile.ZipFile, target_zip: zipfile.ZipFile
+    ) -> None:
+        """
+        Remove/update .rels files when removing an object.
+
+        This includes:
+        1. Removing the object's own .rels file
+        2. Removing DESTINATION relationships from objects that this object referenced
+
+        Args:
+            obj: The object being removed
+            metadata: Metadata for the object
+            source_zip: Source ZIP file to read existing rels from
+            target_zip: Target ZIP file to write updated rels to
+        """
+        # obj_identifier = metadata.identifier
+
+        # 1. The object's own .rels file will be automatically excluded by not copying it
+        # obj_rels_path = gen_rels_path(obj, self.export_version)
+
+        # 2. Update .rels files of objects that were referenced by this object
+        # Remove DESTINATION relationships that pointed to our object
+        direct_dors = get_direct_dor_list(obj)
+
+        for dor in direct_dors:
+            try:
+                target_identifier = get_obj_identifier(dor)
+
+                # Check if target object exists
+                if target_identifier not in self._metadata:
+                    continue
+
+                target_obj = self.get_object_by_identifier(target_identifier)
+                if target_obj is None:
+                    continue
+
+                target_rels_path = gen_rels_path(target_obj, self.export_version)
+
+                # Read existing rels for the target object
+                existing_relationships = []
+                try:
+                    if target_rels_path in source_zip.namelist():
+                        rels_data = source_zip.read(target_rels_path)
+                        existing_rels = read_energyml_xml_bytes(rels_data, Relationships)
+                        if existing_rels and existing_rels.relationship:
+                            existing_relationships = list(existing_rels.relationship)
+                except Exception as e:
+                    logging.debug(f"Could not read existing rels for {target_identifier}: {e}")
+
+                # Remove DESTINATION relationship that pointed to our object
+                updated_relationships = [
+                    r
+                    for r in existing_relationships
+                    if not (
+                        r.target == metadata.file_path
+                        and r.type_value == EPCRelsRelationshipType.DESTINATION_OBJECT.get_type()
+                    )
+                ]
+
+                # Write updated rels file (or skip if no relationships left)
+                if updated_relationships:
+                    updated_rels = Relationships(relationship=updated_relationships)
+                    updated_rels_xml = serialize_xml(updated_rels)
+                    target_zip.writestr(target_rels_path, updated_rels_xml)
+
+            except Exception as e:
+                logging.warning(f"Failed to update rels for referenced object during removal: {e}")
+
+    def _add_object_to_file(self, obj: Any, metadata: EpcObjectMetadata) -> None:
+        """Add object to the EPC file by safely rewriting the ZIP archive.
+        The method creates a temporary ZIP archive, writes the new object
+        first, merges and writes the updated .rels files, copies all
+        remaining entries (skipping the content types and the rels being
+        updated), then writes the updated [Content_Types].xml before
+        replacing the original file. This avoids issues with append mode
+        creating overlapped entries.
+        """
        xml_content = serialize_xml(obj)

        # Create temporary file for updated EPC
@@ -816,21 +1295,25 @@ def _add_object_to_file(self, obj: Any, metadata: EpcObjectMetadata) -> None:
            temp_path = temp_file.name

        try:
-            # Copy existing EPC to temp file
            with zipfile.ZipFile(self.epc_file_path, "r") as source_zip:
                with zipfile.ZipFile(temp_path, "w", zipfile.ZIP_DEFLATED) as target_zip:
-                    # Copy all existing files except [Content_Types].xml
-                    for item in source_zip.infolist():
-                        if item.filename != "[Content_Types].xml":
-                            data = source_zip.read(item.filename)
-                            target_zip.writestr(item, data)

                    # Add new object file
-                    target_zip.writestr(metadata.file_path, xml_content.encode("utf-8"))
+                    target_zip.writestr(metadata.file_path, xml_content)
+
+                    # Update .rels files by merging with existing ones read from source
+                    updated_rels_paths = self._update_rels_files(obj, metadata, source_zip, target_zip)
+
+                    # Copy all existing files except [Content_Types].xml, the rels we updated
+                    # and the object's own part (already written above)
+                    for item in source_zip.infolist():
+                        if (
+                            item.filename == get_epc_content_type_path()
+                            or item.filename in updated_rels_paths
+                            or item.filename == metadata.file_path
+                        ):
+                            continue
+                        data = source_zip.read(item.filename)
+                        target_zip.writestr(item, data)

                    # Update [Content_Types].xml
                    updated_content_types = self._update_content_types_xml(source_zip, metadata, add=True)
-                    target_zip.writestr("[Content_Types].xml", updated_content_types)
+                    target_zip.writestr(get_epc_content_type_path(), updated_content_types)

            # Replace original file with updated version
            shutil.move(temp_path, self.epc_file_path)
@@ -839,12 +1322,14 @@ def _add_object_to_file(self, obj: Any, metadata: EpcObjectMetadata) -> None:
            # Clean up temp file on error
            if os.path.exists(temp_path):
                os.unlink(temp_path)
+            logging.error(f"Failed to add object to EPC file: {e}")
            raise
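[The rewrite-into-temp-then-move pattern used above generalizes; a minimal standalone sketch of it (hypothetical helper, not part of the diff):]

import os
import shutil
import tempfile
import zipfile
from typing import Dict, Iterable


def rewrite_zip(path: str, skip: Iterable[str], new_entries: Dict[str, str]) -> None:
    """Safely rewrite a ZIP: copy everything except `skip`, add `new_entries`, then replace the original."""
    skip = set(skip) | set(new_entries)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp:
        tmp_path = tmp.name
    try:
        with zipfile.ZipFile(path, "r") as src, zipfile.ZipFile(tmp_path, "w", zipfile.ZIP_DEFLATED) as dst:
            for item in src.infolist():
                if item.filename not in skip:
                    dst.writestr(item, src.read(item.filename))
            for name, content in new_entries.items():
                dst.writestr(name, content)
        shutil.move(tmp_path, path)
    except Exception:
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)
        raise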
+ """ # Create temporary file for updated EPC with tempfile.NamedTemporaryFile(delete=False, suffix=".epc") as temp_file: @@ -855,19 +1340,20 @@ def _remove_object_from_file(self, metadata: EpcObjectMetadata) -> None: with zipfile.ZipFile(self.epc_file_path, "r") as source_zip: with zipfile.ZipFile(temp_path, "w", zipfile.ZIP_DEFLATED) as target_zip: # Copy all existing files except the one to remove and [Content_Types].xml + # We keep .rels files as-is (they will be cleaned by clean_rels() if needed) for item in source_zip.infolist(): - if item.filename not in [metadata.file_path, "[Content_Types].xml"]: + if item.filename not in [metadata.file_path, get_epc_content_type_path()]: data = source_zip.read(item.filename) target_zip.writestr(item, data) # Update [Content_Types].xml updated_content_types = self._update_content_types_xml(source_zip, metadata, add=False) - target_zip.writestr("[Content_Types].xml", updated_content_types) + target_zip.writestr(get_epc_content_type_path(), updated_content_types) # Replace original file with updated version shutil.move(temp_path, self.epc_file_path) - except Exception as e: + except Exception: # Clean up temp file on error if os.path.exists(temp_path): os.unlink(temp_path) @@ -925,6 +1411,297 @@ def _rollback_add_object(self, identifier: Optional[str]) -> None: if identifier in self._access_order: self._access_order.remove(identifier) + def clean_rels(self) -> Dict[str, int]: + """ + Clean all .rels files by removing relationships to objects that no longer exist. + + This method: + 1. Scans all .rels files in the EPC + 2. For each relationship, checks if the target object exists + 3. Removes relationships pointing to non-existent objects + 4. Removes empty .rels files + + Returns: + Dictionary with statistics: + - 'rels_files_scanned': Number of .rels files examined + - 'relationships_removed': Number of orphaned relationships removed + - 'rels_files_removed': Number of empty .rels files removed + """ + import tempfile + import shutil + + stats = { + "rels_files_scanned": 0, + "relationships_removed": 0, + "rels_files_removed": 0, + } + + # Create temporary file for updated EPC + with tempfile.NamedTemporaryFile(delete=False, suffix=".epc") as temp_file: + temp_path = temp_file.name + + try: + with zipfile.ZipFile(self.epc_file_path, "r") as source_zip: + with zipfile.ZipFile(temp_path, "w", zipfile.ZIP_DEFLATED) as target_zip: + # Get all existing object file paths for validation + existing_object_files = {metadata.file_path for metadata in self._metadata.values()} + + # Process each file + for item in source_zip.infolist(): + if item.filename.endswith(".rels"): + # Process .rels file + stats["rels_files_scanned"] += 1 + + try: + rels_data = source_zip.read(item.filename) + rels_obj = read_energyml_xml_bytes(rels_data, Relationships) + + if rels_obj and rels_obj.relationship: + # Filter out relationships to non-existent objects + original_count = len(rels_obj.relationship) + + # Keep only relationships where the target exists + # or where the target is external (starts with ../ or http) + valid_relationships = [] + for rel in rels_obj.relationship: + target = rel.target + # Keep external references (HDF5, etc.) 
and existing objects + if ( + target.startswith("../") + or target.startswith("http") + or target in existing_object_files + or target.lstrip("/") + in existing_object_files # Also check without leading slash + ): + valid_relationships.append(rel) + + removed_count = original_count - len(valid_relationships) + stats["relationships_removed"] += removed_count + + if removed_count > 0: + logging.info( + f"Removed {removed_count} orphaned relationships from {item.filename}" + ) + + # Only write the .rels file if it has remaining relationships + if valid_relationships: + rels_obj.relationship = valid_relationships + updated_rels = serialize_xml(rels_obj) + target_zip.writestr(item.filename, updated_rels) + else: + # Empty .rels file, don't write it + stats["rels_files_removed"] += 1 + logging.info(f"Removed empty .rels file: {item.filename}") + else: + # Empty or invalid .rels, don't copy it + stats["rels_files_removed"] += 1 + + except Exception as e: + logging.warning(f"Failed to process .rels file {item.filename}: {e}") + # Copy as-is on error + data = source_zip.read(item.filename) + target_zip.writestr(item, data) + + else: + # Copy non-.rels files as-is + data = source_zip.read(item.filename) + target_zip.writestr(item, data) + + # Replace original file + shutil.move(temp_path, self.epc_file_path) + + logging.info( + f"Cleaned .rels files: scanned {stats['rels_files_scanned']}, " + f"removed {stats['relationships_removed']} orphaned relationships, " + f"removed {stats['rels_files_removed']} empty .rels files" + ) + + return stats + + except Exception as e: + # Clean up temp file on error + if os.path.exists(temp_path): + os.unlink(temp_path) + raise RuntimeError(f"Failed to clean .rels files: {e}") + + def rebuild_all_rels(self, clean_first: bool = True) -> Dict[str, int]: + """ + Rebuild all .rels files from scratch by analyzing all objects and their references. + + This method: + 1. Optionally cleans existing .rels files first + 2. Loads each object temporarily + 3. Analyzes its Data Object References (DORs) + 4. 
+
+        Args:
+            clean_first: If True, remove all existing .rels files before rebuilding
+
+        Returns:
+            Dictionary with statistics:
+            - 'objects_processed': Number of objects analyzed
+            - 'rels_files_created': Number of .rels files created
+            - 'source_relationships': Number of SOURCE relationships created
+            - 'destination_relationships': Number of DESTINATION relationships created
+        """
+        stats = {
+            "objects_processed": 0,
+            "rels_files_created": 0,
+            "source_relationships": 0,
+            "destination_relationships": 0,
+        }
+
+        logging.info(f"Starting rebuild of all .rels files for {len(self._metadata)} objects...")
+
+        # Build a map of which objects are referenced by which objects
+        # Key: target identifier, Value: list of (source_identifier, source_obj)
+        reverse_references: Dict[str, List[Tuple[str, Any]]] = {}
+
+        # First pass: analyze all objects and build the reference map
+        for identifier in self._metadata:
+            try:
+                obj = self.get_object_by_identifier(identifier)
+                if obj is None:
+                    continue
+
+                stats["objects_processed"] += 1
+
+                # Get all DORs in this object
+                dors = get_direct_dor_list(obj)
+
+                for dor in dors:
+                    try:
+                        target_identifier = get_obj_identifier(dor)
+                        if target_identifier in self._metadata:
+                            # Record this reference
+                            if target_identifier not in reverse_references:
+                                reverse_references[target_identifier] = []
+                            reverse_references[target_identifier].append((identifier, obj))
+                    except Exception:
+                        pass
+
+            except Exception as e:
+                logging.warning(f"Failed to analyze object {identifier}: {e}")
+
+        # Second pass: create the .rels files
+        # Map of rels_file_path -> Relationships object
+        rels_files: Dict[str, Relationships] = {}
+
+        # Process each object to create SOURCE relationships
+        for identifier in self._metadata:
+            try:
+                obj = self.get_object_by_identifier(identifier)
+                if obj is None:
+                    continue
+
+                # metadata = self._metadata[identifier]
+                obj_rels_path = gen_rels_path(obj, self.export_version)
+
+                # Get all DORs (objects this object references)
+                dors = get_direct_dor_list(obj)
+
+                if dors:
+                    # Create SOURCE relationships
+                    relationships = []
+
+                    for dor in dors:
+                        try:
+                            target_identifier = get_obj_identifier(dor)
+                            if target_identifier in self._metadata:
+                                target_metadata = self._metadata[target_identifier]
+
+                                rel = Relationship(
+                                    target=target_metadata.file_path,
+                                    type_value=EPCRelsRelationshipType.SOURCE_OBJECT.get_type(),
+                                    id=f"_{identifier}_{get_obj_type(get_obj_usable_class(dor))}_{target_identifier}",
+                                )
+                                relationships.append(rel)
+                                stats["source_relationships"] += 1
+
+                        except Exception as e:
+                            logging.debug(f"Failed to create SOURCE relationship: {e}")
+
+                    if relationships:
+                        if obj_rels_path not in rels_files:
+                            rels_files[obj_rels_path] = Relationships(relationship=[])
+                        rels_files[obj_rels_path].relationship.extend(relationships)
+
+            except Exception as e:
+                logging.warning(f"Failed to create SOURCE rels for {identifier}: {e}")
+
+        # Add DESTINATION relationships
+        for target_identifier, source_list in reverse_references.items():
+            try:
+                target_obj = self.get_object_by_identifier(target_identifier)
+                if target_obj is None:
+                    continue
+
+                target_metadata = self._metadata[target_identifier]
+                target_rels_path = gen_rels_path(target_obj, self.export_version)
+
+                # Create DESTINATION relationships for each object that references this one
+                for source_identifier, source_obj in source_list:
+                    try:
+                        source_metadata = self._metadata[source_identifier]
+
+                        rel = Relationship(
+                            target=source_metadata.file_path,
+                            type_value=EPCRelsRelationshipType.DESTINATION_OBJECT.get_type(),
+                            id=f"_{target_identifier}_{get_obj_type(get_obj_usable_class(source_obj))}_{source_identifier}",
+                        )
+
+                        if target_rels_path not in rels_files:
+                            rels_files[target_rels_path] = Relationships(relationship=[])
+                        rels_files[target_rels_path].relationship.append(rel)
+                        stats["destination_relationships"] += 1
+
+                    except Exception as e:
+                        logging.debug(f"Failed to create DESTINATION relationship: {e}")
+
+            except Exception as e:
+                logging.warning(f"Failed to create DESTINATION rels for {target_identifier}: {e}")
+
+        stats["rels_files_created"] = len(rels_files)
+
+        # Third pass: write the new EPC with updated .rels files
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".epc") as temp_file:
+            temp_path = temp_file.name
+
+        try:
+            with zipfile.ZipFile(self.epc_file_path, "r") as source_zip:
+                with zipfile.ZipFile(temp_path, "w", zipfile.ZIP_DEFLATED) as target_zip:
+                    # Copy existing files, skipping .rels that are rewritten below
+                    # (and all .rels when clean_first is set) to avoid duplicate entries
+                    for item in source_zip.infolist():
+                        if item.filename in rels_files:
+                            continue
+                        if clean_first and item.filename.endswith(".rels"):
+                            continue
+                        data = source_zip.read(item.filename)
+                        target_zip.writestr(item, data)
+
+                    # Write new .rels files
+                    for rels_path, rels_obj in rels_files.items():
+                        rels_xml = serialize_xml(rels_obj)
+                        target_zip.writestr(rels_path, rels_xml)
+
+            # Replace original file
+            shutil.move(temp_path, self.epc_file_path)
+
+            logging.info(
+                f"Rebuilt .rels files: processed {stats['objects_processed']} objects, "
+                f"created {stats['rels_files_created']} .rels files, "
+                f"added {stats['source_relationships']} SOURCE and "
+                f"{stats['destination_relationships']} DESTINATION relationships"
+            )
+
+            return stats
+
+        except Exception as e:
+            # Clean up temp file on error
+            if os.path.exists(temp_path):
+                os.unlink(temp_path)
+            raise RuntimeError(f"Failed to rebuild .rels files: {e}")
+
    def __repr__(self) -> str:
        """String representation."""
        return (
@@ -934,6 +1711,22 @@ def __repr__(self) -> str:
            f"cache_hit_rate={self.stats.cache_hit_rate:.1f}%)"
        )

+    def dumps_epc_content_and_files_lists(self):
+        """Dump EPC content and files lists for debugging."""
+        content_list = []
+        file_list = []
+
+        with zipfile.ZipFile(self.epc_file_path, "r") as zf:
+            file_list = zf.namelist()
+
+            for item in zf.infolist():
+                content_list.append(f"{item.filename} - {item.file_size} bytes")
+
+        return {
+            "content_list": sorted(content_list),
+            "file_list": sorted(file_list),
+        }
+

# Utility functions for backward compatibility
diff --git a/energyml-utils/src/energyml/utils/exception.py b/energyml-utils/src/energyml/utils/exception.py
index 60b571e..87e128c 100644
--- a/energyml-utils/src/energyml/utils/exception.py
+++ b/energyml-utils/src/energyml/utils/exception.py
@@ -38,4 +38,4 @@ def __init__(self, t: Optional[str] = None):

class UnparsableFile(Exception):
    def __init__(self, t: Optional[str] = None):
-        super().__init__(f"File is not parsable for an EPC file. Please use RawFile class for non energyml files.")
+        super().__init__("File is not parsable for an EPC file. Please use RawFile class for non energyml files.")
diff --git a/energyml-utils/src/energyml/utils/introspection.py b/energyml-utils/src/energyml/utils/introspection.py
index e91624b..e764eba 100644
--- a/energyml-utils/src/energyml/utils/introspection.py
+++ b/energyml-utils/src/energyml/utils/introspection.py
@@ -18,11 +18,14 @@
    epoch_to_date,
    epoch,
    gen_uuid,
+    qualified_type_to_content_type,
    snake_case,
    pascal_case,
    path_next_attribute,
+    OptimizedRegex,
)
from .manager import (
+    class_has_parent_with_name,
    get_class_pkg,
    get_class_pkg_version,
    RELATED_MODULES,
@@ -30,9 +33,10 @@
    get_sub_classes,
    get_classes_matching_name,
    dict_energyml_modules,
+    reshape_version_from_regex_match,
)
from .uri import Uri, parse_uri
-from .xml import parse_content_type, ENERGYML_NAMESPACES, parse_qualified_type
+from .constants import parse_content_type, ENERGYML_NAMESPACES, parse_qualified_type


def is_enum(cls: Union[type, Any]):
@@ -91,7 +95,7 @@ def find_class_in_module(module_name, class_name):
        try:
            if cls_name == class_name or cls.Meta.name == class_name:
                return cls
-        except Exception as e:
+        except Exception:
            pass
    logging.error(f"Not Found : {module_name}; {class_name}")
    return None
@@ -106,7 +110,8 @@ def search_class_in_module_from_partial_name(module_name: str, class_partial_nam
    """
    try:
-        module = import_module(module_name)
+        import_module(module_name)
+        # module = import_module(module_name)
        classes = get_module_classes_from_name(module_name)
        matching_classes = [cls for cls_name, cls in classes if class_partial_name.lower() in cls_name.lower()]
        return matching_classes
@@ -287,7 +292,7 @@ def import_related_module(energyml_module_name: str) -> None:
    for m in related:
        try:
            import_module(m)
-        except Exception as e:
+        except Exception:
            pass
            # logging.error(e)
@@ -331,7 +336,7 @@ def get_class_fields(cls: Union[type, Any]) -> Dict[str, Field]:
    try:
        # print(list_function_parameters_with_types(cls.__new__, True))
        return list_function_parameters_with_types(cls.__new__, True)
-    except AttributeError as e:
+    except AttributeError:
        # For not working types like proxy type for C++ binding
        res = {}
        for a_name, a_type in inspect.getmembers(cls):
@@ -639,9 +644,52 @@ def class_match_rgx(
    return False


-def is_dor(obj: any) -> bool:
+def get_dor_obj_info(dor: Any) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[type], Optional[str]]:
+    """
+    From a DOR object, return a tuple (uuid, package name, package version, object_type, object_version)
+
+    :param dor: a DataObjectReference object or ContentElement object
+    :return: tuple (uuid, package name, package version, object_type, object_version)
+        1. uuid: the UUID of the object
+        2. package name: the name of the package where the object is defined
+        3. package version: the version of the package where the object is defined
+        4. object_type: the class of the object
+        5. object_version: the version of the object
+
+    Example for a resqml v2.2 TriangulatedSetRepresentation :
+    ('123e4567-e89b-12d3-a456-426614174000', 'resqml', '2.2', <class 'energyml.resqml.v2_2.resqmlv2.TriangulatedSetRepresentation'>, '1.0')
+    """
+    obj_version = None
+    obj_cls = None
+    pkg_version = None
+    pkg = None
+    if hasattr(dor, "content_type"):
+        content_type = get_object_attribute_no_verif(dor, "content_type")
+        if content_type is not None:
+            obj_cls = get_class_from_content_type(content_type)
+    elif hasattr(dor, "qualified_type"):
+        qualified_type = get_object_attribute_no_verif(dor, "qualified_type")
+        if qualified_type is not None:
+            obj_cls = get_class_from_qualified_type(qualified_type)
+
+    obj_version = get_obj_version(dor)
+
+    uuid = get_obj_uuid(dor)
+
+    if obj_cls is not None:
+        p = OptimizedRegex.ENERGYML_MODULE_NAME
+        match = p.search(obj_cls.__module__)
+        if match is not None:
+            pkg_version = reshape_version_from_regex_match(match)
+            pkg = match.group("pkg")
+
+    return uuid, pkg, pkg_version, obj_cls, obj_version
+
+
+def is_dor(obj: Any) -> bool:
    return (
        "dataobjectreference" in get_obj_type(obj).lower()
+        or class_has_parent_with_name(obj, "DataObjectReference")
        or get_object_attribute(obj, "ContentType") is not None
        or get_object_attribute(obj, "QualifiedType") is not None
    )
@@ -1068,7 +1116,7 @@ def get_obj_version(obj: Any) -> Optional[str]:
    """
    try:
        return get_object_attribute_no_verif(obj, "object_version")
-    except AttributeError as e:
+    except AttributeError:
        try:
            return get_object_attribute_no_verif(obj, "version_string")
        except Exception:
@@ -1085,7 +1133,7 @@ def get_obj_title(obj: Any) -> Optional[str]:
    """
    try:
        return get_object_attribute_advanced(obj, "citation.title")
-    except AttributeError as e:
+    except AttributeError:
        return None
@@ -1138,6 +1186,26 @@ def get_obj_pkg_pkgv_type_uuid_version(
    return pkg, pkg_v, obj_type, obj_uuid, obj_version


+def get_obj_qualified_type(obj: Any) -> str:
+    """
+    Generates an object qualified type as: '<pkg><pkgVersion>.<type>' (e.g. 'resqml22.TriangulatedSetRepresentation')
+    :param obj: the EnergyML object
+    :return: str
+    """
+    pkg, pkg_v, obj_type, _, _ = get_obj_pkg_pkgv_type_uuid_version(obj)
+    if pkg is None or pkg_v is None or obj_type is None:
+        raise ValueError(f"Cannot get qualified type for object of type {type(obj)}")
+    return f"{pkg}{pkg_v}.{obj_type}"
+
+
+def get_obj_content_type(obj: Any) -> str:
+    qualified_type = get_obj_qualified_type(obj)
+    res = qualified_type_to_content_type(qualified_type)
+    if res is None:
+        raise ValueError(f"Cannot get content type for object of type {type(obj)} from qualified type {qualified_type}")
+    return res
+
+
def get_obj_identifier(obj: Any) -> str:
    """
    Generates an objet identifier as : 'OBJ_UUID.OBJ_VERSION'
@@ -1211,6 +1279,31 @@ def as_obj_prefixed_class_if_possible(o: Any) -> Any:
    if o is not None:
        if not isinstance(o, type):
            o_type = type(o)
+            # logging.info(
+            #     f"Trying to convert object of type {o_type.__module__} -- {o_type.__name__} to obj prefixed class : {o_type.__name__.lower().startswith('obj')}"
+            # )
+            if o_type.__name__.lower().startswith("obj"):
+                # search for sub class with same name but without Obj prefix
+                if hasattr(o_type, "Meta") and not hasattr(o_type.Meta, "namespace"):
+                    try:
+                        sub_name = o_type.__name__[3:]  # drop the "Obj" prefix
+                        sub_class_name = f"{o_type.__module__}.{sub_name}"
+                        # logging.info(f"\n\nSearching subclass {sub_class_name} for {o_type}")
+                        sub = get_class_from_name(sub_class_name)
+                        # logging.info(f"Found subclass {sub} for {sub}")
+                        if sub is not None and issubclass(sub, o_type):
+                            try:
+                                try:
+                                    if sub.Meta is not None:
+                                        o_type.Meta.namespace = sub.Meta.namespace  # keep the same namespace
+                                except Exception:
+                                    logging.debug(f"Failed to set namespace for {sub}")
+                            except Exception as e:
+                                # logging.debug(f"Failed to convert {o} to {sub}")
+                                logging.debug(e)
+                    except Exception:
+                        logging.debug(f"Error using Meta class for {o_type}")
+                return o
            if o_type.__bases__ is not None:
                for bc in o_type.__bases__:
                    # print(bc)
@@ -1410,7 +1503,7 @@ def get_class_from_simple_name(simple_name: str, energyml_module_context=None) -
        energyml_module_context = []
    try:
        return eval(simple_name)
-    except NameError as e:
+    except NameError:
        for mod in energyml_module_context:
            try:
                exec(f"from {mod} import *")
@@ -1446,7 +1539,7 @@ def _gen_str_from_attribute_name(attribute_name: Optional[str], _parent_class: O
    elif "mime_type" in attribute_name_lw and (
        "external" in _parent_class.__name__.lower() and "part" in _parent_class.__name__.lower()
    ):
-        return f"application/x-hdf5"
+        return "application/x-hdf5"
    elif "type" in attribute_name_lw:
        if attribute_name_lw.startswith("qualified"):
            return get_qualified_type_from_class(get_classes_matching_name(_parent_class, "Abstract")[0])
@@ -1635,91 +1728,3 @@ def _random_value_from_class(
    logging.error(f"@_random_value_from_class Not supported object type generation {cls}")

    return None
-
-
-if __name__ == "__main__":
-    # # poetry run python -m src.energyml.utils.introspection
-
-    from energyml.eml.v2_3.commonv2 import *
-    from energyml.eml.v2_0.commonv2 import Citation as Cit201
-    from energyml.resqml.v2_0_1.resqmlv2 import TriangulatedSetRepresentation as Tr20, ObjTriangulatedSetRepresentation
-    from energyml.resqml.v2_2.resqmlv2 import (
-        TriangulatedSetRepresentation,
-        FaultInterpretation,
-    )
-    from .serialization import *
-
-    # # with open(
-    # #     "C:/Users/Cryptaro/Downloads/test/obj_TriangulatedSetRepresentation_9298c0c3-7418-4c70-8388-e6071c95074e.xml",
-    # #     "rb",
-    # # ) as f:
-    # #     f_content = f.read()
-    # #     print(read_energyml_xml_bytes(f_content))
-
-    fi_cit = Citation(
-        title="An interpretation",
-        originator="Valentin",
-        creation=epoch_to_date(epoch()),
-        editor="test",
-        format="Geosiris",
-        last_update=epoch_to_date(epoch()),
-    )
-
-    fi = FaultInterpretation(
-        citation=fi_cit,
-        uuid=gen_uuid(),
-        object_version="0",
-    )
-
-    tr_cit = Citation(
-        title="--",
-        # title="test title",
-        originator="Valentin",
-        creation=epoch_to_date(epoch()),
-        editor="test",
-        format="Geosiris",
-        last_update=epoch_to_date(epoch()),
-    )
-
-    # tr_cit201 = Cit201(
-    #     title="--",
-    #     # title="test title",
-    #     originator="Valentin",
-    #     # creation=str(epoch_to_date(epoch()))
-    #     editor="test",
-    #     format="Geosiris",
-    #     # last_update=str(epoch_to_date(epoch())),
-    # )
-    dor = DataObjectReference(
-        uuid=fi.uuid,
-        title="a DOR title",
-        object_version="0",
-        qualified_type="a wrong qualified type",
-    )
-    tr = TriangulatedSetRepresentation(
-        citation=tr_cit,
-        uuid=gen_uuid(),
-        represented_object=dor,
-    )
-
-    # tr201 = Tr20(
-    #     citation=tr_cit201,
-    #     uuid=gen_uuid(),
-    # )
-    # tr201_bis = ObjTriangulatedSetRepresentation(
-    #     citation=tr_cit201,
-    #     uuid=gen_uuid(),
-    # )
-    # # print(get_obj_uri(tr201, "coucou"))
-
-    # print(get_obj_usable_class(tr))
-    # print(get_obj_usable_class(tr201))
-
-    # print(serialize_xml(tr201_bis, False))
-    # print(serialize_xml(tr201, False))
-    # # print(serialize_json(tr201))
-    # print(serialize_xml(as_obj_prefixed_class_if_possible(tr201)))
-    # # print("--> ", serialize_json(tr))
-    # # print(serialize_xml((get_usable_class(tr201))(tr201)))
-    print(get_all_possible_instanciable_classes_for_attribute(tr, "represented_object"))
-    print(get_all_possible_instanciable_classes_for_attribute(tr, "triangle_patch"))
diff --git a/energyml-utils/src/energyml/utils/manager.py b/energyml-utils/src/energyml/utils/manager.py
index 2a62af8..23933b3 100644
--- a/energyml-utils/src/energyml/utils/manager.py
+++ b/energyml-utils/src/energyml/utils/manager.py
@@ -4,9 +4,15 @@
import inspect
import logging
import pkgutil
-from typing import Union, Any, Dict
+import re
+from typing import Union, Any, Dict, List, Optional

-from .constants import *
+from energyml.utils.constants import (
+    ENERGYML_MODULES_NAMES,
+    RELATED_MODULES,
+    RGX_ENERGYML_MODULE_NAME,
+    RGX_PROJECT_VERSION,
+)


def get_related_energyml_modules_name(cls: Union[type, Any]) -> List[str]:
@@ -98,6 +104,26 @@ def get_sub_classes(cls: type) -> List[type]:
    return list(dict.fromkeys(sub_classes))


+def class_has_parent_with_name(
+    cls: type,
+    parent_name_rgx: str,
+    re_flags=re.IGNORECASE,
+) -> bool:
+    """
+    Check if the class :param:`cls` has a parent class matching the regex :param:`parent_name_rgx`.
+    :param cls: the class (or an instance of it) to inspect
+    :param parent_name_rgx: regex matched against the names of the classes in the MRO
+    :param re_flags: flags passed to re.match (case-insensitive by default)
+    :return: True if any class in the MRO matches the regex
+    """
+    if not isinstance(cls, type):
+        cls = type(cls)
+    for parent in inspect.getmro(cls):
+        if re.match(parent_name_rgx, parent.__name__, re_flags):
+            return True
+    return False
+
+
def get_classes_matching_name(
    cls: type,
    name_rgx: str,
@@ -181,6 +207,21 @@ def reshape_version(version: str, nb_digit: int) -> str:
    return version


+def reshape_version_from_regex_match(
+    match: Optional[re.Match], print_dev_version: bool = True, nb_digit: int = 2
+) -> str:
+    """
+    Reshape a version from a regex match object.
+    :param match: A regex match object containing the version information.
+    :param print_dev_version: If True, append 'dev' to the version if applicable.
+    :param nb_digit: The number of digits to keep in the version.
+    :return: The reshaped version string.
+ """ + return reshape_version(match.group("versionNumber"), nb_digit) + ( + "dev" + match.group("versionDev") if match.group("versionDev") is not None and print_dev_version else "" + ) + + def get_class_pkg_version(cls, print_dev_version: bool = True, nb_max_version_digits: int = 2): p = re.compile(RGX_ENERGYML_MODULE_NAME) class_module = None @@ -192,9 +233,7 @@ def get_class_pkg_version(cls, print_dev_version: bool = True, nb_max_version_di class_module = type(cls).__module__ match = p.search(class_module) - return reshape_version(match.group("versionNumber"), nb_max_version_digits) + ( - "dev" + match.group("versionDev") if match.group("versionDev") is not None and print_dev_version else "" - ) + return reshape_version_from_regex_match(match, print_dev_version, nb_max_version_digits) # ProtocolDict = DefaultDict[str, MessageDict] diff --git a/energyml-utils/src/energyml/utils/serialization.py b/energyml-utils/src/energyml/utils/serialization.py index c48a3ec..54a105d 100644 --- a/energyml-utils/src/energyml/utils/serialization.py +++ b/energyml-utils/src/energyml/utils/serialization.py @@ -15,10 +15,7 @@ from xsdata.formats.dataclass.models.generics import DerivedElement from xsdata.formats.dataclass.parsers import XmlParser, JsonParser from xsdata.formats.dataclass.parsers.config import ParserConfig -from xsdata.formats.dataclass.parsers.handlers import ( - LxmlEventHandler, - XmlEventHandler, -) + from xsdata.formats.dataclass.serializers import JsonSerializer from xsdata.formats.dataclass.serializers import XmlSerializer from xsdata.formats.dataclass.serializers.config import SerializerConfig @@ -106,10 +103,10 @@ def read_energyml_xml_bytes(file: bytes, obj_type: Optional[type] = None) -> Any except xsdata.exceptions.ParserError as e: if len(e.args) > 0: if "unknown property" in e.args[0].lower(): - logging.error(f"Trying reading without fail on unknown attribute/property") + logging.error("Trying reading without fail on unknown attribute/property") try: return _read_energyml_xml_bytes_as_class(file, obj_type, False, False) - except Exception as e: + except Exception: logging.error(traceback.print_stack()) pass # Otherwise @@ -269,14 +266,19 @@ def read_energyml_obj(data: Union[str, bytes], format_: str = "xml") -> Any: def serialize_xml(obj, check_obj_prefixed_classes: bool = True) -> str: + # logging.debug(f"[1] Serializing object of type {type(obj)}") obj = as_obj_prefixed_class_if_possible(obj) if check_obj_prefixed_classes else obj + # logging.debug(f"[2] Serializing object of type {type(obj)}") context = XmlContext( # element_name_generator=text.camel_case, # attribute_name_generator=text.kebab_case ) serializer_config = SerializerConfig(indent=" ") serializer = XmlSerializer(context=context, config=serializer_config) - return serializer.render(obj, ns_map=ENERGYML_NAMESPACES) + # res = serializer.render(obj) + res = serializer.render(obj, ns_map=ENERGYML_NAMESPACES) + # logging.debug(f"[3] Serialized XML with meta namespace : {obj.Meta.namespace}: {serialize_json(obj)}") + return res def serialize_json( @@ -371,7 +373,7 @@ def _read_json_dict(obj_json: Any, sub_obj: List) -> Any: ) else: logging.error(f"No matching attribute for attribute {att} in {obj}") - except Exception as e: + except Exception: logging.error(f"Error assign attribute value for attribute {att} in {obj}") except Exception as e: logging.error( diff --git a/energyml-utils/src/energyml/utils/uri.py b/energyml-utils/src/energyml/utils/uri.py index 57602cd..da05b1d 100644 --- 
a/energyml-utils/src/energyml/utils/uri.py +++ b/energyml-utils/src/energyml/utils/uri.py @@ -2,9 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from typing import Optional from dataclasses import dataclass, field -import re from .constants import ( - URI_RGX, URI_RGX_GRP_DATASPACE, URI_RGX_GRP_DOMAIN, URI_RGX_GRP_DOMAIN_VERSION, @@ -80,6 +78,11 @@ def is_object_uri(self): def get_qualified_type(self): return f"{self.domain}{self.domain_version}.{self.object_type}" + def as_identifier(self): + if not self.is_object_uri(): + return None + return f"{self.uuid}.{self.version if self.version is not None else ''}" + def __str__(self): res = "eml:///" if self.dataspace is not None and len(self.dataspace) > 0: @@ -107,4 +110,6 @@ def __str__(self): def parse_uri(uri: str) -> Optional[Uri]: - return Uri.parse(uri) + if uri is None or len(uri) <= 0: + return None + return Uri.parse(uri.strip()) diff --git a/energyml-utils/src/energyml/utils/workspace.py b/energyml-utils/src/energyml/utils/workspace.py index b59e2d9..8371644 100644 --- a/energyml-utils/src/energyml/utils/workspace.py +++ b/energyml-utils/src/energyml/utils/workspace.py @@ -1,7 +1,11 @@ # Copyright (c) 2023-2024 Geosiris. # SPDX-License-Identifier: Apache-2.0 +from abc import abstractmethod from dataclasses import dataclass -from typing import Optional, Any, List +from typing import Optional, Any, Union + +from energyml.utils.uri import Uri +import numpy as np @dataclass @@ -16,10 +20,26 @@ def get_object_by_identifier(self, identifier: str) -> Optional[Any]: def get_object_by_uuid(self, uuid: str) -> Optional[Any]: return self.get_object(uuid, None) - def read_external_array( - self, - energyml_array: Any, - root_obj: Optional[Any] = None, - path_in_root: Optional[str] = None, - ) -> List[Any]: - raise NotImplementedError("EnergymlWorkspace.get_object") + # def read_external_array( + # self, + # energyml_array: Any, + # root_obj: Optional[Any] = None, + # path_in_root: Optional[str] = None, + # ) -> List[Any]: + # raise NotImplementedError("EnergymlWorkspace.get_object") + + @abstractmethod + def add_object(self, obj: Any) -> bool: + raise NotImplementedError("EnergymlWorkspace.add_object") + + @abstractmethod + def remove_object(self, identifier: Union[str, Uri]) -> None: + raise NotImplementedError("EnergymlWorkspace.remove_object") + + @abstractmethod + def read_array(self, proxy: Union[str, Uri, Any], path_in_external: str) -> Optional[np.ndarray]: + raise NotImplementedError("EnergymlWorkspace.read_array") + + @abstractmethod + def write_array(self, proxy: Union[str, Uri, Any], path_in_external: str, array: Any) -> bool: + raise NotImplementedError("EnergymlWorkspace.write_array") diff --git a/energyml-utils/src/energyml/utils/xml.py b/energyml-utils/src/energyml/utils/xml.py index 7338cca..94e02ee 100644 --- a/energyml-utils/src/energyml/utils/xml.py +++ b/energyml-utils/src/energyml/utils/xml.py @@ -1,11 +1,13 @@ # Copyright (c) 2023-2024 Geosiris. 
# SPDX-License-Identifier: Apache-2.0 +from io import BytesIO import logging -from typing import Any, Union +from typing import Union, Optional +import re from lxml import etree as ETREE # type: Any -from .constants import * +from .constants import ENERGYML_NAMESPACES, ENERGYML_NAMESPACES_PACKAGE, OptimizedRegex, parse_content_type def get_pkg_from_namespace(namespace: str) -> Optional[str]: @@ -25,11 +27,12 @@ def get_root_namespace(tree: ETREE.Element) -> str: return tree.nsmap.get(tree.prefix, tree.nsmap.get(None, "")) -def get_class_name_from_xml(tree: ETREE.Element) -> str: +def get_class_name_from_xml(tree: ETREE.Element) -> Optional[str]: root_namespace = get_root_namespace(tree) pkg = get_pkg_from_namespace(root_namespace) if pkg is None: logging.error(f"No pkg found for elt {tree}") + return None else: if pkg == "opc": return "energyml.opc.opc." + get_root_type(tree)
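[Consolidation sketch, not part of the diff: the 'uuid + "." + (version or "")' normalization is repeated in _load_object, _add_to_cache, remove_object and add_rels_for_object, and matches the shape produced by the new Uri.as_identifier(). A helper such as the hypothetical _normalize_identifier below could centralize it.]

from typing import Union
from energyml.utils.uri import Uri, parse_uri


def _normalize_identifier(identifier: Union[str, Uri]) -> str:
    """Return a 'UUID.version' identifier from a plain identifier string or an eml:/// URI."""
    uri = identifier if isinstance(identifier, Uri) else parse_uri(identifier)
    if uri is not None and uri.uuid is not None:
        # Same shape as Uri.as_identifier(): 'uuid.' when version is None
        return uri.uuid + "." + (uri.version or "")
    assert isinstance(identifier, str)
    return identifier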