From 3e8553360e5d74e2267f46801bd53a8909ca008c Mon Sep 17 00:00:00 2001 From: Neil Shephard Date: Tue, 18 Nov 2025 10:29:32 +0000 Subject: [PATCH 1/2] bug: decode numpy arrays of strings I found whilst working through TopoStats refactoring that the inclusion of the `config` as an attribute to `TopoStats` objects resulted in some of the values, which are lists of strings, being converted to Numpy arrays. These could not be decoded directly and `item[()].decode("utf-8")` failed with an `AttributeError` stating that `numpy.ndarray does not have attribute decode`. The solution proposed here is to capture this error and if `item[()]` is an instance of `np.ndarray` to iterate over the list decoding each item in turn. The typing is explicitly ignored because we want it as a list rather than a dictionary. Test included, and whilst it passes seems a bit light but does mirror the scenario encountered (had to use `group_path` as the higher level of nesting). --- AFMReader/io.py | 10 +++++++++- tests/test_io.py | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/AFMReader/io.py b/AFMReader/io.py index 74ada20..9113b62 100644 --- a/AFMReader/io.py +++ b/AFMReader/io.py @@ -5,6 +5,7 @@ from typing import BinaryIO import h5py +import numpy as np from loguru import logger from ruamel.yaml import YAML, YAMLError @@ -255,7 +256,14 @@ def unpack_hdf5(open_hdf5_file: h5py.File, group_path: str = "/") -> dict: # Decode byte strings to utf-8. The data type "O" is a byte string. 
elif isinstance(item, h5py.Dataset) and item.dtype == "O": # Byte string - data[key] = item[()].decode("utf-8") + try: + data[key] = item[()].decode("utf-8") + # Numpy arrays of strings can not be directly decoded, have to iterate over each item + except AttributeError as e: + if isinstance(item[()], np.ndarray): + data[key] = [_item.decode("utf-8") for _item in item[()]] # type: ignore + else: + raise e else: # Another type of dataset data[key] = item[()] diff --git a/tests/test_io.py b/tests/test_io.py index 885628f..5d009e5 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -201,6 +201,46 @@ def test_unpack_hdf5_nested_dict_group_path(tmp_path: Path) -> None: np.testing.assert_equal(result, expected) +def test_unpack_hdf5_list_of_bytes(tmp_path: Path) -> None: + """Test loading a list of strings which are encoded to Numpy array on saving.""" + to_save = { + "config": { + "grainstats": { + "class_names": np.asarray([b"DNA", b"Protein"], dtype="S7"), + "edge_detection_method": "binary_erosion", + "extract_height_profile": True, + "run": True, + } + } + } + group_path = "/config/grainstats/" + expected = { + "class_names": np.asarray([b"DNA", b"Protein"], dtype="S7"), + "edge_detection_method": "binary_erosion", + "extract_height_profile": True, + "run": True, + } + # Manually save the dictionary to HDF5 format + with h5py.File(tmp_path / "hdf5_file_list_of_strings", "w") as f: + # t_path = Path.cwd() + # with h5py.File(t_path / "tmp" / "something_else", "w") as f: + config = f.create_group("config") + grainstats = config.create_group("grainstats") + grainstats.create_dataset("class_names", data=to_save["config"]["grainstats"]["class_names"]) + grainstats.create_dataset( + "edge_detection_method", data=to_save["config"]["grainstats"]["edge_detection_method"] + ) + grainstats.create_dataset( + "extract_height_profile", data=to_save["config"]["grainstats"]["extract_height_profile"] + ) + grainstats.create_dataset("run", 
data=to_save["config"]["grainstats"]["run"]) + + # Load it back in and check if the list is the same + with h5py.File(tmp_path / "hdf5_file_list_of_strings", "r") as f: + result = unpack_hdf5(open_hdf5_file=f, group_path=group_path) + np.testing.assert_equal(result, expected) + + def test_read_yaml() -> None: """Test reading of YAML file.""" sample_config = read_yaml(RESOURCES / "test.yaml") From 0b1c527fd3b94c4a61a6539da9950d79c56a00d0 Mon Sep 17 00:00:00 2001 From: Neil Shephard Date: Tue, 18 Nov 2025 10:52:23 +0000 Subject: [PATCH 2/2] feature: handle newer topostats file versions Restructuring of TopoStats to define classes means the `TopoStats` object now holds the `topostats_version` rather than `topostats_file_version`. This commit allows both to be handled and switches to using [`packaging.version`](https://packaging.pypa.io/en/stable/version.html) to do so which ensures a more consistent approach to comparing version numbers. Yet to write a test for working with newer `.topostats` where `topostats_version >= 2.3.2` as work is still on-going but parameterised test is in place for when work is complete. --- .pylintrc | 2 +- AFMReader/topostats.py | 12 +++++++++--- tests/test_topostats.py | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/.pylintrc b/.pylintrc index 1667c84..350da63 100644 --- a/.pylintrc +++ b/.pylintrc @@ -62,7 +62,7 @@ py-version=3.9 # When enabled, pylint would attempt to guess common misconfiguration and emit # user-friendly hints instead of false-positive error messages. -suggestion-mode=yes +# suggestion-mode=yes # Allow loading of arbitrary C extensions. Extensions are imported into the # active Python interpreter and may run arbitrary code. 
diff --git a/AFMReader/topostats.py b/AFMReader/topostats.py index 8dc9148..9b3ac44 100644 --- a/AFMReader/topostats.py +++ b/AFMReader/topostats.py @@ -5,6 +5,7 @@ import h5py +from packaging.version import parse as parse_version from AFMReader.io import unpack_hdf5 from AFMReader.logging import logger @@ -41,10 +42,15 @@ def load_topostats(file_path: Path | str) -> dict[str, Any]: try: with h5py.File(file_path, "r") as f: data = unpack_hdf5(open_hdf5_file=f, group_path="/") - if str(data["topostats_file_version"]) >= "0.2": + # Handle different names for variables holding the file version (<=0.3) or the newer topostats version + version = ( + data["topostats_file_version"] + if "topostats_file_version" in data.keys() # pylint: disable=consider-iterating-dictionary + else data["topostats_version"] + ) + if parse_version(str(version)) > parse_version("0.2"): data["img_path"] = Path(data["img_path"]) - file_version = data["topostats_file_version"] - logger.info(f"[{filename}] TopoStats file version : {file_version}") + logger.info(f"[{filename}] TopoStats file version : {version}") except OSError as e: if "Unable to open file" in str(e): diff --git a/tests/test_topostats.py b/tests/test_topostats.py index 4a1dc60..66a7989 100644 --- a/tests/test_topostats.py +++ b/tests/test_topostats.py @@ -99,7 +99,7 @@ def test_load_topostats( assert topostats_data["pixel_to_nm_scaling"] == pytest.approx(pixel_to_nm_scaling) assert topostats_data["image"].shape == image_shape assert topostats_data["image"].sum() == pytest.approx(image_sum) - if version >= "0.2": + if version > "0.2": assert isinstance(topostats_data["img_path"], Path)