computational-cell-analytics · constantinpape · Sep 2, 2025 · Sep 3, 2025 · Sep 3, 2025 · Sep 8, 2025
diff --git a/flamingo_tools/file_utils.py b/flamingo_tools/file_utils.py
@@ -7,6 +7,8 @@
 import zarr
 from elf.io import open_file
 
+from .s3_utils import get_s3_path
+
 try:
     from zarr.abc.store import Store
 except ImportError:
@@ -67,7 +69,9 @@ def read_tif(file_path: str) -> Union[np.ndarray, np.memmap]:
     return x
 
 
-def read_image_data(input_path: Union[str, Store], input_key: Optional[str]) -> np.typing.ArrayLike:
+def read_image_data(
+    input_path: Union[str, Store], input_key: Optional[str], from_s3: bool = False
+) -> np.typing.ArrayLike:
     """Read flamingo image data, stored in various formats.
 
     Args:
@@ -76,10 +80,16 @@ def read_image_data(input_path: Union[str, Store], input_key: Optional[str]) ->
             Access via S3 is only supported for a zarr container.
         input_key: The key (= internal path) for a zarr or n5 container.
             Set it to None if the data is stored in a tif file.
+        from_s3: Whether to read the data from S3.
 
     Returns:
         The data, loaded either as a numpy mem-map, a numpy array, or a zarr / n5 array.
     """
+    if from_s3:
+        assert input_key is not None
+        s3_store, fs = get_s3_path(input_path)
+        return zarr.open(s3_store, mode="r")[input_key]
+
     if input_key is None:
         input_ = read_tif(input_path)
     elif isinstance(input_path, str):

diff --git a/flamingo_tools/measurements.py b/flamingo_tools/measurements.py
@@ -3,7 +3,7 @@
 import warnings
 from concurrent import futures
 from functools import partial
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Union
 
 import numpy as np
 import pandas as pd
@@ -60,10 +60,13 @@ def _get_bounding_box_and_center(table, seg_id, resolution, shape, dilation):
         for bmin, bmax, sh in zip(bb_min, bb_max, shape)
     )
 
+    if isinstance(resolution, float):
+        resolution = (resolution,) * 3
+
     center = (
-        int(row.anchor_z.item() / resolution),
-        int(row.anchor_y.item() / resolution),
-        int(row.anchor_x.item() / resolution),
+        int(row.anchor_z.item() / resolution[0]),
+        int(row.anchor_y.item() / resolution[1]),
+        int(row.anchor_x.item() / resolution[2]),
     )
 
     return bb, center
@@ -307,7 +310,7 @@ def compute_object_measures(
     image_key: Optional[str] = None,
     segmentation_key: Optional[str] = None,
     n_threads: Optional[int] = None,
-    resolution: float = 0.38,
+    resolution: Union[float, Tuple[float, ...]] = 0.38,
     force: bool = False,
     feature_set: str = "default",
     s3_flag: bool = False,
@@ -359,8 +362,8 @@ def compute_object_measures(
         table = table[table["component_labels"].isin(component_list)]
 
     # Then, open the volumes.
-    image = read_image_data(image_path, image_key)
-    segmentation = read_image_data(segmentation_path, segmentation_key)
+    image = read_image_data(image_path, image_key, from_s3=s3_flag)
+    segmentation = read_image_data(segmentation_path, segmentation_key, from_s3=s3_flag)
 
     measures = compute_object_measures_impl(
         image, segmentation, n_threads, resolution, table=table, feature_set=feature_set,

diff --git a/flamingo_tools/segmentation/postprocessing.py b/flamingo_tools/segmentation/postprocessing.py
@@ -1,7 +1,8 @@
 import math
 import multiprocessing as mp
+import threading
 from concurrent import futures
-from typing import Callable, List, Optional, Tuple
+from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union
 
 import elf.parallel as parallel
 import numpy as np
@@ -15,6 +16,9 @@
 from scipy.spatial import distance
 from scipy.spatial import cKDTree, ConvexHull
 from skimage import measure
+from skimage.filters import gaussian
+from skimage.feature import peak_local_max
+from skimage.segmentation import find_boundaries, watershed
 from sklearn.neighbors import NearestNeighbors
 from tqdm import tqdm
 
@@ -732,3 +736,134 @@ def filter_cochlea_volume(
         combined_dilated[combined_dilated > 0] = 1
 
     return combined_dilated
+
+
+def split_nonconvex_objects(
+    segmentation: np.typing.ArrayLike,
+    output: np.typing.ArrayLike,
+    segmentation_table: pd.DataFrame,
+    min_size: int,
+    resolution: Union[float, Sequence[float]],
+    height_map: Optional[np.typing.ArrayLike] = None,
+    component_labels: Optional[List[int]] = None,
+    n_threads: Optional[int] = None,
+) -> Dict[int, List[int]]:
+    """Split nonconvex objects into multiple parts inplace and return the old-to-new id mapping.
+
+    Args:
+        segmentation: Label volume to split; it is only read (bounding-box crops and `.shape`).
+        output: Array that receives the new labels; only voxels of objects that were split are overwritten.
+        segmentation_table: Table with the columns `label_id`, `n_pixels`, `bb_min_{z,y,x}` and `bb_max_{z,y,x}`.
+        min_size: Objects with fewer pixels are not split, and split parts must be larger than this to be kept.
+        resolution: Voxel size as a single float or three values in (z, y, x) order; scales the bounding boxes.
+        height_map: Optional height map for the watershed. By default the inverted smoothed distance map is used.
+        component_labels: If given, only objects whose `component_labels` table entry is in this list are processed.
+        n_threads: Number of threads for processing the objects. By default `mp.cpu_count()` is used.
+    """
+    if isinstance(resolution, float):
+        resolution = [resolution] * 3
+    assert len(resolution) == 3
+    resolution = np.array(resolution)
+
+    lock = threading.Lock()
+    offset = len(segmentation_table)  # new ids start above this; TODO confirm label_id <= len(table)
+
+    def split_object(object_id):
+        nonlocal offset
+
+        row = segmentation_table[segmentation_table.label_id == object_id]
+        if row.n_pixels.values[0] < min_size:
+            # print(object_id, ": min-size")
+            return [object_id]
+
+        bb_min = np.array([
+            row.bb_min_z.values[0], row.bb_min_y.values[0], row.bb_min_x.values[0],
+        ]) / resolution
+        bb_max = np.array([
+            row.bb_max_z.values[0], row.bb_max_y.values[0], row.bb_max_x.values[0],
+        ]) / resolution
+
+        bb_min = np.maximum(bb_min.astype(int) - 1, np.array([0, 0, 0]))  # pad by one voxel, clip at 0
+        bb_max = np.minimum(bb_max.astype(int) + 1, np.array(list(segmentation.shape)))  # pad, clip at the shape
+        bb = tuple(slice(mi, ma) for mi, ma in zip(bb_min, bb_max))
+
+        # Skip objects with an extent above 500 voxels along any axis; this is due to segmentation artifacts.
+        bb_shape = bb_max - bb_min
+        if (bb_shape > 500).any():
+            print(object_id, "has a too large shape:", bb_shape)
+            return [object_id]
+
+        seg = segmentation[bb]
+        mask = ~find_boundaries(seg)  # True away from label boundaries
+        dist = distance_transform_edt(mask, sampling=resolution)  # distance to the nearest boundary voxel
+
+        seg_mask = seg == object_id
+        dist[~seg_mask] = 0
+        dist = gaussian(dist, (0.6, 1.2, 1.2))
+        maxima = peak_local_max(dist, min_distance=3, exclude_border=True)  # candidate centers of the parts
+
+        if len(maxima) == 1:
+            # print(object_id, ": max len")
+            return [object_id]
+
+        with lock:  # offset is shared between the worker threads
+            old_offset = offset
+            offset += len(maxima)  # reserve an id range for the seeds of this object
+
+        seeds = np.zeros(seg.shape, dtype=int)
+        for i, pos in enumerate(maxima, 1):
+            seeds[tuple(pos)] = old_offset + i
+
+        if height_map is None:
+            hmap = dist.max() - dist
+        else:
+            hmap = height_map[bb]
+        new_seg = watershed(hmap, markers=seeds, mask=seg_mask)
+
+        seg_ids, sizes = np.unique(new_seg, return_counts=True)
+        seg_ids, sizes = seg_ids[1:], sizes[1:]  # drop the first id, presumably background 0 - TODO confirm
+
+        keep_ids = seg_ids[sizes > min_size]
+        if len(keep_ids) < 2:
+            # print(object_id, ": keep-id")
+            return [object_id]
+
+        elif len(keep_ids) != len(seg_ids):
+            new_seg[~np.isin(new_seg, keep_ids)] = 0  # remove the parts that are too small
+            new_seg = watershed(hmap, markers=new_seg, mask=seg_mask)  # regrow the kept parts over the object
+
+        with lock:  # serialize the read-modify-write of the shared output
+            out = output[bb]
+            out[seg_mask] = new_seg[seg_mask]
+            output[bb] = out
+
+        # print(object_id, ":", len(keep_ids))
+        return keep_ids.tolist()
+
+        # import napari
+        # v = napari.Viewer()
+        # v.add_image(hmap)
+        # v.add_labels(seg)
+        # v.add_labels(new_seg)
+        # v.add_points(maxima)
+        # napari.run()
+
+    if component_labels is None:
+        object_ids = segmentation_table.label_id.values
+    else:
+        object_ids = segmentation_table[segmentation_table.component_labels.isin(component_labels)].label_id.values
+
+    if n_threads is None:
+        n_threads = mp.cpu_count()
+
+    # new_id_mapping = []
+    # for object_id in tqdm(object_ids, desc="Split non-convex objects"):
+    #     new_id_mapping.append(split_object(object_id))
+
+    with futures.ThreadPoolExecutor(n_threads) as tp:
+        new_id_mapping = list(
+            tqdm(tp.map(split_object, object_ids), total=len(object_ids), desc="Split non-convex objects")
+        )
+
+    new_id_mapping = {object_id: mapped_ids for object_id, mapped_ids in zip(object_ids, new_id_mapping)}
+    return new_id_mapping