From e0c63247d6a387580b84fa3e1329ade2e6cf4e92 Mon Sep 17 00:00:00 2001
From: Blampey Quentin <quentin.blampey@centralesupelec.fr>
Date: Mon, 2 Dec 2024 08:52:55 +0100
Subject: [PATCH 1/4] import read_text from anndata.io instead of anndata

---
 src/spatialdata_io/readers/_utils/_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/spatialdata_io/readers/_utils/_utils.py b/src/spatialdata_io/readers/_utils/_utils.py
index da0ac9e3..d7b831ff 100644
--- a/src/spatialdata_io/readers/_utils/_utils.py
+++ b/src/spatialdata_io/readers/_utils/_utils.py
@@ -5,7 +5,8 @@
 from pathlib import Path
 from typing import Any, Union
 
-from anndata import AnnData, read_text
+from anndata import AnnData
+from anndata.io import read_text
 from h5py import File
 
 from spatialdata_io.readers._utils._read_10x_h5 import _read_10x_h5

From 515ace0db461fdcb01570b075672ab0432b5cd22 Mon Sep 17 00:00:00 2001
From: Blampey Quentin <quentin.blampey@centralesupelec.fr>
Date: Mon, 2 Dec 2024 08:57:29 +0100
Subject: [PATCH 2/4] Revert "import read_text from anndata.io instead of
 anndata"

This reverts commit e0c63247d6a387580b84fa3e1329ade2e6cf4e92.
---
 src/spatialdata_io/readers/_utils/_utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/spatialdata_io/readers/_utils/_utils.py b/src/spatialdata_io/readers/_utils/_utils.py
index d7b831ff..da0ac9e3 100644
--- a/src/spatialdata_io/readers/_utils/_utils.py
+++ b/src/spatialdata_io/readers/_utils/_utils.py
@@ -5,8 +5,7 @@
 from pathlib import Path
 from typing import Any, Union
 
-from anndata import AnnData
-from anndata.io import read_text
+from anndata import AnnData, read_text
 from h5py import File
 
 from spatialdata_io.readers._utils._read_10x_h5 import _read_10x_h5

From 5612ff4963bc7f817a8def999d605c92c696c51e Mon Sep 17 00:00:00 2001
From: Blampey Quentin <quentin.blampey@centralesupelec.fr>
Date: Tue, 18 Mar 2025 13:57:55 +0100
Subject: [PATCH 3/4] draft cosmx reader with stitching

---
 src/spatialdata_io/readers/cosmx.py | 509 ++++++++++++++--------------
 1 file changed, 247 insertions(+), 262 deletions(-)

diff --git a/src/spatialdata_io/readers/cosmx.py b/src/spatialdata_io/readers/cosmx.py
index 8f4ec38f..ab669017 100644
--- a/src/spatialdata_io/readers/cosmx.py
+++ b/src/spatialdata_io/readers/cosmx.py
@@ -1,298 +1,283 @@
 from __future__ import annotations
 
-import os
 import re
 from collections.abc import Mapping
 from pathlib import Path
 from types import MappingProxyType
-from typing import Any
+from typing import Any, Optional
 
 import dask.array as da
 import numpy as np
 import pandas as pd
-import pyarrow as pa
-from anndata import AnnData
-from dask.dataframe import DataFrame as DaskDataFrame
+import tifffile
+import xarray as xr
 from dask_image.imread import imread
-from scipy.sparse import csr_matrix
-
-# from spatialdata._core.core_utils import xy_cs
-from skimage.transform import estimate_transform
 from spatialdata import SpatialData
 from spatialdata._logging import logger
-from spatialdata.models import Image2DModel, Labels2DModel, PointsModel, TableModel
-
-# from spatialdata._core.ngff.ngff_coordinate_system import NgffAxis  # , CoordinateSystem
-from spatialdata.transformations.transformations import Affine, Identity
-
-from spatialdata_io._constants._constants import CosmxKeys
-from spatialdata_io._docs import inject_docs
-
-__all__ = ["cosmx"]
-
-# x_axis = NgffAxis(name="x", type="space", unit="discrete")
-# y_axis = NgffAxis(name="y", type="space", unit="discrete")
-# c_axis = NgffAxis(name="c", type="channel", unit="index")
+from spatialdata.models import Image2DModel, PointsModel
 
 
-@inject_docs(cx=CosmxKeys)
 def cosmx(
     path: str | Path,
-    dataset_id: str | None = None,
-    transcripts: bool = True,
+    dataset_id: Optional[str] = None,
+    fov: int | str | None = None,
+    read_proteins: bool = False,
     imread_kwargs: Mapping[str, Any] = MappingProxyType({}),
     image_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
 ) -> SpatialData:
     """
-    Read *Cosmx Nanostring* data.
+    Read *Cosmx Nanostring* data. The fields of view are stitched together, except if `fov` is provided.
 
     This function reads the following files:
-
-        - ``<dataset_id>_`{cx.COUNTS_SUFFIX!r}```: Counts matrix.
-        - ``<dataset_id>_`{cx.METADATA_SUFFIX!r}```: Metadata file.
-        - ``<dataset_id>_`{cx.FOV_SUFFIX!r}```: Field of view file.
-        - ``{cx.IMAGES_DIR!r}``: Directory containing the images.
-        - ``{cx.LABELS_DIR!r}``: Directory containing the labels.
-
-    .. seealso::
-
-        - `Nanostring Spatial Molecular Imager <https://nanostring.com/products/cosmx-spatial-molecular-imager/>`_.
-
-    Parameters
-    ----------
-    path
-        Path to the root directory containing *Nanostring* files.
-    dataset_id
-        Name of the dataset.
-    transcripts
-        Whether to also read in transcripts information.
-    imread_kwargs
-        Keyword arguments passed to :func:`dask_image.imread.imread`.
-    image_models_kwargs
-        Keyword arguments passed to :class:`spatialdata.models.Image2DModel`.
-
-    Returns
-    -------
-    :class:`spatialdata.SpatialData`
+        - `*_fov_positions_file.csv` or `*_fov_positions_file.csv.gz`: FOV locations
+        - `Morphology2D` directory: all the FOVs morphology images
+        - `*_tx_file.csv.gz` or `*_tx_file.csv`: Transcripts location and names
+        - If `read_proteins` is `True`, all the images under the nested `ProteinImages` directories will be read
+
+        These files must be exported as flat files in AtomX. That is: within a study, click on "Export" and then select files from the "Flat CSV Files" section (transcripts flat and FOV position flat).
+
+    Args:
+        path: Path to the root directory containing *Nanostring* files.
+        dataset_id: Optional name of the dataset (needs to be provided if not infered).
+        fov: Name or number of one single field of view to be read. If a string is provided, an example of correct syntax is "F008". By default, reads all FOVs.
+        read_proteins: Whether to read the proteins or the transcripts.
+        image_models_kwargs: Keyword arguments passed to `spatialdata.models.Image2DModel`.
+        imread_kwargs: Keyword arguments passed to `dask_image.imread.imread`.
+
+    Returns:
+        A `SpatialData` object representing the CosMX experiment
     """
     path = Path(path)
 
-    # tries to infer dataset_id from the name of the counts file
-    if dataset_id is None:
-        counts_files = [f for f in os.listdir(path) if str(f).endswith(CosmxKeys.COUNTS_SUFFIX)]
-        if len(counts_files) == 1:
-            found = re.match(rf"(.*)_{CosmxKeys.COUNTS_SUFFIX}", counts_files[0])
-            if found:
-                dataset_id = found.group(1)
-    if dataset_id is None:
-        raise ValueError("Could not infer `dataset_id` from the name of the counts file. Please specify it manually.")
-
-    # check for file existence
-    counts_file = path / f"{dataset_id}_{CosmxKeys.COUNTS_SUFFIX}"
-    if not counts_file.exists():
-        raise FileNotFoundError(f"Counts file not found: {counts_file}.")
-    if transcripts:
-        transcripts_file = path / f"{dataset_id}_{CosmxKeys.TRANSCRIPTS_SUFFIX}"
-        if not transcripts_file.exists():
-            raise FileNotFoundError(f"Transcripts file not found: {transcripts_file}.")
+    dataset_id = _infer_dataset_id(path, dataset_id)
+    fov_locs = _read_fov_locs(path, dataset_id)
+    fov_id, fov = _check_fov_id(fov)
+
+    protein_dir_dict = {}
+    if read_proteins:
+        protein_dir_dict = {
+            int(protein_dir.parent.name[3:]): protein_dir for protein_dir in list(path.rglob("**/FOV*/ProteinImages"))
+        }
+        assert len(protein_dir_dict), f"No directory called 'ProteinImages' was found under {path}"
+
+    ### Read image(s)
+    images_dir = _find_dir(path, "Morphology2D")
+    morphology_coords = _cosmx_morphology_coords(images_dir)
+
+    if fov is None:
+        image, c_coords = _read_stitched_image(
+            images_dir,
+            fov_locs,
+            protein_dir_dict,
+            morphology_coords,
+            **imread_kwargs,
+        )
+        image_name = "stitched_image"
     else:
-        transcripts_file = None
-    meta_file = path / f"{dataset_id}_{CosmxKeys.METADATA_SUFFIX}"
-    if not meta_file.exists():
-        raise FileNotFoundError(f"Metadata file not found: {meta_file}.")
-    fov_file = path / f"{dataset_id}_{CosmxKeys.FOV_SUFFIX}"
-    if not fov_file.exists():
-        raise FileNotFoundError(f"Found field of view file: {fov_file}.")
-    images_dir = path / CosmxKeys.IMAGES_DIR
-    if not images_dir.exists():
-        raise FileNotFoundError(f"Images directory not found: {images_dir}.")
-    labels_dir = path / CosmxKeys.LABELS_DIR
-    if not labels_dir.exists():
-        raise FileNotFoundError(f"Labels directory not found: {labels_dir}.")
-
-    counts = pd.read_csv(path / counts_file, header=0, index_col=CosmxKeys.INSTANCE_KEY)
-    counts.index = counts.index.astype(str).str.cat(counts.pop(CosmxKeys.FOV).astype(str).values, sep="_")
-
-    obs = pd.read_csv(path / meta_file, header=0, index_col=CosmxKeys.INSTANCE_KEY)
-    obs[CosmxKeys.FOV] = pd.Categorical(obs[CosmxKeys.FOV].astype(str))
-    obs[CosmxKeys.REGION_KEY] = pd.Categorical(obs[CosmxKeys.FOV].astype(str).apply(lambda s: s + "_labels"))
-    obs[CosmxKeys.INSTANCE_KEY] = obs.index.astype(np.int64)
-    obs.rename_axis(None, inplace=True)
-    obs.index = obs.index.astype(str).str.cat(obs[CosmxKeys.FOV].values, sep="_")
-
-    common_index = obs.index.intersection(counts.index)
-
-    adata = AnnData(
-        csr_matrix(counts.loc[common_index, :].values),
-        dtype=counts.values.dtype,
-        obs=obs.loc[common_index, :],
+        pattern = f"*{fov_id}.TIF"
+        fov_files = list(images_dir.rglob(pattern))
+
+        assert len(fov_files), f"No file matches the pattern {pattern} inside {images_dir}"
+        assert len(fov_files) == 1, f"Multiple files match the pattern {pattern}: {', '.join(fov_files)}"
+
+        image, c_coords = _read_fov_image(fov_files[0], protein_dir_dict.get(fov), morphology_coords, **imread_kwargs)
+        image_name = f"{fov}_image"
+
+    parsed_image = Image2DModel.parse(image, dims=("c", "y", "x"), c_coords=c_coords, **image_models_kwargs)
+
+    if read_proteins:
+        return SpatialData(images={image_name: parsed_image})
+
+    ### Read transcripts
+    transcripts_data = _read_transcripts_csv(path, dataset_id)
+
+    if fov is None:
+        transcripts_data["x"] = transcripts_data["x_global_px"] - fov_locs["xmin"].min()
+        transcripts_data["y"] = transcripts_data["y_global_px"] - fov_locs["ymin"].min()
+        coordinates = None
+        points_name = "points"
+    else:
+        transcripts_data = transcripts_data[transcripts_data["fov"] == fov]
+        coordinates = {"x": "x_local_px", "y": "y_local_px"}
+        points_name = f"{fov}_points"
+
+    from spatialdata_io._constants._constants import CosmxKeys
+
+    transcripts = PointsModel.parse(
+        transcripts_data,
+        coordinates=coordinates,
+        feature_key=CosmxKeys.TARGET_OF_TRANSCRIPT,
     )
-    adata.var_names = counts.columns
 
-    table = TableModel.parse(
-        adata,
-        region=list(set(adata.obs[CosmxKeys.REGION_KEY].astype(str).tolist())),
-        region_key=CosmxKeys.REGION_KEY.value,
-        instance_key=CosmxKeys.INSTANCE_KEY.value,
+    return SpatialData(
+        images={image_name: parsed_image},
+        points={points_name: transcripts},
     )
 
-    fovs_counts = list(map(str, adata.obs.fov.astype(int).unique()))
 
-    affine_transforms_to_global = {}
+def _infer_dataset_id(path: Path, dataset_id: str | None) -> str:
+    if isinstance(dataset_id, str):
+        return dataset_id
 
-    for fov in fovs_counts:
-        idx = table.obs.fov.astype(str) == fov
-        loc = table[idx, :].obs[[CosmxKeys.X_LOCAL_CELL, CosmxKeys.Y_LOCAL_CELL]].values
-        glob = table[idx, :].obs[[CosmxKeys.X_GLOBAL_CELL, CosmxKeys.Y_GLOBAL_CELL]].values
-        out = estimate_transform(ttype="affine", src=loc, dst=glob)
-        affine_transforms_to_global[fov] = Affine(
-            # out.params, input_coordinate_system=input_cs, output_coordinate_system=output_cs
-            out.params,
-            input_axes=("x", "y"),
-            output_axes=("x", "y"),
-        )
+    for suffix in [".csv", ".csv.gz"]:
+        counts_files = list(path.rglob(f"[!\\.]*_fov_positions_file{suffix}"))
 
-    table.obsm["global"] = table.obs[[CosmxKeys.X_GLOBAL_CELL, CosmxKeys.Y_GLOBAL_CELL]].to_numpy()
-    table.obsm["spatial"] = table.obs[[CosmxKeys.X_LOCAL_CELL, CosmxKeys.Y_LOCAL_CELL]].to_numpy()
-    table.obs.drop(
-        columns=[CosmxKeys.X_LOCAL_CELL, CosmxKeys.Y_LOCAL_CELL, CosmxKeys.X_GLOBAL_CELL, CosmxKeys.Y_GLOBAL_CELL],
-        inplace=True,
-    )
+        if len(counts_files) == 1:
+            found = re.match(rf"(.*)_fov_positions_file{suffix}", counts_files[0].name)
+            if found:
+                return found.group(1)
 
-    # prepare to read images and labels
-    file_extensions = (".jpg", ".png", ".jpeg", ".tif", ".tiff")
-    pat = re.compile(r".*_F(\d+)")
-
-    # check if fovs are correct for images and labels
-    fovs_images = []
-    for fname in os.listdir(path / CosmxKeys.IMAGES_DIR):
-        if fname.endswith(file_extensions):
-            fovs_images.append(str(int(pat.findall(fname)[0])))
-
-    fovs_labels = []
-    for fname in os.listdir(path / CosmxKeys.LABELS_DIR):
-        if fname.endswith(file_extensions):
-            fovs_labels.append(str(int(pat.findall(fname)[0])))
-
-    fovs_images_and_labels = set(fovs_images).intersection(set(fovs_labels))
-    fovs_diff = fovs_images_and_labels.difference(set(fovs_counts))
-    if len(fovs_diff):
-        raise logger.warning(
-            f"Found images and labels for {len(fovs_images)} FOVs, but only {len(fovs_counts)} FOVs in the counts file.\n"
-            + f"The following FOVs are missing: {fovs_diff} \n"
-            + "... will use only fovs in Table."
-        )
+    raise ValueError("Could not infer `dataset_id` from the name of the transcript file. Please specify it manually.")
+
+
+def _read_fov_image(
+    morphology_path: Path, protein_path: Path | None, morphology_coords: list[str], **imread_kwargs
+) -> tuple[da.Array, list[str]]:
+    image = imread(morphology_path, **imread_kwargs)
+
+    protein_names = []
+    if protein_path is not None:
+        protein_image, protein_names = _read_protein_fov(protein_path)
+        image = da.concatenate([image, protein_image], axis=0)
+
+    return image, morphology_coords + protein_names
+
+
+def _read_fov_locs(path: Path, dataset_id: str) -> pd.DataFrame:
+    fov_file = path / f"{dataset_id}_fov_positions_file.csv"
+
+    if not fov_file.exists():
+        fov_file = path / f"{dataset_id}_fov_positions_file.csv.gz"
+
+    assert fov_file.exists(), f"Missing field of view file: {fov_file}"
+
+    fov_locs = pd.read_csv(fov_file)
+    fov_locs["xmax"] = 0.0  # will be filled when reading the images
+    fov_locs["ymin"] = 0.0  # will be filled when reading the images
+
+    fov_key, x_key, y_key, scale_factor = "fov", "x_global_px", "y_global_px", 1
+
+    if not np.isin([fov_key, x_key, y_key], fov_locs.columns).all():  # try different column names
+        fov_key, x_key, y_key = "FOV", "X_mm", "Y_mm"
+        scale_factor = 1e3 / 0.120280945  # CosMX milimeters to pixels
+
+        assert np.isin(
+            [fov_key, x_key, y_key], fov_locs.columns
+        ).all(), f"The file {fov_file} must contain the following columns: {fov_key}, {x_key}, {y_key}. Consider using a different export module."
+
+    fov_locs.index = fov_locs[fov_key]
+    fov_locs["xmin"] = fov_locs[x_key] * scale_factor
+    fov_locs["ymax"] = fov_locs[y_key] * scale_factor
+
+    return fov_locs
+
+
+def _read_stitched_image(
+    images_dir: Path,
+    fov_locs: pd.DataFrame,
+    protein_dir_dict: dict,
+    morphology_coords: list[str],
+    **imread_kwargs,
+) -> tuple[da.Array, list[str] | None]:
+    fov_images = {}
+    c_coords_dict = {}
+    pattern = re.compile(r".*_F(\d+)")
+    for image_path in images_dir.iterdir():
+        if image_path.suffix == ".TIF":
+            fov = int(pattern.findall(image_path.name)[0])
+
+            image, c_coords = _read_fov_image(image_path, protein_dir_dict.get(fov), morphology_coords, **imread_kwargs)
+
+            c_coords_dict[fov] = c_coords
+
+            fov_images[fov] = da.flip(image, axis=1)
+
+            fov_locs.loc[fov, "xmax"] = fov_locs.loc[fov, "xmin"] + image.shape[2]
+            fov_locs.loc[fov, "ymin"] = fov_locs.loc[fov, "ymax"] - image.shape[1]
+
+    for dim in ["x", "y"]:
+        shift = fov_locs[f"{dim}min"].min()
+        fov_locs[f"{dim}0"] = (fov_locs[f"{dim}min"] - shift).round().astype(int)
+        fov_locs[f"{dim}1"] = (fov_locs[f"{dim}max"] - shift).round().astype(int)
+
+    c_coords = list(set.union(*[set(names) for names in c_coords_dict.values()]))
+
+    stitched_image = da.zeros(shape=(len(c_coords), fov_locs["y1"].max(), fov_locs["x1"].max()), dtype=image.dtype)
+    stitched_image = xr.DataArray(stitched_image, dims=("c", "y", "x"), coords={"c": c_coords})
+
+    for fov, im in fov_images.items():
+        xmin, xmax = fov_locs.loc[fov, "x0"], fov_locs.loc[fov, "x1"]
+        ymin, ymax = fov_locs.loc[fov, "y0"], fov_locs.loc[fov, "y1"]
+        stitched_image.loc[{"c": c_coords_dict[fov], "y": slice(ymin, ymax), "x": slice(xmin, xmax)}] = im
+
+        if len(c_coords_dict[fov]) < len(c_coords):
+            logger.warning(f"Missing channels ({len(c_coords) - len(c_coords_dict[fov])}) for FOV {fov}")
+
+    return stitched_image.data, c_coords
+
+
+def _check_fov_id(fov: str | int | None) -> tuple[str, int]:
+    if fov is None:
+        return None, None
+
+    if isinstance(fov, int):
+        return f"F{fov:0>3}", fov
+
+    assert (
+        fov[0] == "F" and len(fov) == 4 and all(c.isdigit() for c in fov[1:])
+    ), f"'fov' needs to start with a F followed by three digits. Found '{fov}'."
+
+    return fov, int(fov[1:])
+
+
+def _read_transcripts_csv(path: Path, dataset_id: str) -> pd.DataFrame:
+    transcripts_file = path / f"{dataset_id}_tx_file.csv.gz"
+
+    if transcripts_file.exists():
+        df = pd.read_csv(transcripts_file, compression="gzip")
+    else:
+        transcripts_file = path / f"{dataset_id}_tx_file.csv"
+        assert transcripts_file.exists(), f"Transcript file {transcripts_file} not found."
+        df = pd.read_csv(transcripts_file)
+
+    TRANSCRIPT_COLUMNS = ["x_global_px", "y_global_px", "target"]
+    assert np.isin(
+        TRANSCRIPT_COLUMNS, df.columns
+    ).all(), f"The file {transcripts_file} must contain the following columns: {', '.join(TRANSCRIPT_COLUMNS)}. Consider using a different export module."
+
+    return df
+
+
+def _find_dir(path: Path, name: str):
+    if (path / name).is_dir():
+        return path / name
+
+    paths = list(path.rglob(f"**/{name}"))
+    assert len(paths) == 1, f"Found {len(paths)} path(s) with name {name} inside {path}"
+
+    return paths[0]
+
+
+def _cosmx_morphology_coords(images_dir: Path) -> list[str]:
+    images_paths = list(images_dir.glob("*.TIF"))
+    assert len(images_paths) > 0, f"Expected to find images inside {images_dir}"
+
+    with tifffile.TiffFile(images_paths[0]) as tif:
+        description = tif.pages[0].description
+        substrings = re.findall(r'"BiologicalTarget": "(.*?)",', description)
+        return substrings
+
+
+def _get_cosmx_protein_name(image_path: Path) -> str:
+    with tifffile.TiffFile(image_path) as tif:
+        description = tif.pages[0].description
+        substrings = re.findall(r'"DisplayName": "(.*?)",', description)
+        return substrings[0]
+
+
+def _read_protein_fov(protein_dir: Path) -> tuple[da.Array, list[str]]:
+    images_paths = list(protein_dir.rglob("*.TIF"))
+    protein_image = da.concatenate([imread(image_path) for image_path in images_paths], axis=0)
+    channel_names = [_get_cosmx_protein_name(image_path) for image_path in images_paths]
 
-    # read images
-    images = {}
-    for fname in os.listdir(path / CosmxKeys.IMAGES_DIR):
-        if fname.endswith(file_extensions):
-            fov = str(int(pat.findall(fname)[0]))
-            if fov in fovs_counts:
-                aff = affine_transforms_to_global[fov]
-                im = imread(path / CosmxKeys.IMAGES_DIR / fname, **imread_kwargs).squeeze()
-                flipped_im = da.flip(im, axis=0)
-                parsed_im = Image2DModel.parse(
-                    flipped_im,
-                    transformations={
-                        fov: Identity(),
-                        "global": aff,
-                        "global_only_image": aff,
-                    },
-                    dims=("y", "x", "c"),
-                    rgb=None,
-                    **image_models_kwargs,
-                )
-                images[f"{fov}_image"] = parsed_im
-            else:
-                logger.warning(f"FOV {fov} not found in counts file. Skipping image {fname}.")
-
-    # read labels
-    labels = {}
-    for fname in os.listdir(path / CosmxKeys.LABELS_DIR):
-        if fname.endswith(file_extensions):
-            fov = str(int(pat.findall(fname)[0]))
-            if fov in fovs_counts:
-                aff = affine_transforms_to_global[fov]
-                la = imread(path / CosmxKeys.LABELS_DIR / fname, **imread_kwargs).squeeze()
-                flipped_la = da.flip(la, axis=0)
-                parsed_la = Labels2DModel.parse(
-                    flipped_la,
-                    transformations={
-                        fov: Identity(),
-                        "global": aff,
-                        "global_only_labels": aff,
-                    },
-                    dims=("y", "x"),
-                    **image_models_kwargs,
-                )
-                labels[f"{fov}_labels"] = parsed_la
-            else:
-                logger.warning(f"FOV {fov} not found in counts file. Skipping labels {fname}.")
-
-    points: dict[str, DaskDataFrame] = {}
-    if transcripts:
-        # assert transcripts_file is not None
-        # from pyarrow.csv import read_csv
-        #
-        # ptable = read_csv(path / transcripts_file)  # , header=0)
-        # for fov in fovs_counts:
-        #     aff = affine_transforms_to_global[fov]
-        #     sub_table = ptable.filter(pa.compute.equal(ptable.column(CosmxKeys.FOV), int(fov))).to_pandas()
-        #     sub_table[CosmxKeys.INSTANCE_KEY] = sub_table[CosmxKeys.INSTANCE_KEY].astype("category")
-        #     # we rename z because we want to treat the data as 2d
-        #     sub_table.rename(columns={"z": "z_raw"}, inplace=True)
-        #     points[fov] = PointsModel.parse(
-        #         sub_table,
-        #         coordinates={"x": CosmxKeys.X_LOCAL_TRANSCRIPT, "y": CosmxKeys.Y_LOCAL_TRANSCRIPT},
-        #         feature_key=CosmxKeys.TARGET_OF_TRANSCRIPT,
-        #         instance_key=CosmxKeys.INSTANCE_KEY,
-        #         transformations={
-        #             fov: Identity(),
-        #             "global": aff,
-        #             "global_only_labels": aff,
-        #         },
-        #     )
-        # let's convert the .csv to .parquet and let's read it with pyarrow.parquet for faster subsetting
-        import tempfile
-
-        import pyarrow.parquet as pq
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            print("converting .csv to .parquet to improve the speed of the slicing operations... ", end="")
-            assert transcripts_file is not None
-            transcripts_data = pd.read_csv(path / transcripts_file, header=0)
-            transcripts_data.to_parquet(Path(tmpdir) / "transcripts.parquet")
-            print("done")
-
-            ptable = pq.read_table(Path(tmpdir) / "transcripts.parquet")
-            for fov in fovs_counts:
-                aff = affine_transforms_to_global[fov]
-                sub_table = ptable.filter(pa.compute.equal(ptable.column(CosmxKeys.FOV), int(fov))).to_pandas()
-                sub_table[CosmxKeys.INSTANCE_KEY] = sub_table[CosmxKeys.INSTANCE_KEY].astype("category")
-                # we rename z because we want to treat the data as 2d
-                sub_table.rename(columns={"z": "z_raw"}, inplace=True)
-                if len(sub_table) > 0:
-                    points[f"{fov}_points"] = PointsModel.parse(
-                        sub_table,
-                        coordinates={"x": CosmxKeys.X_LOCAL_TRANSCRIPT, "y": CosmxKeys.Y_LOCAL_TRANSCRIPT},
-                        feature_key=CosmxKeys.TARGET_OF_TRANSCRIPT,
-                        instance_key=CosmxKeys.INSTANCE_KEY,
-                        transformations={
-                            fov: Identity(),
-                            "global": aff,
-                            "global_only_labels": aff,
-                        },
-                    )
-
-    # TODO: what to do with fov file?
-    # if fov_file is not None:
-    #     fov_positions = pd.read_csv(path / fov_file, header=0, index_col=CosmxKeys.FOV)
-    #     for fov, row in fov_positions.iterrows():
-    #         try:
-    #             adata.uns["spatial"][str(fov)]["metadata"] = row.to_dict()
-    #         except KeyError:
-    #             logg.warning(f"FOV `{str(fov)}` does not exist, skipping it.")
-    #             continue
-
-    return SpatialData(images=images, labels=labels, points=points, table=table)
+    return protein_image, channel_names

From 1a685340acecdcc46af70bc0c093d2296e8a8efe Mon Sep 17 00:00:00 2001
From: Quentin Blampey <quentin.blampey@gmail.com>
Date: Fri, 11 Apr 2025 17:19:59 +0200
Subject: [PATCH 4/4] use sopa reader that fixes sopa issues 180, 227, 231

---
 src/spatialdata_io/readers/cosmx.py | 176 ++++++++++++++++++++--------
 1 file changed, 125 insertions(+), 51 deletions(-)

diff --git a/src/spatialdata_io/readers/cosmx.py b/src/spatialdata_io/readers/cosmx.py
index ab669017..307414a8 100644
--- a/src/spatialdata_io/readers/cosmx.py
+++ b/src/spatialdata_io/readers/cosmx.py
@@ -4,7 +4,7 @@
 from collections.abc import Mapping
 from pathlib import Path
 from types import MappingProxyType
-from typing import Any, Optional
+from typing import Any
 
 import dask.array as da
 import numpy as np
@@ -19,11 +19,12 @@
 
 def cosmx(
     path: str | Path,
-    dataset_id: Optional[str] = None,
-    fov: int | str | None = None,
+    dataset_id: str | None = None,
+    fov: int | None = None,
     read_proteins: bool = False,
     imread_kwargs: Mapping[str, Any] = MappingProxyType({}),
     image_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
+    flip_image: bool | None = None,
 ) -> SpatialData:
     """
     Read *Cosmx Nanostring* data. The fields of view are stitched together, except if `fov` is provided.
@@ -38,11 +39,12 @@ def cosmx(
 
     Args:
         path: Path to the root directory containing *Nanostring* files.
-        dataset_id: Optional name of the dataset (needs to be provided if not infered).
-        fov: Name or number of one single field of view to be read. If a string is provided, an example of correct syntax is "F008". By default, reads all FOVs.
+        dataset_id: Optional name of the dataset (needs to be provided if not inferred).
+        fov: Number of one single field of view to be read. If not provided, reads all FOVs and create a stitched image.
         read_proteins: Whether to read the proteins or the transcripts.
         image_models_kwargs: Keyword arguments passed to `spatialdata.models.Image2DModel`.
         imread_kwargs: Keyword arguments passed to `dask_image.imread.imread`.
+        flip_image: For some buggy AtomX exports, `flip_image=True` has to be used for stitching. By default, the value is inferred based on the transcript file. See [this](https://github.com/gustaveroussy/sopa/issues/231) issue.
 
     Returns:
         A `SpatialData` object representing the CosMX experiment
@@ -51,14 +53,19 @@ def cosmx(
 
     dataset_id = _infer_dataset_id(path, dataset_id)
     fov_locs = _read_fov_locs(path, dataset_id)
-    fov_id, fov = _check_fov_id(fov)
 
     protein_dir_dict = {}
     if read_proteins:
         protein_dir_dict = {
-            int(protein_dir.parent.name[3:]): protein_dir for protein_dir in list(path.rglob("**/FOV*/ProteinImages"))
+            int(protein_dir.parent.name[3:]): protein_dir
+            for protein_dir in list(path.rglob("**/FOV*/ProteinImages"))
         }
-        assert len(protein_dir_dict), f"No directory called 'ProteinImages' was found under {path}"
+        assert len(protein_dir_dict), (
+            f"No directory called 'ProteinImages' was found under {path}"
+        )
+
+    if flip_image is None and not read_proteins:
+        flip_image = _infer_flip_image(path, dataset_id)
 
     ### Read image(s)
     images_dir = _find_dir(path, "Morphology2D")
@@ -70,20 +77,22 @@ def cosmx(
             fov_locs,
             protein_dir_dict,
             morphology_coords,
+            flip_image,
             **imread_kwargs,
         )
         image_name = "stitched_image"
     else:
-        pattern = f"*{fov_id}.TIF"
-        fov_files = list(images_dir.rglob(pattern))
+        logger.info(f"Reading single FOV ({fov}), the image will not be stitched")
+        fov_file = _find_matching_fov_file(images_dir, fov)
 
-        assert len(fov_files), f"No file matches the pattern {pattern} inside {images_dir}"
-        assert len(fov_files) == 1, f"Multiple files match the pattern {pattern}: {', '.join(fov_files)}"
-
-        image, c_coords = _read_fov_image(fov_files[0], protein_dir_dict.get(fov), morphology_coords, **imread_kwargs)
-        image_name = f"{fov}_image"
+        image, c_coords = _read_fov_image(
+            fov_file, protein_dir_dict.get(fov), morphology_coords, **imread_kwargs
+        )
+        image_name = f"F{fov:0>5}_image"
 
-    parsed_image = Image2DModel.parse(image, dims=("c", "y", "x"), c_coords=c_coords, **image_models_kwargs)
+    parsed_image = Image2DModel.parse(
+        image, dims=("c", "y", "x"), c_coords=c_coords, **image_models_kwargs
+    )
 
     if read_proteins:
         return SpatialData(images={image_name: parsed_image})
@@ -99,7 +108,7 @@ def cosmx(
     else:
         transcripts_data = transcripts_data[transcripts_data["fov"] == fov]
         coordinates = {"x": "x_local_px", "y": "y_local_px"}
-        points_name = f"{fov}_points"
+        points_name = f"F{fov:0>5}_points"
 
     from spatialdata_io._constants._constants import CosmxKeys
 
@@ -110,8 +119,7 @@ def cosmx(
     )
 
     return SpatialData(
-        images={image_name: parsed_image},
-        points={points_name: transcripts},
+        images={image_name: parsed_image}, points={points_name: transcripts}
     )
 
 
@@ -127,11 +135,16 @@ def _infer_dataset_id(path: Path, dataset_id: str | None) -> str:
             if found:
                 return found.group(1)
 
-    raise ValueError("Could not infer `dataset_id` from the name of the transcript file. Please specify it manually.")
+    raise ValueError(
+        "Could not infer `dataset_id` from the name of the transcript file. Please specify it manually."
+    )
 
 
 def _read_fov_image(
-    morphology_path: Path, protein_path: Path | None, morphology_coords: list[str], **imread_kwargs
+    morphology_path: Path,
+    protein_path: Path | None,
+    morphology_coords: list[str],
+    **imread_kwargs,
 ) -> tuple[da.Array, list[str]]:
     image = imread(morphology_path, **imread_kwargs)
 
@@ -153,21 +166,23 @@ def _read_fov_locs(path: Path, dataset_id: str) -> pd.DataFrame:
 
     fov_locs = pd.read_csv(fov_file)
     fov_locs["xmax"] = 0.0  # will be filled when reading the images
-    fov_locs["ymin"] = 0.0  # will be filled when reading the images
+    fov_locs["ymax"] = 0.0  # will be filled when reading the images
 
     fov_key, x_key, y_key, scale_factor = "fov", "x_global_px", "y_global_px", 1
 
-    if not np.isin([fov_key, x_key, y_key], fov_locs.columns).all():  # try different column names
+    if not np.isin(
+        [fov_key, x_key, y_key], fov_locs.columns
+    ).all():  # try different column names
         fov_key, x_key, y_key = "FOV", "X_mm", "Y_mm"
         scale_factor = 1e3 / 0.120280945  # CosMX milimeters to pixels
 
-        assert np.isin(
-            [fov_key, x_key, y_key], fov_locs.columns
-        ).all(), f"The file {fov_file} must contain the following columns: {fov_key}, {x_key}, {y_key}. Consider using a different export module."
+        assert np.isin([fov_key, x_key, y_key], fov_locs.columns).all(), (
+            f"The file {fov_file} must contain the following columns: {fov_key}, {x_key}, {y_key}. Consider using a different export module."
+        )
 
     fov_locs.index = fov_locs[fov_key]
     fov_locs["xmin"] = fov_locs[x_key] * scale_factor
-    fov_locs["ymax"] = fov_locs[y_key] * scale_factor
+    fov_locs["ymin"] = fov_locs[y_key] * scale_factor
 
     return fov_locs
 
@@ -177,6 +192,7 @@ def _read_stitched_image(
     fov_locs: pd.DataFrame,
     protein_dir_dict: dict,
     morphology_coords: list[str],
+    flip_image: int,
     **imread_kwargs,
 ) -> tuple[da.Array, list[str] | None]:
     fov_images = {}
@@ -186,14 +202,24 @@ def _read_stitched_image(
         if image_path.suffix == ".TIF":
             fov = int(pattern.findall(image_path.name)[0])
 
-            image, c_coords = _read_fov_image(image_path, protein_dir_dict.get(fov), morphology_coords, **imread_kwargs)
+            image, c_coords = _read_fov_image(
+                image_path,
+                protein_dir_dict.get(fov),
+                morphology_coords,
+                **imread_kwargs,
+            )
 
             c_coords_dict[fov] = c_coords
 
             fov_images[fov] = da.flip(image, axis=1)
 
             fov_locs.loc[fov, "xmax"] = fov_locs.loc[fov, "xmin"] + image.shape[2]
-            fov_locs.loc[fov, "ymin"] = fov_locs.loc[fov, "ymax"] - image.shape[1]
+
+            if flip_image:
+                fov_locs.loc[fov, "ymax"] = fov_locs.loc[fov, "ymin"]
+                fov_locs.loc[fov, "ymin"] = fov_locs.loc[fov, "ymax"] - image.shape[1]
+            else:
+                fov_locs.loc[fov, "ymax"] = fov_locs.loc[fov, "ymin"] + image.shape[1]
 
     for dim in ["x", "y"]:
         shift = fov_locs[f"{dim}min"].min()
@@ -202,48 +228,70 @@ def _read_stitched_image(
 
     c_coords = list(set.union(*[set(names) for names in c_coords_dict.values()]))
 
-    stitched_image = da.zeros(shape=(len(c_coords), fov_locs["y1"].max(), fov_locs["x1"].max()), dtype=image.dtype)
-    stitched_image = xr.DataArray(stitched_image, dims=("c", "y", "x"), coords={"c": c_coords})
+    height, width = fov_locs["y1"].max(), fov_locs["x1"].max()
+    stitched_image = da.zeros(shape=(len(c_coords), height, width), dtype=image.dtype)
+    stitched_image = xr.DataArray(
+        stitched_image, dims=("c", "y", "x"), coords={"c": c_coords}
+    )
 
     for fov, im in fov_images.items():
         xmin, xmax = fov_locs.loc[fov, "x0"], fov_locs.loc[fov, "x1"]
         ymin, ymax = fov_locs.loc[fov, "y0"], fov_locs.loc[fov, "y1"]
-        stitched_image.loc[{"c": c_coords_dict[fov], "y": slice(ymin, ymax), "x": slice(xmin, xmax)}] = im
+
+        if flip_image:
+            y_slice, x_slice = (
+                slice(height - ymax, height - ymin),
+                slice(width - xmax, width - xmin),
+            )
+        else:
+            y_slice, x_slice = slice(ymin, ymax), slice(xmin, xmax)
+
+        stitched_image.loc[{"c": c_coords_dict[fov], "y": y_slice, "x": x_slice}] = im
 
         if len(c_coords_dict[fov]) < len(c_coords):
-            logger.warning(f"Missing channels ({len(c_coords) - len(c_coords_dict[fov])}) for FOV {fov}")
+            logger.warning(
+                f"Missing channels ({len(c_coords) - len(c_coords_dict[fov])}) for FOV {fov}"
+            )
 
     return stitched_image.data, c_coords
 
 
-def _check_fov_id(fov: str | int | None) -> tuple[str, int]:
-    if fov is None:
-        return None, None
+def _find_matching_fov_file(images_dir: Path, fov: str | int) -> Path:
+    assert isinstance(fov, int), "Expected `fov` to be an integer"
 
-    if isinstance(fov, int):
-        return f"F{fov:0>3}", fov
+    pattern = re.compile(rf".*_F0*{fov}\.TIF")
+    fov_files = [file for file in images_dir.rglob("*") if pattern.match(file.name)]
 
-    assert (
-        fov[0] == "F" and len(fov) == 4 and all(c.isdigit() for c in fov[1:])
-    ), f"'fov' needs to start with a F followed by three digits. Found '{fov}'."
+    assert len(fov_files), f"No file matches the pattern {pattern} inside {images_dir}"
+    assert len(fov_files) == 1, (
+        f"Multiple files match the pattern {pattern}: {', '.join(map(str, fov_files))}"
+    )
 
-    return fov, int(fov[1:])
+    return fov_files[0]
 
 
-def _read_transcripts_csv(path: Path, dataset_id: str) -> pd.DataFrame:
+def _read_transcripts_csv(
+    path: Path, dataset_id: str, nrows: int | None = None
+) -> pd.DataFrame:
     transcripts_file = path / f"{dataset_id}_tx_file.csv.gz"
 
     if transcripts_file.exists():
-        df = pd.read_csv(transcripts_file, compression="gzip")
+        df = pd.read_csv(transcripts_file, compression="gzip", nrows=nrows)
     else:
         transcripts_file = path / f"{dataset_id}_tx_file.csv"
-        assert transcripts_file.exists(), f"Transcript file {transcripts_file} not found."
-        df = pd.read_csv(transcripts_file)
+        assert transcripts_file.exists(), (
+            f"Transcript file {transcripts_file} not found."
+        )
+        df = pd.read_csv(transcripts_file, nrows=nrows)
 
     TRANSCRIPT_COLUMNS = ["x_global_px", "y_global_px", "target"]
-    assert np.isin(
-        TRANSCRIPT_COLUMNS, df.columns
-    ).all(), f"The file {transcripts_file} must contain the following columns: {', '.join(TRANSCRIPT_COLUMNS)}. Consider using a different export module."
+    assert np.isin(TRANSCRIPT_COLUMNS, df.columns).all(), (
+        f"The file {transcripts_file} must contain the following columns: {', '.join(TRANSCRIPT_COLUMNS)}. Consider using a different export module."
+    )
+
+    df["unique_cell_id"] = (
+        df["fov"] * (df["cell_ID"].max() + 1) * (df["cell_ID"] > 0) + df["cell_ID"]
+    )
 
     return df
 
@@ -264,8 +312,12 @@ def _cosmx_morphology_coords(images_dir: Path) -> list[str]:
 
     with tifffile.TiffFile(images_paths[0]) as tif:
         description = tif.pages[0].description
+
         substrings = re.findall(r'"BiologicalTarget": "(.*?)",', description)
-        return substrings
+        channels = re.findall(r'"ChannelId": "(.*?)",', description)
+        channel_order = list(re.findall(r'"ChannelOrder": "(.*?)",', description)[0])
+
+        return [substrings[channels.index(x)] for x in channel_order if x in channels]
 
 
 def _get_cosmx_protein_name(image_path: Path) -> str:
@@ -277,7 +329,29 @@ def _get_cosmx_protein_name(image_path: Path) -> str:
 
 def _read_protein_fov(protein_dir: Path) -> tuple[da.Array, list[str]]:
     images_paths = list(protein_dir.rglob("*.TIF"))
-    protein_image = da.concatenate([imread(image_path) for image_path in images_paths], axis=0)
+    protein_image = da.concatenate(
+        [imread(image_path) for image_path in images_paths], axis=0
+    )
     channel_names = [_get_cosmx_protein_name(image_path) for image_path in images_paths]
 
     return protein_image, channel_names
+
+
+def _infer_flip_image(path: Path, dataset_id: str) -> bool:
+    df_ = _read_transcripts_csv(path, dataset_id, nrows=100)
+
+    fov = df_["fov"].value_counts().index[0]
+    df_ = df_[df_["fov"] == fov].sort_values("y_global_px")
+
+    assert len(df_) > 1, (
+        f"Not transcripts in {fov=} to infer `flip_image`. Please provide `flip_image` manually."
+    )
+
+    # in recent AtomX exports, y_local_px is negatively correlated with y_global_px
+    flip_image = df_["y_local_px"].iloc[0] > df_["y_local_px"].iloc[-1]
+
+    logger.info(
+        f"Inferring argument {flip_image=}. If the image stitching is wrong, please add a comment to https://github.com/gustaveroussy/sopa/issues/231"
+    )
+
+    return flip_image