From e0c63247d6a387580b84fa3e1329ade2e6cf4e92 Mon Sep 17 00:00:00 2001 From: Blampey Quentin Date: Mon, 2 Dec 2024 08:52:55 +0100 Subject: [PATCH 1/4] import read_text from anndata.io instead of anndata --- src/spatialdata_io/readers/_utils/_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/spatialdata_io/readers/_utils/_utils.py b/src/spatialdata_io/readers/_utils/_utils.py index da0ac9e3..d7b831ff 100644 --- a/src/spatialdata_io/readers/_utils/_utils.py +++ b/src/spatialdata_io/readers/_utils/_utils.py @@ -5,7 +5,8 @@ from pathlib import Path from typing import Any, Union -from anndata import AnnData, read_text +from anndata import AnnData +from anndata.io import read_text from h5py import File from spatialdata_io.readers._utils._read_10x_h5 import _read_10x_h5 From 515ace0db461fdcb01570b075672ab0432b5cd22 Mon Sep 17 00:00:00 2001 From: Blampey Quentin Date: Mon, 2 Dec 2024 08:57:29 +0100 Subject: [PATCH 2/4] Revert "import read_text from anndata.io instead of anndata" This reverts commit e0c63247d6a387580b84fa3e1329ade2e6cf4e92. --- src/spatialdata_io/readers/_utils/_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/spatialdata_io/readers/_utils/_utils.py b/src/spatialdata_io/readers/_utils/_utils.py index d7b831ff..da0ac9e3 100644 --- a/src/spatialdata_io/readers/_utils/_utils.py +++ b/src/spatialdata_io/readers/_utils/_utils.py @@ -5,8 +5,7 @@ from pathlib import Path from typing import Any, Union -from anndata import AnnData -from anndata.io import read_text +from anndata import AnnData, read_text from h5py import File from spatialdata_io.readers._utils._read_10x_h5 import _read_10x_h5 From 5612ff4963bc7f817a8def999d605c92c696c51e Mon Sep 17 00:00:00 2001 From: Blampey Quentin Date: Tue, 18 Mar 2025 13:57:55 +0100 Subject: [PATCH 3/4] draft cosmx reader with stitching --- src/spatialdata_io/readers/cosmx.py | 509 ++++++++++++++-------------- 1 file changed, 247 insertions(+), 262 deletions(-) diff --git a/src/spatialdata_io/readers/cosmx.py b/src/spatialdata_io/readers/cosmx.py index 8f4ec38f..ab669017 100644 --- a/src/spatialdata_io/readers/cosmx.py +++ b/src/spatialdata_io/readers/cosmx.py @@ -1,298 +1,283 @@ from __future__ import annotations -import os import re from collections.abc import Mapping from pathlib import Path from types import MappingProxyType -from typing import Any +from typing import Any, Optional import dask.array as da import numpy as np import pandas as pd -import pyarrow as pa -from anndata import AnnData -from dask.dataframe import DataFrame as DaskDataFrame +import tifffile +import xarray as xr from dask_image.imread import imread -from scipy.sparse import csr_matrix - -# from spatialdata._core.core_utils import xy_cs -from skimage.transform import estimate_transform from spatialdata import SpatialData from spatialdata._logging import logger -from spatialdata.models import Image2DModel, Labels2DModel, PointsModel, TableModel - -# from spatialdata._core.ngff.ngff_coordinate_system import NgffAxis # , CoordinateSystem -from spatialdata.transformations.transformations import Affine, Identity - -from spatialdata_io._constants._constants import CosmxKeys -from spatialdata_io._docs import inject_docs - -__all__ = ["cosmx"] - -# x_axis = NgffAxis(name="x", type="space", unit="discrete") -# y_axis = NgffAxis(name="y", type="space", unit="discrete") -# c_axis = NgffAxis(name="c", type="channel", unit="index") +from spatialdata.models import Image2DModel, PointsModel -@inject_docs(cx=CosmxKeys) def cosmx( path: str | Path, - dataset_id: str | None = None, - transcripts: bool = True, + dataset_id: Optional[str] = None, + fov: int | str | None = None, + read_proteins: bool = False, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), image_models_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> SpatialData: """ - Read *Cosmx Nanostring* data. + Read *Cosmx Nanostring* data. The fields of view are stitched together, except if `fov` is provided. This function reads the following files: - - - ``_`{cx.COUNTS_SUFFIX!r}```: Counts matrix. - - ``_`{cx.METADATA_SUFFIX!r}```: Metadata file. - - ``_`{cx.FOV_SUFFIX!r}```: Field of view file. - - ``{cx.IMAGES_DIR!r}``: Directory containing the images. - - ``{cx.LABELS_DIR!r}``: Directory containing the labels. - - .. seealso:: - - - `Nanostring Spatial Molecular Imager `_. - - Parameters - ---------- - path - Path to the root directory containing *Nanostring* files. - dataset_id - Name of the dataset. - transcripts - Whether to also read in transcripts information. - imread_kwargs - Keyword arguments passed to :func:`dask_image.imread.imread`. - image_models_kwargs - Keyword arguments passed to :class:`spatialdata.models.Image2DModel`. - - Returns - ------- - :class:`spatialdata.SpatialData` + - `*_fov_positions_file.csv` or `*_fov_positions_file.csv.gz`: FOV locations + - `Morphology2D` directory: all the FOVs morphology images + - `*_tx_file.csv.gz` or `*_tx_file.csv`: Transcripts location and names + - If `read_proteins` is `True`, all the images under the nested `ProteinImages` directories will be read + + These files must be exported as flat files in AtomX. That is: within a study, click on "Export" and then select files from the "Flat CSV Files" section (transcripts flat and FOV position flat). + + Args: + path: Path to the root directory containing *Nanostring* files. + dataset_id: Optional name of the dataset (needs to be provided if not infered). + fov: Name or number of one single field of view to be read. If a string is provided, an example of correct syntax is "F008". By default, reads all FOVs. + read_proteins: Whether to read the proteins or the transcripts. + image_models_kwargs: Keyword arguments passed to `spatialdata.models.Image2DModel`. + imread_kwargs: Keyword arguments passed to `dask_image.imread.imread`. + + Returns: + A `SpatialData` object representing the CosMX experiment """ path = Path(path) - # tries to infer dataset_id from the name of the counts file - if dataset_id is None: - counts_files = [f for f in os.listdir(path) if str(f).endswith(CosmxKeys.COUNTS_SUFFIX)] - if len(counts_files) == 1: - found = re.match(rf"(.*)_{CosmxKeys.COUNTS_SUFFIX}", counts_files[0]) - if found: - dataset_id = found.group(1) - if dataset_id is None: - raise ValueError("Could not infer `dataset_id` from the name of the counts file. Please specify it manually.") - - # check for file existence - counts_file = path / f"{dataset_id}_{CosmxKeys.COUNTS_SUFFIX}" - if not counts_file.exists(): - raise FileNotFoundError(f"Counts file not found: {counts_file}.") - if transcripts: - transcripts_file = path / f"{dataset_id}_{CosmxKeys.TRANSCRIPTS_SUFFIX}" - if not transcripts_file.exists(): - raise FileNotFoundError(f"Transcripts file not found: {transcripts_file}.") + dataset_id = _infer_dataset_id(path, dataset_id) + fov_locs = _read_fov_locs(path, dataset_id) + fov_id, fov = _check_fov_id(fov) + + protein_dir_dict = {} + if read_proteins: + protein_dir_dict = { + int(protein_dir.parent.name[3:]): protein_dir for protein_dir in list(path.rglob("**/FOV*/ProteinImages")) + } + assert len(protein_dir_dict), f"No directory called 'ProteinImages' was found under {path}" + + ### Read image(s) + images_dir = _find_dir(path, "Morphology2D") + morphology_coords = _cosmx_morphology_coords(images_dir) + + if fov is None: + image, c_coords = _read_stitched_image( + images_dir, + fov_locs, + protein_dir_dict, + morphology_coords, + **imread_kwargs, + ) + image_name = "stitched_image" else: - transcripts_file = None - meta_file = path / f"{dataset_id}_{CosmxKeys.METADATA_SUFFIX}" - if not meta_file.exists(): - raise FileNotFoundError(f"Metadata file not found: {meta_file}.") - fov_file = path / f"{dataset_id}_{CosmxKeys.FOV_SUFFIX}" - if not fov_file.exists(): - raise FileNotFoundError(f"Found field of view file: {fov_file}.") - images_dir = path / CosmxKeys.IMAGES_DIR - if not images_dir.exists(): - raise FileNotFoundError(f"Images directory not found: {images_dir}.") - labels_dir = path / CosmxKeys.LABELS_DIR - if not labels_dir.exists(): - raise FileNotFoundError(f"Labels directory not found: {labels_dir}.") - - counts = pd.read_csv(path / counts_file, header=0, index_col=CosmxKeys.INSTANCE_KEY) - counts.index = counts.index.astype(str).str.cat(counts.pop(CosmxKeys.FOV).astype(str).values, sep="_") - - obs = pd.read_csv(path / meta_file, header=0, index_col=CosmxKeys.INSTANCE_KEY) - obs[CosmxKeys.FOV] = pd.Categorical(obs[CosmxKeys.FOV].astype(str)) - obs[CosmxKeys.REGION_KEY] = pd.Categorical(obs[CosmxKeys.FOV].astype(str).apply(lambda s: s + "_labels")) - obs[CosmxKeys.INSTANCE_KEY] = obs.index.astype(np.int64) - obs.rename_axis(None, inplace=True) - obs.index = obs.index.astype(str).str.cat(obs[CosmxKeys.FOV].values, sep="_") - - common_index = obs.index.intersection(counts.index) - - adata = AnnData( - csr_matrix(counts.loc[common_index, :].values), - dtype=counts.values.dtype, - obs=obs.loc[common_index, :], + pattern = f"*{fov_id}.TIF" + fov_files = list(images_dir.rglob(pattern)) + + assert len(fov_files), f"No file matches the pattern {pattern} inside {images_dir}" + assert len(fov_files) == 1, f"Multiple files match the pattern {pattern}: {', '.join(fov_files)}" + + image, c_coords = _read_fov_image(fov_files[0], protein_dir_dict.get(fov), morphology_coords, **imread_kwargs) + image_name = f"{fov}_image" + + parsed_image = Image2DModel.parse(image, dims=("c", "y", "x"), c_coords=c_coords, **image_models_kwargs) + + if read_proteins: + return SpatialData(images={image_name: parsed_image}) + + ### Read transcripts + transcripts_data = _read_transcripts_csv(path, dataset_id) + + if fov is None: + transcripts_data["x"] = transcripts_data["x_global_px"] - fov_locs["xmin"].min() + transcripts_data["y"] = transcripts_data["y_global_px"] - fov_locs["ymin"].min() + coordinates = None + points_name = "points" + else: + transcripts_data = transcripts_data[transcripts_data["fov"] == fov] + coordinates = {"x": "x_local_px", "y": "y_local_px"} + points_name = f"{fov}_points" + + from spatialdata_io._constants._constants import CosmxKeys + + transcripts = PointsModel.parse( + transcripts_data, + coordinates=coordinates, + feature_key=CosmxKeys.TARGET_OF_TRANSCRIPT, ) - adata.var_names = counts.columns - table = TableModel.parse( - adata, - region=list(set(adata.obs[CosmxKeys.REGION_KEY].astype(str).tolist())), - region_key=CosmxKeys.REGION_KEY.value, - instance_key=CosmxKeys.INSTANCE_KEY.value, + return SpatialData( + images={image_name: parsed_image}, + points={points_name: transcripts}, ) - fovs_counts = list(map(str, adata.obs.fov.astype(int).unique())) - affine_transforms_to_global = {} +def _infer_dataset_id(path: Path, dataset_id: str | None) -> str: + if isinstance(dataset_id, str): + return dataset_id - for fov in fovs_counts: - idx = table.obs.fov.astype(str) == fov - loc = table[idx, :].obs[[CosmxKeys.X_LOCAL_CELL, CosmxKeys.Y_LOCAL_CELL]].values - glob = table[idx, :].obs[[CosmxKeys.X_GLOBAL_CELL, CosmxKeys.Y_GLOBAL_CELL]].values - out = estimate_transform(ttype="affine", src=loc, dst=glob) - affine_transforms_to_global[fov] = Affine( - # out.params, input_coordinate_system=input_cs, output_coordinate_system=output_cs - out.params, - input_axes=("x", "y"), - output_axes=("x", "y"), - ) + for suffix in [".csv", ".csv.gz"]: + counts_files = list(path.rglob(f"[!\\.]*_fov_positions_file{suffix}")) - table.obsm["global"] = table.obs[[CosmxKeys.X_GLOBAL_CELL, CosmxKeys.Y_GLOBAL_CELL]].to_numpy() - table.obsm["spatial"] = table.obs[[CosmxKeys.X_LOCAL_CELL, CosmxKeys.Y_LOCAL_CELL]].to_numpy() - table.obs.drop( - columns=[CosmxKeys.X_LOCAL_CELL, CosmxKeys.Y_LOCAL_CELL, CosmxKeys.X_GLOBAL_CELL, CosmxKeys.Y_GLOBAL_CELL], - inplace=True, - ) + if len(counts_files) == 1: + found = re.match(rf"(.*)_fov_positions_file{suffix}", counts_files[0].name) + if found: + return found.group(1) - # prepare to read images and labels - file_extensions = (".jpg", ".png", ".jpeg", ".tif", ".tiff") - pat = re.compile(r".*_F(\d+)") - - # check if fovs are correct for images and labels - fovs_images = [] - for fname in os.listdir(path / CosmxKeys.IMAGES_DIR): - if fname.endswith(file_extensions): - fovs_images.append(str(int(pat.findall(fname)[0]))) - - fovs_labels = [] - for fname in os.listdir(path / CosmxKeys.LABELS_DIR): - if fname.endswith(file_extensions): - fovs_labels.append(str(int(pat.findall(fname)[0]))) - - fovs_images_and_labels = set(fovs_images).intersection(set(fovs_labels)) - fovs_diff = fovs_images_and_labels.difference(set(fovs_counts)) - if len(fovs_diff): - raise logger.warning( - f"Found images and labels for {len(fovs_images)} FOVs, but only {len(fovs_counts)} FOVs in the counts file.\n" - + f"The following FOVs are missing: {fovs_diff} \n" - + "... will use only fovs in Table." - ) + raise ValueError("Could not infer `dataset_id` from the name of the transcript file. Please specify it manually.") + + +def _read_fov_image( + morphology_path: Path, protein_path: Path | None, morphology_coords: list[str], **imread_kwargs +) -> tuple[da.Array, list[str]]: + image = imread(morphology_path, **imread_kwargs) + + protein_names = [] + if protein_path is not None: + protein_image, protein_names = _read_protein_fov(protein_path) + image = da.concatenate([image, protein_image], axis=0) + + return image, morphology_coords + protein_names + + +def _read_fov_locs(path: Path, dataset_id: str) -> pd.DataFrame: + fov_file = path / f"{dataset_id}_fov_positions_file.csv" + + if not fov_file.exists(): + fov_file = path / f"{dataset_id}_fov_positions_file.csv.gz" + + assert fov_file.exists(), f"Missing field of view file: {fov_file}" + + fov_locs = pd.read_csv(fov_file) + fov_locs["xmax"] = 0.0 # will be filled when reading the images + fov_locs["ymin"] = 0.0 # will be filled when reading the images + + fov_key, x_key, y_key, scale_factor = "fov", "x_global_px", "y_global_px", 1 + + if not np.isin([fov_key, x_key, y_key], fov_locs.columns).all(): # try different column names + fov_key, x_key, y_key = "FOV", "X_mm", "Y_mm" + scale_factor = 1e3 / 0.120280945 # CosMX milimeters to pixels + + assert np.isin( + [fov_key, x_key, y_key], fov_locs.columns + ).all(), f"The file {fov_file} must contain the following columns: {fov_key}, {x_key}, {y_key}. Consider using a different export module." + + fov_locs.index = fov_locs[fov_key] + fov_locs["xmin"] = fov_locs[x_key] * scale_factor + fov_locs["ymax"] = fov_locs[y_key] * scale_factor + + return fov_locs + + +def _read_stitched_image( + images_dir: Path, + fov_locs: pd.DataFrame, + protein_dir_dict: dict, + morphology_coords: list[str], + **imread_kwargs, +) -> tuple[da.Array, list[str] | None]: + fov_images = {} + c_coords_dict = {} + pattern = re.compile(r".*_F(\d+)") + for image_path in images_dir.iterdir(): + if image_path.suffix == ".TIF": + fov = int(pattern.findall(image_path.name)[0]) + + image, c_coords = _read_fov_image(image_path, protein_dir_dict.get(fov), morphology_coords, **imread_kwargs) + + c_coords_dict[fov] = c_coords + + fov_images[fov] = da.flip(image, axis=1) + + fov_locs.loc[fov, "xmax"] = fov_locs.loc[fov, "xmin"] + image.shape[2] + fov_locs.loc[fov, "ymin"] = fov_locs.loc[fov, "ymax"] - image.shape[1] + + for dim in ["x", "y"]: + shift = fov_locs[f"{dim}min"].min() + fov_locs[f"{dim}0"] = (fov_locs[f"{dim}min"] - shift).round().astype(int) + fov_locs[f"{dim}1"] = (fov_locs[f"{dim}max"] - shift).round().astype(int) + + c_coords = list(set.union(*[set(names) for names in c_coords_dict.values()])) + + stitched_image = da.zeros(shape=(len(c_coords), fov_locs["y1"].max(), fov_locs["x1"].max()), dtype=image.dtype) + stitched_image = xr.DataArray(stitched_image, dims=("c", "y", "x"), coords={"c": c_coords}) + + for fov, im in fov_images.items(): + xmin, xmax = fov_locs.loc[fov, "x0"], fov_locs.loc[fov, "x1"] + ymin, ymax = fov_locs.loc[fov, "y0"], fov_locs.loc[fov, "y1"] + stitched_image.loc[{"c": c_coords_dict[fov], "y": slice(ymin, ymax), "x": slice(xmin, xmax)}] = im + + if len(c_coords_dict[fov]) < len(c_coords): + logger.warning(f"Missing channels ({len(c_coords) - len(c_coords_dict[fov])}) for FOV {fov}") + + return stitched_image.data, c_coords + + +def _check_fov_id(fov: str | int | None) -> tuple[str, int]: + if fov is None: + return None, None + + if isinstance(fov, int): + return f"F{fov:0>3}", fov + + assert ( + fov[0] == "F" and len(fov) == 4 and all(c.isdigit() for c in fov[1:]) + ), f"'fov' needs to start with a F followed by three digits. Found '{fov}'." + + return fov, int(fov[1:]) + + +def _read_transcripts_csv(path: Path, dataset_id: str) -> pd.DataFrame: + transcripts_file = path / f"{dataset_id}_tx_file.csv.gz" + + if transcripts_file.exists(): + df = pd.read_csv(transcripts_file, compression="gzip") + else: + transcripts_file = path / f"{dataset_id}_tx_file.csv" + assert transcripts_file.exists(), f"Transcript file {transcripts_file} not found." + df = pd.read_csv(transcripts_file) + + TRANSCRIPT_COLUMNS = ["x_global_px", "y_global_px", "target"] + assert np.isin( + TRANSCRIPT_COLUMNS, df.columns + ).all(), f"The file {transcripts_file} must contain the following columns: {', '.join(TRANSCRIPT_COLUMNS)}. Consider using a different export module." + + return df + + +def _find_dir(path: Path, name: str): + if (path / name).is_dir(): + return path / name + + paths = list(path.rglob(f"**/{name}")) + assert len(paths) == 1, f"Found {len(paths)} path(s) with name {name} inside {path}" + + return paths[0] + + +def _cosmx_morphology_coords(images_dir: Path) -> list[str]: + images_paths = list(images_dir.glob("*.TIF")) + assert len(images_paths) > 0, f"Expected to find images inside {images_dir}" + + with tifffile.TiffFile(images_paths[0]) as tif: + description = tif.pages[0].description + substrings = re.findall(r'"BiologicalTarget": "(.*?)",', description) + return substrings + + +def _get_cosmx_protein_name(image_path: Path) -> str: + with tifffile.TiffFile(image_path) as tif: + description = tif.pages[0].description + substrings = re.findall(r'"DisplayName": "(.*?)",', description) + return substrings[0] + + +def _read_protein_fov(protein_dir: Path) -> tuple[da.Array, list[str]]: + images_paths = list(protein_dir.rglob("*.TIF")) + protein_image = da.concatenate([imread(image_path) for image_path in images_paths], axis=0) + channel_names = [_get_cosmx_protein_name(image_path) for image_path in images_paths] - # read images - images = {} - for fname in os.listdir(path / CosmxKeys.IMAGES_DIR): - if fname.endswith(file_extensions): - fov = str(int(pat.findall(fname)[0])) - if fov in fovs_counts: - aff = affine_transforms_to_global[fov] - im = imread(path / CosmxKeys.IMAGES_DIR / fname, **imread_kwargs).squeeze() - flipped_im = da.flip(im, axis=0) - parsed_im = Image2DModel.parse( - flipped_im, - transformations={ - fov: Identity(), - "global": aff, - "global_only_image": aff, - }, - dims=("y", "x", "c"), - rgb=None, - **image_models_kwargs, - ) - images[f"{fov}_image"] = parsed_im - else: - logger.warning(f"FOV {fov} not found in counts file. Skipping image {fname}.") - - # read labels - labels = {} - for fname in os.listdir(path / CosmxKeys.LABELS_DIR): - if fname.endswith(file_extensions): - fov = str(int(pat.findall(fname)[0])) - if fov in fovs_counts: - aff = affine_transforms_to_global[fov] - la = imread(path / CosmxKeys.LABELS_DIR / fname, **imread_kwargs).squeeze() - flipped_la = da.flip(la, axis=0) - parsed_la = Labels2DModel.parse( - flipped_la, - transformations={ - fov: Identity(), - "global": aff, - "global_only_labels": aff, - }, - dims=("y", "x"), - **image_models_kwargs, - ) - labels[f"{fov}_labels"] = parsed_la - else: - logger.warning(f"FOV {fov} not found in counts file. Skipping labels {fname}.") - - points: dict[str, DaskDataFrame] = {} - if transcripts: - # assert transcripts_file is not None - # from pyarrow.csv import read_csv - # - # ptable = read_csv(path / transcripts_file) # , header=0) - # for fov in fovs_counts: - # aff = affine_transforms_to_global[fov] - # sub_table = ptable.filter(pa.compute.equal(ptable.column(CosmxKeys.FOV), int(fov))).to_pandas() - # sub_table[CosmxKeys.INSTANCE_KEY] = sub_table[CosmxKeys.INSTANCE_KEY].astype("category") - # # we rename z because we want to treat the data as 2d - # sub_table.rename(columns={"z": "z_raw"}, inplace=True) - # points[fov] = PointsModel.parse( - # sub_table, - # coordinates={"x": CosmxKeys.X_LOCAL_TRANSCRIPT, "y": CosmxKeys.Y_LOCAL_TRANSCRIPT}, - # feature_key=CosmxKeys.TARGET_OF_TRANSCRIPT, - # instance_key=CosmxKeys.INSTANCE_KEY, - # transformations={ - # fov: Identity(), - # "global": aff, - # "global_only_labels": aff, - # }, - # ) - # let's convert the .csv to .parquet and let's read it with pyarrow.parquet for faster subsetting - import tempfile - - import pyarrow.parquet as pq - - with tempfile.TemporaryDirectory() as tmpdir: - print("converting .csv to .parquet to improve the speed of the slicing operations... ", end="") - assert transcripts_file is not None - transcripts_data = pd.read_csv(path / transcripts_file, header=0) - transcripts_data.to_parquet(Path(tmpdir) / "transcripts.parquet") - print("done") - - ptable = pq.read_table(Path(tmpdir) / "transcripts.parquet") - for fov in fovs_counts: - aff = affine_transforms_to_global[fov] - sub_table = ptable.filter(pa.compute.equal(ptable.column(CosmxKeys.FOV), int(fov))).to_pandas() - sub_table[CosmxKeys.INSTANCE_KEY] = sub_table[CosmxKeys.INSTANCE_KEY].astype("category") - # we rename z because we want to treat the data as 2d - sub_table.rename(columns={"z": "z_raw"}, inplace=True) - if len(sub_table) > 0: - points[f"{fov}_points"] = PointsModel.parse( - sub_table, - coordinates={"x": CosmxKeys.X_LOCAL_TRANSCRIPT, "y": CosmxKeys.Y_LOCAL_TRANSCRIPT}, - feature_key=CosmxKeys.TARGET_OF_TRANSCRIPT, - instance_key=CosmxKeys.INSTANCE_KEY, - transformations={ - fov: Identity(), - "global": aff, - "global_only_labels": aff, - }, - ) - - # TODO: what to do with fov file? - # if fov_file is not None: - # fov_positions = pd.read_csv(path / fov_file, header=0, index_col=CosmxKeys.FOV) - # for fov, row in fov_positions.iterrows(): - # try: - # adata.uns["spatial"][str(fov)]["metadata"] = row.to_dict() - # except KeyError: - # logg.warning(f"FOV `{str(fov)}` does not exist, skipping it.") - # continue - - return SpatialData(images=images, labels=labels, points=points, table=table) + return protein_image, channel_names From 1a685340acecdcc46af70bc0c093d2296e8a8efe Mon Sep 17 00:00:00 2001 From: Quentin Blampey Date: Fri, 11 Apr 2025 17:19:59 +0200 Subject: [PATCH 4/4] use sopa reader that fixes sopa issues 180, 227, 231 --- src/spatialdata_io/readers/cosmx.py | 176 ++++++++++++++++++++-------- 1 file changed, 125 insertions(+), 51 deletions(-) diff --git a/src/spatialdata_io/readers/cosmx.py b/src/spatialdata_io/readers/cosmx.py index ab669017..307414a8 100644 --- a/src/spatialdata_io/readers/cosmx.py +++ b/src/spatialdata_io/readers/cosmx.py @@ -4,7 +4,7 @@ from collections.abc import Mapping from pathlib import Path from types import MappingProxyType -from typing import Any, Optional +from typing import Any import dask.array as da import numpy as np @@ -19,11 +19,12 @@ def cosmx( path: str | Path, - dataset_id: Optional[str] = None, - fov: int | str | None = None, + dataset_id: str | None = None, + fov: int | None = None, read_proteins: bool = False, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), image_models_kwargs: Mapping[str, Any] = MappingProxyType({}), + flip_image: bool | None = None, ) -> SpatialData: """ Read *Cosmx Nanostring* data. The fields of view are stitched together, except if `fov` is provided. @@ -38,11 +39,12 @@ def cosmx( Args: path: Path to the root directory containing *Nanostring* files. - dataset_id: Optional name of the dataset (needs to be provided if not infered). - fov: Name or number of one single field of view to be read. If a string is provided, an example of correct syntax is "F008". By default, reads all FOVs. + dataset_id: Optional name of the dataset (needs to be provided if not inferred). + fov: Number of one single field of view to be read. If not provided, reads all FOVs and create a stitched image. read_proteins: Whether to read the proteins or the transcripts. image_models_kwargs: Keyword arguments passed to `spatialdata.models.Image2DModel`. imread_kwargs: Keyword arguments passed to `dask_image.imread.imread`. + flip_image: For some buggy AtomX exports, `flip_image=True` has to be used for stitching. By default, the value is inferred based on the transcript file. See [this](https://github.com/gustaveroussy/sopa/issues/231) issue. Returns: A `SpatialData` object representing the CosMX experiment @@ -51,14 +53,19 @@ def cosmx( dataset_id = _infer_dataset_id(path, dataset_id) fov_locs = _read_fov_locs(path, dataset_id) - fov_id, fov = _check_fov_id(fov) protein_dir_dict = {} if read_proteins: protein_dir_dict = { - int(protein_dir.parent.name[3:]): protein_dir for protein_dir in list(path.rglob("**/FOV*/ProteinImages")) + int(protein_dir.parent.name[3:]): protein_dir + for protein_dir in list(path.rglob("**/FOV*/ProteinImages")) } - assert len(protein_dir_dict), f"No directory called 'ProteinImages' was found under {path}" + assert len(protein_dir_dict), ( + f"No directory called 'ProteinImages' was found under {path}" + ) + + if flip_image is None and not read_proteins: + flip_image = _infer_flip_image(path, dataset_id) ### Read image(s) images_dir = _find_dir(path, "Morphology2D") @@ -70,20 +77,22 @@ def cosmx( fov_locs, protein_dir_dict, morphology_coords, + flip_image, **imread_kwargs, ) image_name = "stitched_image" else: - pattern = f"*{fov_id}.TIF" - fov_files = list(images_dir.rglob(pattern)) + logger.info(f"Reading single FOV ({fov}), the image will not be stitched") + fov_file = _find_matching_fov_file(images_dir, fov) - assert len(fov_files), f"No file matches the pattern {pattern} inside {images_dir}" - assert len(fov_files) == 1, f"Multiple files match the pattern {pattern}: {', '.join(fov_files)}" - - image, c_coords = _read_fov_image(fov_files[0], protein_dir_dict.get(fov), morphology_coords, **imread_kwargs) - image_name = f"{fov}_image" + image, c_coords = _read_fov_image( + fov_file, protein_dir_dict.get(fov), morphology_coords, **imread_kwargs + ) + image_name = f"F{fov:0>5}_image" - parsed_image = Image2DModel.parse(image, dims=("c", "y", "x"), c_coords=c_coords, **image_models_kwargs) + parsed_image = Image2DModel.parse( + image, dims=("c", "y", "x"), c_coords=c_coords, **image_models_kwargs + ) if read_proteins: return SpatialData(images={image_name: parsed_image}) @@ -99,7 +108,7 @@ def cosmx( else: transcripts_data = transcripts_data[transcripts_data["fov"] == fov] coordinates = {"x": "x_local_px", "y": "y_local_px"} - points_name = f"{fov}_points" + points_name = f"F{fov:0>5}_points" from spatialdata_io._constants._constants import CosmxKeys @@ -110,8 +119,7 @@ def cosmx( ) return SpatialData( - images={image_name: parsed_image}, - points={points_name: transcripts}, + images={image_name: parsed_image}, points={points_name: transcripts} ) @@ -127,11 +135,16 @@ def _infer_dataset_id(path: Path, dataset_id: str | None) -> str: if found: return found.group(1) - raise ValueError("Could not infer `dataset_id` from the name of the transcript file. Please specify it manually.") + raise ValueError( + "Could not infer `dataset_id` from the name of the transcript file. Please specify it manually." + ) def _read_fov_image( - morphology_path: Path, protein_path: Path | None, morphology_coords: list[str], **imread_kwargs + morphology_path: Path, + protein_path: Path | None, + morphology_coords: list[str], + **imread_kwargs, ) -> tuple[da.Array, list[str]]: image = imread(morphology_path, **imread_kwargs) @@ -153,21 +166,23 @@ def _read_fov_locs(path: Path, dataset_id: str) -> pd.DataFrame: fov_locs = pd.read_csv(fov_file) fov_locs["xmax"] = 0.0 # will be filled when reading the images - fov_locs["ymin"] = 0.0 # will be filled when reading the images + fov_locs["ymax"] = 0.0 # will be filled when reading the images fov_key, x_key, y_key, scale_factor = "fov", "x_global_px", "y_global_px", 1 - if not np.isin([fov_key, x_key, y_key], fov_locs.columns).all(): # try different column names + if not np.isin( + [fov_key, x_key, y_key], fov_locs.columns + ).all(): # try different column names fov_key, x_key, y_key = "FOV", "X_mm", "Y_mm" scale_factor = 1e3 / 0.120280945 # CosMX milimeters to pixels - assert np.isin( - [fov_key, x_key, y_key], fov_locs.columns - ).all(), f"The file {fov_file} must contain the following columns: {fov_key}, {x_key}, {y_key}. Consider using a different export module." + assert np.isin([fov_key, x_key, y_key], fov_locs.columns).all(), ( + f"The file {fov_file} must contain the following columns: {fov_key}, {x_key}, {y_key}. Consider using a different export module." + ) fov_locs.index = fov_locs[fov_key] fov_locs["xmin"] = fov_locs[x_key] * scale_factor - fov_locs["ymax"] = fov_locs[y_key] * scale_factor + fov_locs["ymin"] = fov_locs[y_key] * scale_factor return fov_locs @@ -177,6 +192,7 @@ def _read_stitched_image( fov_locs: pd.DataFrame, protein_dir_dict: dict, morphology_coords: list[str], + flip_image: int, **imread_kwargs, ) -> tuple[da.Array, list[str] | None]: fov_images = {} @@ -186,14 +202,24 @@ def _read_stitched_image( if image_path.suffix == ".TIF": fov = int(pattern.findall(image_path.name)[0]) - image, c_coords = _read_fov_image(image_path, protein_dir_dict.get(fov), morphology_coords, **imread_kwargs) + image, c_coords = _read_fov_image( + image_path, + protein_dir_dict.get(fov), + morphology_coords, + **imread_kwargs, + ) c_coords_dict[fov] = c_coords fov_images[fov] = da.flip(image, axis=1) fov_locs.loc[fov, "xmax"] = fov_locs.loc[fov, "xmin"] + image.shape[2] - fov_locs.loc[fov, "ymin"] = fov_locs.loc[fov, "ymax"] - image.shape[1] + + if flip_image: + fov_locs.loc[fov, "ymax"] = fov_locs.loc[fov, "ymin"] + fov_locs.loc[fov, "ymin"] = fov_locs.loc[fov, "ymax"] - image.shape[1] + else: + fov_locs.loc[fov, "ymax"] = fov_locs.loc[fov, "ymin"] + image.shape[1] for dim in ["x", "y"]: shift = fov_locs[f"{dim}min"].min() @@ -202,48 +228,70 @@ def _read_stitched_image( c_coords = list(set.union(*[set(names) for names in c_coords_dict.values()])) - stitched_image = da.zeros(shape=(len(c_coords), fov_locs["y1"].max(), fov_locs["x1"].max()), dtype=image.dtype) - stitched_image = xr.DataArray(stitched_image, dims=("c", "y", "x"), coords={"c": c_coords}) + height, width = fov_locs["y1"].max(), fov_locs["x1"].max() + stitched_image = da.zeros(shape=(len(c_coords), height, width), dtype=image.dtype) + stitched_image = xr.DataArray( + stitched_image, dims=("c", "y", "x"), coords={"c": c_coords} + ) for fov, im in fov_images.items(): xmin, xmax = fov_locs.loc[fov, "x0"], fov_locs.loc[fov, "x1"] ymin, ymax = fov_locs.loc[fov, "y0"], fov_locs.loc[fov, "y1"] - stitched_image.loc[{"c": c_coords_dict[fov], "y": slice(ymin, ymax), "x": slice(xmin, xmax)}] = im + + if flip_image: + y_slice, x_slice = ( + slice(height - ymax, height - ymin), + slice(width - xmax, width - xmin), + ) + else: + y_slice, x_slice = slice(ymin, ymax), slice(xmin, xmax) + + stitched_image.loc[{"c": c_coords_dict[fov], "y": y_slice, "x": x_slice}] = im if len(c_coords_dict[fov]) < len(c_coords): - logger.warning(f"Missing channels ({len(c_coords) - len(c_coords_dict[fov])}) for FOV {fov}") + logger.warning( + f"Missing channels ({len(c_coords) - len(c_coords_dict[fov])}) for FOV {fov}" + ) return stitched_image.data, c_coords -def _check_fov_id(fov: str | int | None) -> tuple[str, int]: - if fov is None: - return None, None +def _find_matching_fov_file(images_dir: Path, fov: str | int) -> Path: + assert isinstance(fov, int), "Expected `fov` to be an integer" - if isinstance(fov, int): - return f"F{fov:0>3}", fov + pattern = re.compile(rf".*_F0*{fov}\.TIF") + fov_files = [file for file in images_dir.rglob("*") if pattern.match(file.name)] - assert ( - fov[0] == "F" and len(fov) == 4 and all(c.isdigit() for c in fov[1:]) - ), f"'fov' needs to start with a F followed by three digits. Found '{fov}'." + assert len(fov_files), f"No file matches the pattern {pattern} inside {images_dir}" + assert len(fov_files) == 1, ( + f"Multiple files match the pattern {pattern}: {', '.join(map(str, fov_files))}" + ) - return fov, int(fov[1:]) + return fov_files[0] -def _read_transcripts_csv(path: Path, dataset_id: str) -> pd.DataFrame: +def _read_transcripts_csv( + path: Path, dataset_id: str, nrows: int | None = None +) -> pd.DataFrame: transcripts_file = path / f"{dataset_id}_tx_file.csv.gz" if transcripts_file.exists(): - df = pd.read_csv(transcripts_file, compression="gzip") + df = pd.read_csv(transcripts_file, compression="gzip", nrows=nrows) else: transcripts_file = path / f"{dataset_id}_tx_file.csv" - assert transcripts_file.exists(), f"Transcript file {transcripts_file} not found." - df = pd.read_csv(transcripts_file) + assert transcripts_file.exists(), ( + f"Transcript file {transcripts_file} not found." + ) + df = pd.read_csv(transcripts_file, nrows=nrows) TRANSCRIPT_COLUMNS = ["x_global_px", "y_global_px", "target"] - assert np.isin( - TRANSCRIPT_COLUMNS, df.columns - ).all(), f"The file {transcripts_file} must contain the following columns: {', '.join(TRANSCRIPT_COLUMNS)}. Consider using a different export module." + assert np.isin(TRANSCRIPT_COLUMNS, df.columns).all(), ( + f"The file {transcripts_file} must contain the following columns: {', '.join(TRANSCRIPT_COLUMNS)}. Consider using a different export module." + ) + + df["unique_cell_id"] = ( + df["fov"] * (df["cell_ID"].max() + 1) * (df["cell_ID"] > 0) + df["cell_ID"] + ) return df @@ -264,8 +312,12 @@ def _cosmx_morphology_coords(images_dir: Path) -> list[str]: with tifffile.TiffFile(images_paths[0]) as tif: description = tif.pages[0].description + substrings = re.findall(r'"BiologicalTarget": "(.*?)",', description) - return substrings + channels = re.findall(r'"ChannelId": "(.*?)",', description) + channel_order = list(re.findall(r'"ChannelOrder": "(.*?)",', description)[0]) + + return [substrings[channels.index(x)] for x in channel_order if x in channels] def _get_cosmx_protein_name(image_path: Path) -> str: @@ -277,7 +329,29 @@ def _get_cosmx_protein_name(image_path: Path) -> str: def _read_protein_fov(protein_dir: Path) -> tuple[da.Array, list[str]]: images_paths = list(protein_dir.rglob("*.TIF")) - protein_image = da.concatenate([imread(image_path) for image_path in images_paths], axis=0) + protein_image = da.concatenate( + [imread(image_path) for image_path in images_paths], axis=0 + ) channel_names = [_get_cosmx_protein_name(image_path) for image_path in images_paths] return protein_image, channel_names + + +def _infer_flip_image(path: Path, dataset_id: str) -> bool: + df_ = _read_transcripts_csv(path, dataset_id, nrows=100) + + fov = df_["fov"].value_counts().index[0] + df_ = df_[df_["fov"] == fov].sort_values("y_global_px") + + assert len(df_) > 1, ( + f"Not transcripts in {fov=} to infer `flip_image`. Please provide `flip_image` manually." + ) + + # in recent AtomX exports, y_local_px is negatively correlated with y_global_px + flip_image = df_["y_local_px"].iloc[0] > df_["y_local_px"].iloc[-1] + + logger.info( + f"Inferring argument {flip_image=}. If the image stitching is wrong, please add a comment to https://github.com/gustaveroussy/sopa/issues/231" + ) + + return flip_image