diff --git a/docs/conf.py b/docs/conf.py index 06c67438fe..6b85bfdf49 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -242,6 +242,7 @@ def setup(app: Sphinx): ("py:class", "scanpy._utils.Empty"), ("py:class", "numpy.random.mtrand.RandomState"), ("py:class", "scanpy.neighbors._types.KnnTransformerLike"), + ("py:class", "scanpy.tools._types.DensmapMethodKwds"), ] # Options for plot examples diff --git a/docs/references.bib b/docs/references.bib index c474353df3..954b9294d5 100644 --- a/docs/references.bib +++ b/docs/references.bib @@ -627,6 +627,20 @@ @article{Muraro2016 pages = {385--394.e3}, } +@article{Narayan2021, + author = {Narayan, Ashwin and Berger, Bonnie and Cho, Hyunghoon}, + title = {Assessing single-cell transcriptomic variability through density-preserving data visualization}, + volume = {39}, + url = {https://doi.org/10.1038/s41587-020-00801-7}, + doi = {10.1038/s41587-020-00801-7}, + number = {6}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media LLC}, + year = {2021}, + month = {jan}, + pages = {765--774}, +} + @article{Nowotschin2019, author = {Nowotschin, Sonja and Setty, Manu and Kuo, Ying-Yi and Liu, Vincent and Garg, Vidur and Sharma, Roshan and Simon, Claire S. and Saiz, Nestor and Gardner, Rui and Boutet, Stéphane C. and Church, Deanna M. and Hoodless, Pamela A. and Hadjantonakis, Anna-Katerina and Pe’er, Dana}, title = {The emergent landscape of the mouse gut endoderm at single-cell resolution}, diff --git a/docs/release-notes/2946.feature.md b/docs/release-notes/2946.feature.md new file mode 100644 index 0000000000..7a48bae2a8 --- /dev/null +++ b/docs/release-notes/2946.feature.md @@ -0,0 +1 @@ +Add DensMAP support via `method="densmap"` in {func}`~scanpy.tl.umap` {smaller}`M Keller` diff --git a/src/scanpy/tools/__init__.py b/src/scanpy/tools/__init__.py index e426470b84..5a9314d917 100644 --- a/src/scanpy/tools/__init__.py +++ b/src/scanpy/tools/__init__.py @@ -31,6 +31,8 @@ if TYPE_CHECKING: from typing import Any + from ._types import DensmapMethodKwds # noqa: F401 + def __getattr__(name: str) -> Any: if name == "pca": diff --git a/src/scanpy/tools/_types.py b/src/scanpy/tools/_types.py new file mode 100644 index 0000000000..eed8313e9c --- /dev/null +++ b/src/scanpy/tools/_types.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +from typing import TypedDict + + +class DensmapMethodKwds(TypedDict, total=False): + dens_lambda: float + dens_frac: float + dens_var_shift: float diff --git a/src/scanpy/tools/_umap.py b/src/scanpy/tools/_umap.py index 8f880bc87c..63d2ed4401 100644 --- a/src/scanpy/tools/_umap.py +++ b/src/scanpy/tools/_umap.py @@ -1,7 +1,7 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal import numpy as np from sklearn.utils import check_array, check_random_state @@ -18,6 +18,7 @@ from anndata import AnnData from .._utils.random import _LegacyRandom + from ._types import DensmapMethodKwds _InitPos = Literal["paga", "spectral", "random"] @@ -52,7 +53,8 @@ def umap( # noqa: PLR0913, PLR0915 random_state: _LegacyRandom = 0, a: float | None = None, b: float | None = None, - method: Literal["umap", "rapids"] = "umap", + method: Literal["umap", "rapids", "densmap"] = "umap", + method_kwds: DensmapMethodKwds | None = None, key_added: str | None = None, neighbors_key: str = "neighbors", copy: bool = False, @@ -127,6 +129,8 @@ def umap( # noqa: PLR0913, PLR0915 ``'umap'`` Umap’s simplical set embedding. + ``'densmap'`` + Umap’s simplical set embedding with densmap=True :cite:p:`Narayan2021`. ``'rapids'`` GPU accelerated implementation. @@ -146,19 +150,51 @@ def umap( # noqa: PLR0913, PLR0915 copy Return a copy instead of writing to adata. + method_kwds + Additional method parameters. + + If method is ``'densmap'``, the following parameters are available: + + ``dens_lambda`` : `float`, optional (default: 2.0) + Controls the regularization weight of the density correlation term + in densMAP. Higher values prioritize density preservation over the + UMAP objective, and vice versa for values closer to zero. Setting this + parameter to zero is equivalent to running the original UMAP algorithm. + ``dens_frac`` : `float`, optional (default: 0.3) + Controls the fraction of epochs (between 0 and 1) where the + density-augmented objective is used in densMAP. The first + (1 - dens_frac) fraction of epochs optimize the original UMAP objective + before introducing the density correlation term. + ``dens_var_shift`` : `float`, optional (default: 0.1) + A small constant added to the variance of local radii in the + embedding when calculating the density correlation objective to + prevent numerical instability from dividing by a small number + Returns ------- - Returns `None` if `copy=False`, else returns an `AnnData` object. Sets the following fields: + Returns `None` if `copy=False`, else returns an `AnnData` object. Sets the following fields unless method is 'densmap': `adata.obsm['X_umap' | key_added]` : :class:`numpy.ndarray` (dtype `float`) UMAP coordinates of data. `adata.uns['umap' | key_added]` : :class:`dict` UMAP parameters. + When method is 'densmap', sets the following fields: + + `adata.obsm['X_densmap']` : :class:`numpy.ndarray` (dtype `float`) + densMAP coordinates of data. + `adata.uns['densmap']` : :class:`dict` + densMAP parameters. + """ adata = adata.copy() if copy else adata - key_obsm, key_uns = ("X_umap", "umap") if key_added is None else [key_added] * 2 + key_obsm, key_uns = ( + (("X_densmap", "densmap") if method == "densmap" else ("X_umap", "umap")) + if key_added is None + else [key_added] * 2 + ) + method_name = "DensMAP" if method == "densmap" else "UMAP" if neighbors_key is None: # backwards compat neighbors_key = "neighbors" @@ -184,6 +220,7 @@ def umap( # noqa: PLR0913, PLR0915 if a is None or b is None: a, b = find_ab_params(spread, min_dist) + adata.uns[key_uns] = dict(params=dict(a=a, b=b)) if isinstance(init_pos, str) and init_pos in adata.obsm: init_coords = adata.obsm[init_pos] @@ -207,7 +244,33 @@ def umap( # noqa: PLR0913, PLR0915 n_pcs=neigh_params.get("n_pcs", None), silent=True, ) - if method == "umap": + + if method_kwds is None: + method_kwds = {} + + densmap_kwds = ( + { + "graph_dists": neighbors["distances"], + "n_neighbors": neigh_params.get("n_neighbors", 15), + # Default params from umap package + # Reference: https://github.com/lmcinnes/umap/blob/868e55cb614f361a0d31540c1f4a4b175136025c/umap/umap_.py#L1692 + # If user provided method_kwds, the user-provided values should + # overwrite the default values specified above. + "lambda": method_kwds.get("dens_lambda", 2.0), + "frac": method_kwds.get("dens_frac", 0.3), + "var_shift": method_kwds.get("dens_var_shift", 0.1), + } + if method == "densmap" + else {} + ) + if method == "densmap": + adata.uns[key_uns]["params"].update({ + "dens_lambda": densmap_kwds["lambda"], + "dens_frac": densmap_kwds["frac"], + "dens_var_shift": densmap_kwds["var_shift"], + }) + + if method == "umap" or method == "densmap": # the data matrix X is really only used for determining the number of connected components # for the init condition in the UMAP embedding default_epochs = 500 if neighbors["connectivities"].shape[0] <= 10000 else 200 @@ -226,8 +289,8 @@ def umap( # noqa: PLR0913, PLR0915 random_state=random_state, metric=neigh_params.get("metric", "euclidean"), metric_kwds=neigh_params.get("metric_kwds", {}), - densmap=False, - densmap_kwds={}, + densmap=(method == "densmap"), + densmap_kwds=densmap_kwds, output_dens=False, verbose=settings.verbosity > 3, ) @@ -272,8 +335,8 @@ def umap( # noqa: PLR0913, PLR0915 time=start, deep=( "added\n" - f" {key_obsm!r}, UMAP coordinates (adata.obsm)\n" - f" {key_uns!r}, UMAP parameters (adata.uns)" + f" {key_obsm!r}, {method_name} coordinates (adata.obsm)\n" + f" {key_uns!r}, {method_name} parameters (adata.uns)" ), ) return adata if copy else None diff --git a/tests/test_embedding.py b/tests/test_embedding.py index 692157a084..abb3115481 100644 --- a/tests/test_embedding.py +++ b/tests/test_embedding.py @@ -1,13 +1,19 @@ from __future__ import annotations +from typing import TYPE_CHECKING, Protocol + import numpy as np import pytest from numpy.testing import assert_array_almost_equal, assert_array_equal, assert_raises +from sklearn.mixture import GaussianMixture import scanpy as sc from testing.scanpy._helpers.data import pbmc68k_reduced from testing.scanpy._pytest.marks import needs +if TYPE_CHECKING: + from numpy.typing import ArrayLike, NDArray + @pytest.mark.parametrize( ("key_added", "key_obsm", "key_uns"), @@ -88,3 +94,77 @@ def test_diffmap(): sc.tl.diffmap(pbmc, random_state=1234) d3 = pbmc.obsm["X_diffmap"].copy() assert_raises(AssertionError, assert_array_equal, d1, d3) + + +def test_densmap(): + pbmc = pbmc68k_reduced() + + # Checking that the results are reproducible + sc.tl.umap(pbmc, method="densmap") + d1 = pbmc.obsm["X_densmap"].copy() + sc.tl.umap(pbmc, method="densmap") + d2 = pbmc.obsm["X_densmap"].copy() + assert_array_equal(d1, d2) + + # Checking if specifying random_state works, arrays shouldn't be equal + sc.tl.umap(pbmc, method="densmap", random_state=1234) + d3 = pbmc.obsm["X_densmap"].copy() + assert_raises(AssertionError, assert_array_equal, d1, d3) + + # Checking if specifying dens_lambda works, arrays shouldn't be equal + sc.tl.umap(pbmc, method="densmap", method_kwds=dict(dens_lambda=2.3456)) + d4 = pbmc.obsm["X_densmap"].copy() + assert_raises(AssertionError, assert_array_equal, d1, d4) + + +def test_umap_raises_for_unsupported_method(): + pbmc = pbmc68k_reduced() + + # Checking that umap function raises a ValueError + # if a user passes an invalid `method` parameter. + with assert_raises(ValueError): + sc.tl.umap(pbmc, method="method_does_not_exist") + + +class GaussianMixtureLike(Protocol): + @property + def n_components(self) -> int: ... + @property + def covariances_(self) -> ArrayLike: ... + + +# Given a fit Gaussian mixture model with N components, +# return the mean of ellipse areas (one ellipse per component). +def get_mean_ellipse_area(gm: GaussianMixtureLike) -> np.floating: + # Adapted from GMM covariances ellipse plotting tutorial. + # Reference: https://scikit-learn.org/stable/auto_examples/mixture/plot_gmm_covariances.html + result = [] + covariances: NDArray[np.float64] = np.asarray(gm.covariances_, dtype=np.float64) + for i in range(gm.n_components): + component_covariances = covariances[i][:2, :2] + v, _ = np.linalg.eigh(component_covariances) + v = 2.0 * np.sqrt(2.0) * np.sqrt(v) + width = v[0] + height = v[1] + result.append(np.pi * width * height) + return np.mean(np.array(result)) + + +def test_densmap_differs_from_umap(): + pbmc = pbmc68k_reduced() + + # Check that the areas of ellipses that result from + # fitting a Gaussian mixture model to the results + # of UMAP and DensMAP are different, + # with DensMAP ellipses having a larger area on average. + random_state = 1234 + mean_area_results = [] + n_components = pbmc.obs["louvain"].unique().shape[0] + for method in ["densmap", "umap"]: + sc.tl.umap(pbmc, method=method, random_state=random_state) + X_map = pbmc.obsm[f"X_{method}"].copy() + gm = GaussianMixture(n_components=n_components, random_state=random_state).fit( + X_map + ) + mean_area_results.append(get_mean_ellipse_area(gm)) + assert mean_area_results[0] > mean_area_results[1]