From bd5a121cb33adcd759895c27eb37ebd3f3bbbff5 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 23 Jan 2026 13:48:47 +0100 Subject: [PATCH 1/4] perf: try out new cliping numba kernel --- benchmarks/benchmarks/preprocessing_log.py | 33 +++++++++++------- pyproject.toml | 2 ++ .../preprocessing/_highly_variable_genes.py | 34 +++++++++++++++---- 3 files changed, 50 insertions(+), 19 deletions(-) diff --git a/benchmarks/benchmarks/preprocessing_log.py b/benchmarks/benchmarks/preprocessing_log.py index 9633c8e208..8d79ce88e2 100644 --- a/benchmarks/benchmarks/preprocessing_log.py +++ b/benchmarks/benchmarks/preprocessing_log.py @@ -34,7 +34,7 @@ class PreprocessingSuite: # noqa: D101 def setup_cache(self) -> None: """Without this caching, asv was running several processes which meant the data was repeatedly downloaded.""" - for dataset, layer in product(*self.params): + for dataset, layer in product(*self.params[:2]): adata, _ = get_dataset(dataset, layer=layer) adata.write_h5ad(f"{dataset}_{layer}.h5ad") @@ -47,17 +47,6 @@ def time_pca(self, *_) -> None: def peakmem_pca(self, *_) -> None: sc.pp.pca(self.adata, svd_solver="arpack") - def time_highly_variable_genes(self, *_) -> None: - # the default flavor runs on log-transformed data - sc.pp.highly_variable_genes( - self.adata, min_mean=0.0125, max_mean=3, min_disp=0.5 - ) - - def peakmem_highly_variable_genes(self, *_) -> None: - sc.pp.highly_variable_genes( - self.adata, min_mean=0.0125, max_mean=3, min_disp=0.5 - ) - # regress_out is very slow for this dataset @skip_when(dataset={"pbmc3k"}) def time_regress_out(self, *_) -> None: @@ -72,3 +61,23 @@ def time_scale(self, *_) -> None: def peakmem_scale(self, *_) -> None: sc.pp.scale(self.adata, max_value=10) + + +class HVGSuite(PreprocessingSuite): # noqa: D101 + params = (*params, ["seurat_v3", "cell_ranger", "seurat"]) + param_names = (*param_names, "flavor") + + def setup(self, dataset, layer, flavor) -> None: + self.adata = ad.read_h5ad(f"{dataset}_{layer}.h5ad") + self.flavor = flavor + + def time_highly_variable_genes(self, *_) -> None: + # the default flavor runs on log-transformed data + sc.pp.highly_variable_genes( + self.adata, min_mean=0.0125, max_mean=3, min_disp=0.5, flavor=self.flavor + ) + + def peakmem_highly_variable_genes(self, *_) -> None: + sc.pp.highly_variable_genes( + self.adata, min_mean=0.0125, max_mean=3, min_disp=0.5, flavor=self.flavor + ) diff --git a/pyproject.toml b/pyproject.toml index c61961606c..16fcae67d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -198,6 +198,8 @@ filterwarnings = [ "ignore:.*'(parseAll)'.*'(parse_all)':DeprecationWarning", # igraph vs leidenalg warning "ignore:The `igraph` implementation of leiden clustering:UserWarning", + "ignore:Detected unsupported threading environment:UserWarning", + "ignore:Cannot cache compiled function", ] [tool.coverage.run] diff --git a/src/scanpy/preprocessing/_highly_variable_genes.py b/src/scanpy/preprocessing/_highly_variable_genes.py index 26d5a7763e..845b1fd21e 100644 --- a/src/scanpy/preprocessing/_highly_variable_genes.py +++ b/src/scanpy/preprocessing/_highly_variable_genes.py @@ -13,7 +13,7 @@ from fast_array_utils import stats from .. import logging as logg -from .._compat import CSBase, CSRBase, DaskArray, old_positionals, warn +from .._compat import CSBase, CSRBase, DaskArray, njit, old_positionals, warn from .._settings import Verbosity, settings from .._utils import ( check_nonnegative_integers, @@ -98,8 +98,7 @@ def _(data_batch: CSBase, clip_val: np.ndarray) -> tuple[np.ndarray, np.ndarray] ) -# parallel=False needed for accuracy -@numba.njit(cache=True, parallel=False) # noqa: TID251 +@njit def _sum_and_sum_squares_clipped( indices: NDArray[np.integer], data: NDArray[np.floating], @@ -108,13 +107,34 @@ def _sum_and_sum_squares_clipped( clip_val: NDArray[np.float64], nnz: int, ) -> tuple[NDArray[np.float64], NDArray[np.float64]]: - squared_batch_counts_sum = np.zeros(n_cols, dtype=np.float64) - batch_counts_sum = np.zeros(n_cols, dtype=np.float64) + """ + Parallel implementation using thread-local buffers to avoid race conditions. + + Previous implementation used parallel=False due to race condition on shared arrays. + This version uses explicit thread-local reduction to restore both correctness + and parallelism. + """ + # Thread-local accumulators for parallel reduction + n_threads = numba.get_num_threads() + squared_local = np.zeros((n_threads, n_cols), dtype=np.float64) + sum_local = np.zeros((n_threads, n_cols), dtype=np.float64) + + # Parallel accumulation into thread-local buffers (no race condition) for i in numba.prange(nnz): + tid = numba.get_thread_id() idx = indices[i] element = min(np.float64(data[i]), clip_val[idx]) - squared_batch_counts_sum[idx] += element**2 - batch_counts_sum[idx] += element + squared_local[tid, idx] += element**2 + sum_local[tid, idx] += element + + # Reduction phase: combine thread-local results + squared_batch_counts_sum = np.zeros(n_cols, dtype=np.float64) + batch_counts_sum = np.zeros(n_cols, dtype=np.float64) + + for t in range(n_threads): + for j in range(n_cols): + squared_batch_counts_sum[j] += squared_local[t, j] + batch_counts_sum[j] += sum_local[t, j] return squared_batch_counts_sum, batch_counts_sum From 94b9b53df3477a39d0c71758b4484b8bbab82bc1 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 23 Jan 2026 13:56:57 +0100 Subject: [PATCH 2/4] fix: try scikit misc --- benchmarks/asv.conf.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json index d19b822178..32b9ca1a8f 100644 --- a/benchmarks/asv.conf.json +++ b/benchmarks/asv.conf.json @@ -83,7 +83,7 @@ // "psutil": [""] "pooch": [""], "scikit-image": [""], // https://github.com/conda-forge/scikit-misc-feedstock/pull/29 - // "scikit-misc": [""], + "scikit-misc": [""], }, // Combinations of libraries/python versions can be excluded/included @@ -104,7 +104,7 @@ // - environment_type // Environment type, as above. // - sys_platform - // Platform, as in sys.platform. Possible values for the common + // Platform, as in sys.platform. Possible values for the commonπ // cases: 'linux2', 'win32', 'cygwin', 'darwin'. // // "exclude": [ From de3ba0f3a814979adbc2d7764279a8f34facace7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 23 Jan 2026 14:05:45 +0100 Subject: [PATCH 3/4] chore: try other datasets --- benchmarks/benchmarks/preprocessing_log.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmarks/preprocessing_log.py b/benchmarks/benchmarks/preprocessing_log.py index 8d79ce88e2..3390d32795 100644 --- a/benchmarks/benchmarks/preprocessing_log.py +++ b/benchmarks/benchmarks/preprocessing_log.py @@ -63,12 +63,17 @@ def peakmem_scale(self, *_) -> None: sc.pp.scale(self.adata, max_value=10) -class HVGSuite(PreprocessingSuite): # noqa: D101 - params = (*params, ["seurat_v3", "cell_ranger", "seurat"]) - param_names = (*param_names, "flavor") +class HVGSuite: # noqa: D101 + params = (["seurat_v3", "cell_ranger", "seurat"],) + param_names = ("flavor",) + + def setup_cache(self) -> None: + """Without this caching, asv was running several processes which meant the data was repeatedly downloaded.""" + adata, _ = get_dataset("pbmc3k") + adata.write_h5ad("pbmc3k.h5ad") def setup(self, dataset, layer, flavor) -> None: - self.adata = ad.read_h5ad(f"{dataset}_{layer}.h5ad") + self.adata = sc.pp.filter_genes(ad.read_h5ad("pbmc3k.h5ad"), min_cells=3) self.flavor = flavor def time_highly_variable_genes(self, *_) -> None: From c462044e38d186e0907c31fbceda518df1eb32d8 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 23 Jan 2026 14:15:31 +0100 Subject: [PATCH 4/4] fix: oops --- benchmarks/benchmarks/preprocessing_log.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmarks/preprocessing_log.py b/benchmarks/benchmarks/preprocessing_log.py index 3390d32795..318a8ce82c 100644 --- a/benchmarks/benchmarks/preprocessing_log.py +++ b/benchmarks/benchmarks/preprocessing_log.py @@ -72,8 +72,9 @@ def setup_cache(self) -> None: adata, _ = get_dataset("pbmc3k") adata.write_h5ad("pbmc3k.h5ad") - def setup(self, dataset, layer, flavor) -> None: - self.adata = sc.pp.filter_genes(ad.read_h5ad("pbmc3k.h5ad"), min_cells=3) + def setup(self, flavor) -> None: + self.adata = ad.read_h5ad("pbmc3k.h5ad") + sc.pp.filter_genes(self.adata, min_cells=3) self.flavor = flavor def time_highly_variable_genes(self, *_) -> None: