From 0bd14644428eaadd9b71c7d43f9d7cd2509a62d5 Mon Sep 17 00:00:00 2001 From: ggit12 <115842122+ggit12@users.noreply.github.com> Date: Sun, 5 Nov 2023 13:20:56 -0800 Subject: [PATCH 1/6] Update _simple.py implemented the "add_intercept" argument to regress_out() --- scanpy/preprocessing/_simple.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/scanpy/preprocessing/_simple.py b/scanpy/preprocessing/_simple.py index 754424db85..671b054a8d 100644 --- a/scanpy/preprocessing/_simple.py +++ b/scanpy/preprocessing/_simple.py @@ -571,6 +571,7 @@ def regress_out( layer: Optional[str] = None, n_jobs: Optional[int] = None, copy: bool = False, + add_intercept: bool = False ) -> Optional[AnnData]: """\ Regress out (mostly) unwanted sources of variation. @@ -584,14 +585,16 @@ def regress_out( adata The annotated data matrix. keys - Keys for observation annotation on which to regress on. + Keys for observation annotation on which to regress. layer - If provided, which element of layers to regress on. + If provided, which element of layers to use in regression. n_jobs Number of jobs for parallel computation. `None` means using :attr:`scanpy._settings.ScanpyConfig.n_jobs`. copy Determines whether a copy of `adata` is returned. + add_intercept + If True, regress_out will add intercept back to residuals in order to transform results back into gene-count space. Defaults to False Returns ------- @@ -697,10 +700,16 @@ def _regress_out_chunk(data): else: regres = regressors try: - result = sm.GLM( - data_chunk[:, col_index], regres, family=sm.families.Gaussian() - ).fit() - new_column = result.resid_response + if add_intercept: + result = sm.GLM( + sm.add_constant(data_chunk[:, col_index]), regres, family=sm.families.Gaussian() + ).fit() + new_column = result.resid_response + result.params[0] + else: + result = sm.GLM( + data_chunk[:, col_index], regres, family=sm.families.Gaussian() + ).fit() + new_column = result.resid_response except PerfectSeparationError: # this emulates R's behavior logg.warning("Encountered PerfectSeparationError, setting to 0 as in R.") new_column = np.zeros(data_chunk.shape[0]) From 80d3106dc164e2418ad45fe63195fabd3c96612e Mon Sep 17 00:00:00 2001 From: ggit12 <115842122+ggit12@users.noreply.github.com> Date: Sun, 5 Nov 2023 14:39:53 -0800 Subject: [PATCH 2/6] Update _simple.py Finished including add_intercept into regress_out --- scanpy/preprocessing/_simple.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scanpy/preprocessing/_simple.py b/scanpy/preprocessing/_simple.py index 671b054a8d..554d4a2fb7 100644 --- a/scanpy/preprocessing/_simple.py +++ b/scanpy/preprocessing/_simple.py @@ -663,7 +663,7 @@ def regress_out( regres = regressors_chunk[idx] else: regres = regressors - tasks.append(tuple((data_chunk, regres, variable_is_categorical))) + tasks.append(tuple((data_chunk, regres, variable_is_categorical, add_intercept))) from joblib import Parallel, delayed @@ -683,6 +683,7 @@ def _regress_out_chunk(data): data_chunk = data[0] regressors = data[1] variable_is_categorical = data[2] + add_intercept = data[3] responses_chunk_list = [] import statsmodels.api as sm From d2e92f3e7c22017d087c187d771c47917c4ef50f Mon Sep 17 00:00:00 2001 From: ggit12 <115842122+ggit12@users.noreply.github.com> Date: Sun, 5 Nov 2023 14:57:33 -0800 Subject: [PATCH 3/6] Update _simple.py fixed regression model --- scanpy/preprocessing/_simple.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scanpy/preprocessing/_simple.py b/scanpy/preprocessing/_simple.py index 554d4a2fb7..03bff905a9 100644 --- a/scanpy/preprocessing/_simple.py +++ b/scanpy/preprocessing/_simple.py @@ -702,11 +702,14 @@ def _regress_out_chunk(data): regres = regressors try: if add_intercept: + #add constant to regres to get intercept in results result = sm.GLM( - sm.add_constant(data_chunk[:, col_index]), regres, family=sm.families.Gaussian() + data_chunk[:, col_index], sm.add_constant(regres), family=sm.families.Gaussian() ).fit() + #calculat result as resid + intercept new_column = result.resid_response + result.params[0] else: + #don't add intercept result = sm.GLM( data_chunk[:, col_index], regres, family=sm.families.Gaussian() ).fit() From bce2bdc36eb0da2af28c44ff00efdce6e1ae6adf Mon Sep 17 00:00:00 2001 From: ggit12 <115842122+ggit12@users.noreply.github.com> Date: Sun, 5 Nov 2023 15:11:21 -0800 Subject: [PATCH 4/6] Update _simple.py fixed indexing --- scanpy/preprocessing/_simple.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpy/preprocessing/_simple.py b/scanpy/preprocessing/_simple.py index 03bff905a9..d4936d7963 100644 --- a/scanpy/preprocessing/_simple.py +++ b/scanpy/preprocessing/_simple.py @@ -707,7 +707,7 @@ def _regress_out_chunk(data): data_chunk[:, col_index], sm.add_constant(regres), family=sm.families.Gaussian() ).fit() #calculat result as resid + intercept - new_column = result.resid_response + result.params[0] + new_column = result.resid_response + result.params.iloc[0] else: #don't add intercept result = sm.GLM( From 8dbd9efcb656e1ddb72b376ba8e650254d0a23b3 Mon Sep 17 00:00:00 2001 From: Philipp A Date: Tue, 7 Nov 2023 09:54:36 +0100 Subject: [PATCH 5/6] Add data structure --- scanpy/preprocessing/_simple.py | 45 ++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/scanpy/preprocessing/_simple.py b/scanpy/preprocessing/_simple.py index d4936d7963..2b2aa096aa 100644 --- a/scanpy/preprocessing/_simple.py +++ b/scanpy/preprocessing/_simple.py @@ -2,14 +2,17 @@ Compositions of these functions are found in sc.preprocess.recipes. """ +from __future__ import annotations + from functools import singledispatch from numbers import Number import warnings -from typing import Union, Optional, Tuple, Collection, Sequence, Iterable, Literal +from typing import Union, Optional, Tuple, Collection, Sequence, Iterable, Literal, NamedTuple import numba import numpy as np import scipy as sp +import pandas as pd from scipy.sparse import issparse, isspmatrix_csr, csr_matrix, spmatrix from sklearn.utils import sparsefuncs, check_array from pandas.api.types import CategoricalDtype @@ -565,6 +568,13 @@ def normalize_per_cell( return X if copy else None +class RegressionTask(NamedTuple): + data_chunk: np.ndarray + regressors: np.ndarray | pd.DataFrame + variable_is_categorical: bool + add_intercept: bool + + def regress_out( adata: AnnData, keys: Union[str, Sequence[str]], @@ -663,7 +673,7 @@ def regress_out( regres = regressors_chunk[idx] else: regres = regressors - tasks.append(tuple((data_chunk, regres, variable_is_categorical, add_intercept))) + tasks.append(RegressionTask(data_chunk, regres, variable_is_categorical, add_intercept)) from joblib import Parallel, delayed @@ -677,46 +687,39 @@ def regress_out( return adata if copy else None -def _regress_out_chunk(data): - # data is a tuple containing the selected columns from adata.X - # and the regressors dataFrame - data_chunk = data[0] - regressors = data[1] - variable_is_categorical = data[2] - add_intercept = data[3] - - responses_chunk_list = [] +def _regress_out_chunk(task: RegressionTask): import statsmodels.api as sm from statsmodels.tools.sm_exceptions import PerfectSeparationError - for col_index in range(data_chunk.shape[1]): + responses_chunk_list = [] + for col_index in range(task.data_chunk.shape[1]): # if all values are identical, the statsmodel.api.GLM throws an error; # but then no regression is necessary anyways... - if not (data_chunk[:, col_index] != data_chunk[0, col_index]).any(): - responses_chunk_list.append(data_chunk[:, col_index]) + if not (task.data_chunk[:, col_index] != task.data_chunk[0, col_index]).any(): + responses_chunk_list.append(task.data_chunk[:, col_index]) continue - if variable_is_categorical: - regres = np.c_[np.ones(regressors.shape[0]), regressors[:, col_index]] + if task.variable_is_categorical: + regres = np.c_[np.ones(task.regressors.shape[0]), task.regressors[:, col_index]] else: - regres = regressors + regres = task.regressors try: - if add_intercept: + if task.add_intercept: #add constant to regres to get intercept in results result = sm.GLM( - data_chunk[:, col_index], sm.add_constant(regres), family=sm.families.Gaussian() + task.data_chunk[:, col_index], sm.add_constant(regres), family=sm.families.Gaussian() ).fit() #calculat result as resid + intercept new_column = result.resid_response + result.params.iloc[0] else: #don't add intercept result = sm.GLM( - data_chunk[:, col_index], regres, family=sm.families.Gaussian() + task.data_chunk[:, col_index], regres, family=sm.families.Gaussian() ).fit() new_column = result.resid_response except PerfectSeparationError: # this emulates R's behavior logg.warning("Encountered PerfectSeparationError, setting to 0 as in R.") - new_column = np.zeros(data_chunk.shape[0]) + new_column = np.zeros(task.data_chunk.shape[0]) responses_chunk_list.append(new_column) From 5b9bc7225b520e1c35c8f60412ee39e21aff227c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 Nov 2023 08:56:33 +0000 Subject: [PATCH 6/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scanpy/preprocessing/_simple.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/scanpy/preprocessing/_simple.py b/scanpy/preprocessing/_simple.py index 2b2aa096aa..5434f9b37f 100644 --- a/scanpy/preprocessing/_simple.py +++ b/scanpy/preprocessing/_simple.py @@ -7,7 +7,16 @@ from functools import singledispatch from numbers import Number import warnings -from typing import Union, Optional, Tuple, Collection, Sequence, Iterable, Literal, NamedTuple +from typing import ( + Union, + Optional, + Tuple, + Collection, + Sequence, + Iterable, + Literal, + NamedTuple, +) import numba import numpy as np @@ -581,7 +590,7 @@ def regress_out( layer: Optional[str] = None, n_jobs: Optional[int] = None, copy: bool = False, - add_intercept: bool = False + add_intercept: bool = False, ) -> Optional[AnnData]: """\ Regress out (mostly) unwanted sources of variation. @@ -673,7 +682,9 @@ def regress_out( regres = regressors_chunk[idx] else: regres = regressors - tasks.append(RegressionTask(data_chunk, regres, variable_is_categorical, add_intercept)) + tasks.append( + RegressionTask(data_chunk, regres, variable_is_categorical, add_intercept) + ) from joblib import Parallel, delayed @@ -700,19 +711,23 @@ def _regress_out_chunk(task: RegressionTask): continue if task.variable_is_categorical: - regres = np.c_[np.ones(task.regressors.shape[0]), task.regressors[:, col_index]] + regres = np.c_[ + np.ones(task.regressors.shape[0]), task.regressors[:, col_index] + ] else: regres = task.regressors try: if task.add_intercept: - #add constant to regres to get intercept in results + # add constant to regres to get intercept in results result = sm.GLM( - task.data_chunk[:, col_index], sm.add_constant(regres), family=sm.families.Gaussian() + task.data_chunk[:, col_index], + sm.add_constant(regres), + family=sm.families.Gaussian(), ).fit() - #calculat result as resid + intercept + # calculat result as resid + intercept new_column = result.resid_response + result.params.iloc[0] else: - #don't add intercept + # don't add intercept result = sm.GLM( task.data_chunk[:, col_index], regres, family=sm.families.Gaussian() ).fit()