From c390b4af0c20f1694a1e8392c0148c4b496e576c Mon Sep 17 00:00:00 2001 From: Tian Gao Date: Fri, 16 Jan 2026 14:51:02 -0800 Subject: [PATCH 1/7] Fix the type hint issue in ml/mllib and add scipy requirement --- dev/requirements.txt | 2 +- dev/spark-test-image/lint/Dockerfile | 1 + python/pyspark/ml/_typing.pyi | 8 +++- python/pyspark/ml/linalg/__init__.py | 51 ++++++++++++++----------- python/pyspark/mllib/_typing.pyi | 9 ++++- python/pyspark/mllib/linalg/__init__.py | 49 +++++++++++++----------- 6 files changed, 70 insertions(+), 50 deletions(-) diff --git a/dev/requirements.txt b/dev/requirements.txt index a64f9c4cc50a8..7153a8d71dc85 100644 --- a/dev/requirements.txt +++ b/dev/requirements.txt @@ -6,7 +6,7 @@ numpy>=1.22 pyarrow>=18.0.0 six==1.16.0 pandas>=2.2.0 -scipy +scipy>=1.8.0 plotly<6.0.0 mlflow>=2.3.1 scikit-learn diff --git a/dev/spark-test-image/lint/Dockerfile b/dev/spark-test-image/lint/Dockerfile index 4984f56fc763a..e702745bb9351 100644 --- a/dev/spark-test-image/lint/Dockerfile +++ b/dev/spark-test-image/lint/Dockerfile @@ -99,6 +99,7 @@ RUN python3.11 -m pip install \ 'pyarrow>=22.0.0' \ 'pytest-mypy-plugins==1.9.3' \ 'pytest==7.1.3' \ + 'scipy>=1.8.0' \ && python3.11 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu \ && python3.11 -m pip install torcheval \ && python3.11 -m pip cache purge diff --git a/python/pyspark/ml/_typing.pyi b/python/pyspark/ml/_typing.pyi index a5237dad7521c..2f22b6de0f5ed 100644 --- a/python/pyspark/ml/_typing.pyi +++ b/python/pyspark/ml/_typing.pyi @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -from typing import Any, Dict, List, TypeVar, Tuple, Union +from typing import Any, Dict, List, TypeVar, Tuple, TYPE_CHECKING, Union from typing_extensions import Literal from numpy import ndarray @@ -28,6 +28,9 @@ import pyspark.ml.util from pyspark.ml.linalg import Vector import pyspark.ml.wrapper +if TYPE_CHECKING: + from scipy.sparse import spmatrix, sparray + ParamMap = Dict[pyspark.ml.param.Param, Any] PipelineStage = Union[pyspark.ml.base.Estimator, pyspark.ml.base.Transformer] @@ -81,4 +84,5 @@ RankingEvaluatorMetricType = Union[ Literal["recallAtK"], ] -VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...]] +if TYPE_CHECKING: + VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], spmatrix, sparray, range] diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py index cedd3b04564ec..ee6fc8d21818a 100644 --- a/python/pyspark/ml/linalg/__init__.py +++ b/python/pyspark/ml/linalg/__init__.py @@ -70,7 +70,6 @@ if TYPE_CHECKING: from pyspark.mllib._typing import NormType from pyspark.ml._typing import VectorLike - from scipy.sparse import spmatrix # Check whether we have SciPy. MLlib works without it too, but if we have it, some methods, @@ -85,23 +84,25 @@ _have_scipy = False -def _convert_to_vector(d: Union["VectorLike", "spmatrix", range]) -> "Vector": +def _convert_to_vector(d: "VectorLike") -> "Vector": if isinstance(d, Vector): return d - elif type(d) in (array.array, np.array, np.ndarray, list, tuple, range): + elif isinstance(d, (array.array, np.ndarray, list, tuple, range)): return DenseVector(d) elif _have_scipy and scipy.sparse.issparse(d): - assert cast("spmatrix", d).shape[1] == 1, "Expected column vector" + assert hasattr(d, "shape") + assert d.shape[1] == 1, "Expected column vector" # Make sure the converted csc_matrix has sorted indices. - csc = cast("spmatrix", d).tocsc() + assert hasattr(d, "tocsc") + csc = d.tocsc() if not csc.has_sorted_indices: csc.sort_indices() - return SparseVector(cast("spmatrix", d).shape[0], csc.indices, csc.data) + return SparseVector(d.shape[0], csc.indices, csc.data) else: raise TypeError("Cannot convert type %s into Vector" % type(d)) -def _vector_size(v: Union["VectorLike", "spmatrix", range]) -> int: +def _vector_size(v: "VectorLike") -> int: """ Returns the size of the vector. @@ -124,16 +125,17 @@ def _vector_size(v: Union["VectorLike", "spmatrix", range]) -> int: """ if isinstance(v, Vector): return len(v) - elif type(v) in (array.array, list, tuple, range): + elif isinstance(v, (array.array, list, tuple, range)): return len(v) - elif type(v) == np.ndarray: + elif isinstance(v, np.ndarray): if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1): return len(v) else: raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape)) elif _have_scipy and scipy.sparse.issparse(v): - assert cast("spmatrix", v).shape[1] == 1, "Expected column vector" - return cast("spmatrix", v).shape[0] + assert hasattr(v, "shape") + assert v.shape[1] == 1, "Expected column vector" + return v.shape[0] else: raise TypeError("Cannot treat type %s as a vector" % type(v)) @@ -337,13 +339,13 @@ def __init__(self, ar: Union[bytes, np.ndarray, Iterable[float]]): def __reduce__(self) -> Tuple[Type["DenseVector"], Tuple[bytes]]: return DenseVector, (self.array.tobytes(),) - def numNonzeros(self) -> int: + def numNonzeros(self) -> Union[int, np.intp]: """ Number of nonzero elements. This scans all active values and count non zeros """ return np.count_nonzero(self.array) - def norm(self, p: "NormType") -> np.float64: + def norm(self, p: "NormType") -> np.floating[Any]: """ Calculates the norm of a DenseVector. @@ -386,15 +388,17 @@ def dot(self, other: Iterable[float]) -> np.float64: ... AssertionError: dimension mismatch """ - if type(other) == np.ndarray: + if isinstance(other, np.ndarray): if other.ndim > 1: assert len(self) == other.shape[0], "dimension mismatch" return np.dot(self.array, other) elif _have_scipy and scipy.sparse.issparse(other): - assert len(self) == cast("spmatrix", other).shape[0], "dimension mismatch" - return cast("spmatrix", other).transpose().dot(self.toArray()) + assert hasattr(other, "shape") + assert len(self) == other.shape[0], "dimension mismatch" + assert hasattr(other, "transpose") + return other.transpose().dot(self.toArray()) else: - assert len(self) == _vector_size(other), "dimension mismatch" + assert len(self) == _vector_size(other), "dimension mismatch" # type: ignore[arg-type] if isinstance(other, SparseVector): return other.dot(self) elif isinstance(other, Vector): @@ -429,10 +433,11 @@ def squared_distance(self, other: Iterable[float]) -> np.float64: ... AssertionError: dimension mismatch """ - assert len(self) == _vector_size(other), "dimension mismatch" + assert len(self) == _vector_size(other), "dimension mismatch" # type: ignore[arg-type] if isinstance(other, SparseVector): return other.squared_distance(self) elif _have_scipy and scipy.sparse.issparse(other): + assert isinstance(other, scipy.sparse.spmatrix), "other must be a scipy.sparse.spmatrix" return _convert_to_vector(other).squared_distance(self) # type: ignore[attr-defined] if isinstance(other, Vector): @@ -636,13 +641,13 @@ def __init__( ) assert np.min(self.indices) >= 0, "Contains negative index %d" % (np.min(self.indices)) - def numNonzeros(self) -> int: + def numNonzeros(self) -> Union[int, np.intp]: """ Number of nonzero elements. This scans all active values and count non zeros. """ return np.count_nonzero(self.values) - def norm(self, p: "NormType") -> np.float64: + def norm(self, p: "NormType") -> np.floating[Any]: """ Calculates the norm of a SparseVector. @@ -699,7 +704,7 @@ def dot(self, other: Iterable[float]) -> np.float64: assert len(self) == other.shape[0], "dimension mismatch" return np.dot(self.values, other[self.indices]) - assert len(self) == _vector_size(other), "dimension mismatch" + assert len(self) == _vector_size(other), "dimension mismatch" # type: ignore[arg-type] if isinstance(other, DenseVector): return np.dot(other.array[self.indices], self.values) @@ -717,7 +722,7 @@ def dot(self, other: Iterable[float]) -> np.float64: else: return self.dot(_convert_to_vector(other)) # type: ignore[arg-type] - def squared_distance(self, other: Iterable[float]) -> np.float64: + def squared_distance(self, other: "VectorLike") -> np.float64: """ Squared distance from a SparseVector or 1-dimensional NumPy array. @@ -785,7 +790,7 @@ def squared_distance(self, other: Iterable[float]) -> np.float64: j += 1 return result else: - return self.squared_distance(_convert_to_vector(other)) # type: ignore[arg-type] + return self.squared_distance(_convert_to_vector(other)) def toArray(self) -> np.ndarray: """ diff --git a/python/pyspark/mllib/_typing.pyi b/python/pyspark/mllib/_typing.pyi index ff90cb639f4c3..faffa669cb26a 100644 --- a/python/pyspark/mllib/_typing.pyi +++ b/python/pyspark/mllib/_typing.pyi @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -from typing import List, Tuple, TypeVar, Union +from typing import List, Tuple, TYPE_CHECKING, TypeVar, Union from typing_extensions import Literal from numpy import ndarray # noqa: F401 @@ -24,7 +24,12 @@ from py4j.java_gateway import JavaObject from pyspark.mllib.linalg import Vector -VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...]] +if TYPE_CHECKING: + from scipy.sparse import spmatrix, sparray + + +if TYPE_CHECKING: + VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], spmatrix, sparray, range] C = TypeVar("C", bound=type) JavaObjectOrPickleDump = Union[JavaObject, bytearray, bytes] diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index 40f0255a91bbe..15de94254855c 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -94,23 +94,25 @@ _have_scipy = False -def _convert_to_vector(d: Union["VectorLike", "spmatrix", range]) -> "Vector": +def _convert_to_vector(d: "VectorLike") -> "Vector": if isinstance(d, Vector): return d - elif type(d) in (array.array, np.array, np.ndarray, list, tuple, range): + elif isinstance(d, (array.array, np.ndarray, list, tuple, range)): return DenseVector(d) elif _have_scipy and scipy.sparse.issparse(d): - assert cast("spmatrix", d).shape[1] == 1, "Expected column vector" + assert hasattr(d, "shape") + assert d.shape[1] == 1, "Expected column vector" # Make sure the converted csc_matrix has sorted indices. - csc = cast("spmatrix", d).tocsc() + assert hasattr(d, "tocsc") + csc = d.tocsc() if not csc.has_sorted_indices: csc.sort_indices() - return SparseVector(cast("spmatrix", d).shape[0], csc.indices, csc.data) + return SparseVector(d.shape[0], csc.indices, csc.data) else: raise TypeError("Cannot convert type %s into Vector" % type(d)) -def _vector_size(v: Union["VectorLike", "spmatrix", range]) -> int: +def _vector_size(v: "VectorLike") -> int: """ Returns the size of the vector. @@ -133,16 +135,17 @@ def _vector_size(v: Union["VectorLike", "spmatrix", range]) -> int: """ if isinstance(v, Vector): return len(v) - elif type(v) in (array.array, list, tuple, range): + elif isinstance(v, (array.array, list, tuple, range)): return len(v) - elif type(v) == np.ndarray: + elif isinstance(v, np.ndarray): if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1): return len(v) else: raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape)) elif _have_scipy and scipy.sparse.issparse(v): - assert cast("spmatrix", v).shape[1] == 1, "Expected column vector" - return cast("spmatrix", v).shape[0] + assert hasattr(v, "shape") + assert v.shape[1] == 1, "Expected column vector" + return v.shape[0] else: raise TypeError("Cannot treat type %s as a vector" % type(v)) @@ -390,13 +393,13 @@ def parse(s: str) -> "DenseVector": def __reduce__(self) -> Tuple[Type["DenseVector"], Tuple[bytes]]: return DenseVector, (self.array.tobytes(),) - def numNonzeros(self) -> int: + def numNonzeros(self) -> Union[int, np.intp]: """ Number of nonzero elements. This scans all active values and count non zeros """ return np.count_nonzero(self.array) - def norm(self, p: "NormType") -> np.float64: + def norm(self, p: "NormType") -> np.floating[Any]: """ Calculates the norm of a DenseVector. @@ -410,7 +413,7 @@ def norm(self, p: "NormType") -> np.float64: """ return np.linalg.norm(self.array, p) - def dot(self, other: Iterable[float]) -> np.float64: + def dot(self, other: "VectorLike") -> np.float64: """ Compute the dot product of two Vectors. We support (Numpy array, list, SparseVector, or SciPy sparse) @@ -444,8 +447,10 @@ def dot(self, other: Iterable[float]) -> np.float64: assert len(self) == other.shape[0], "dimension mismatch" return np.dot(self.array, other) elif _have_scipy and scipy.sparse.issparse(other): - assert len(self) == cast("spmatrix", other).shape[0], "dimension mismatch" - return cast("spmatrix", other).transpose().dot(self.toArray()) + assert hasattr(other, "shape") + assert len(self) == other.shape[0], "dimension mismatch" + assert hasattr(other, "transpose") + return other.transpose().dot(self.toArray()) else: assert len(self) == _vector_size(other), "dimension mismatch" if isinstance(other, SparseVector): @@ -455,7 +460,7 @@ def dot(self, other: Iterable[float]) -> np.float64: else: return np.dot(self.toArray(), cast("ArrayLike", other)) - def squared_distance(self, other: Iterable[float]) -> np.float64: + def squared_distance(self, other: "VectorLike") -> np.float64: """ Squared distance of two Vectors. @@ -685,13 +690,13 @@ def __init__( % (self.indices[i], self.indices[i + 1]) ) - def numNonzeros(self) -> int: + def numNonzeros(self) -> Union[int, np.intp]: """ Number of nonzero elements. This scans all active values and count non zeros. """ return np.count_nonzero(self.values) - def norm(self, p: "NormType") -> np.float64: + def norm(self, p: "NormType") -> np.floating[Any]: """ Calculates the norm of a SparseVector. @@ -766,7 +771,7 @@ def parse(s: str) -> "SparseVector": raise ValueError("Unable to parse values from %s." % s) return SparseVector(cast(int, size), indices, values) - def dot(self, other: Iterable[float]) -> np.float64: + def dot(self, other: "VectorLike") -> np.float64: """ Dot product with a SparseVector or 1- or 2-dimensional Numpy array. @@ -822,9 +827,9 @@ def dot(self, other: Iterable[float]) -> np.float64: return np.dot(self_values, other.values[other_cmind]) else: - return self.dot(_convert_to_vector(other)) # type: ignore[arg-type] + return self.dot(_convert_to_vector(other)) - def squared_distance(self, other: Iterable[float]) -> np.float64: + def squared_distance(self, other: "VectorLike") -> np.float64: """ Squared distance from a SparseVector or 1-dimensional NumPy array. @@ -892,7 +897,7 @@ def squared_distance(self, other: Iterable[float]) -> np.float64: j += 1 return result else: - return self.squared_distance(_convert_to_vector(other)) # type: ignore[arg-type] + return self.squared_distance(_convert_to_vector(other)) def toArray(self) -> np.ndarray: """ From 07273acf77445738a5c29484a25a8b80fd73f1a6 Mon Sep 17 00:00:00 2001 From: Tian Gao Date: Fri, 16 Jan 2026 15:15:43 -0800 Subject: [PATCH 2/7] Fix more issues --- python/pyspark/mllib/_typing.pyi | 1 - python/pyspark/mllib/linalg/distributed.py | 2 +- python/pyspark/mllib/regression.py | 2 +- python/pyspark/mllib/stat/_statistics.py | 3 ++- python/pyspark/mllib/util.py | 6 +----- 5 files changed, 5 insertions(+), 9 deletions(-) diff --git a/python/pyspark/mllib/_typing.pyi b/python/pyspark/mllib/_typing.pyi index faffa669cb26a..366efecf315c6 100644 --- a/python/pyspark/mllib/_typing.pyi +++ b/python/pyspark/mllib/_typing.pyi @@ -27,7 +27,6 @@ from pyspark.mllib.linalg import Vector if TYPE_CHECKING: from scipy.sparse import spmatrix, sparray - if TYPE_CHECKING: VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], spmatrix, sparray, range] C = TypeVar("C", bound=type) diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index ecdb4e75ed490..aa98f959798ca 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -35,7 +35,7 @@ VT = TypeVar("VT", bound="Matrix") if TYPE_CHECKING: - from pyspark.ml._typing import VectorLike + from pyspark.mllib._typing import VectorLike __all__ = [ "BlockMatrix", diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index b384e0fa608e8..a69f8c00b221f 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -84,7 +84,7 @@ class LabeledPoint: 'label' and 'features' are accessible as class attributes. """ - def __init__(self, label: float, features: Iterable[float]): + def __init__(self, label: float, features: "VectorLike"): self.label = float(label) self.features = _convert_to_vector(features) diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py index c638fb8195067..e993ced1a419c 100644 --- a/python/pyspark/mllib/stat/_statistics.py +++ b/python/pyspark/mllib/stat/_statistics.py @@ -189,7 +189,8 @@ def corr( if not y: return cast( - JavaObject, callMLlibFunc("corr", x.map(_convert_to_vector), method) + JavaObject, + callMLlibFunc("corr", cast(RDD[Vector], x).map(_convert_to_vector), method), ).toArray() else: return cast( diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py index caa2c9338a959..65b25f8add6d7 100644 --- a/python/pyspark/mllib/util.py +++ b/python/pyspark/mllib/util.py @@ -145,11 +145,7 @@ def loadLibSVMFile( if numFeatures <= 0: parsed.cache() numFeatures = parsed.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1 - return parsed.map( - lambda x: LabeledPoint( - x[0], Vectors.sparse(numFeatures, x[1], x[2]) # type: ignore[arg-type] - ) - ) + return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2]))) @staticmethod def saveAsLibSVMFile(data: RDD["LabeledPoint"], dir: str) -> None: From 2cce30cc90f4fbddbaa821ad25bee46c27053bd3 Mon Sep 17 00:00:00 2001 From: Tian Gao Date: Sat, 17 Jan 2026 11:02:21 -0800 Subject: [PATCH 3/7] Always make VectorLike available --- python/pyspark/ml/_typing.pyi | 11 +++++------ python/pyspark/mllib/_typing.pyi | 9 +++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/python/pyspark/ml/_typing.pyi b/python/pyspark/ml/_typing.pyi index 2f22b6de0f5ed..fe35a83f633ea 100644 --- a/python/pyspark/ml/_typing.pyi +++ b/python/pyspark/ml/_typing.pyi @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -from typing import Any, Dict, List, TypeVar, Tuple, TYPE_CHECKING, Union +from typing import Any, Dict, List, TypeVar, Tuple, Union from typing_extensions import Literal from numpy import ndarray @@ -24,12 +24,14 @@ from py4j.java_gateway import JavaObject import pyspark.ml.base import pyspark.ml.param -import pyspark.ml.util from pyspark.ml.linalg import Vector import pyspark.ml.wrapper -if TYPE_CHECKING: +try: from scipy.sparse import spmatrix, sparray + VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], spmatrix, sparray, range] +except ImportError: + VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], range] ParamMap = Dict[pyspark.ml.param.Param, Any] PipelineStage = Union[pyspark.ml.base.Estimator, pyspark.ml.base.Transformer] @@ -83,6 +85,3 @@ RankingEvaluatorMetricType = Union[ Literal["ndcgAtK"], Literal["recallAtK"], ] - -if TYPE_CHECKING: - VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], spmatrix, sparray, range] diff --git a/python/pyspark/mllib/_typing.pyi b/python/pyspark/mllib/_typing.pyi index 366efecf315c6..2e70a4c59df59 100644 --- a/python/pyspark/mllib/_typing.pyi +++ b/python/pyspark/mllib/_typing.pyi @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -from typing import List, Tuple, TYPE_CHECKING, TypeVar, Union +from typing import List, Tuple, TypeVar, Union from typing_extensions import Literal from numpy import ndarray # noqa: F401 @@ -24,11 +24,12 @@ from py4j.java_gateway import JavaObject from pyspark.mllib.linalg import Vector -if TYPE_CHECKING: +try: from scipy.sparse import spmatrix, sparray - -if TYPE_CHECKING: VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], spmatrix, sparray, range] +except ImportError: + VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], range] + C = TypeVar("C", bound=type) JavaObjectOrPickleDump = Union[JavaObject, bytearray, bytes] From 9c01624c4e52c047076b50c4713d754d7054087d Mon Sep 17 00:00:00 2001 From: Tian Gao Date: Sat, 17 Jan 2026 13:19:31 -0800 Subject: [PATCH 4/7] Reformat --- python/pyspark/ml/_typing.pyi | 1 + python/pyspark/mllib/_typing.pyi | 1 + 2 files changed, 2 insertions(+) diff --git a/python/pyspark/ml/_typing.pyi b/python/pyspark/ml/_typing.pyi index fe35a83f633ea..dbcb38ea5694e 100644 --- a/python/pyspark/ml/_typing.pyi +++ b/python/pyspark/ml/_typing.pyi @@ -29,6 +29,7 @@ import pyspark.ml.wrapper try: from scipy.sparse import spmatrix, sparray + VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], spmatrix, sparray, range] except ImportError: VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], range] diff --git a/python/pyspark/mllib/_typing.pyi b/python/pyspark/mllib/_typing.pyi index 2e70a4c59df59..009635fa2c22b 100644 --- a/python/pyspark/mllib/_typing.pyi +++ b/python/pyspark/mllib/_typing.pyi @@ -26,6 +26,7 @@ from pyspark.mllib.linalg import Vector try: from scipy.sparse import spmatrix, sparray + VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], spmatrix, sparray, range] except ImportError: VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], range] From d86072e112008ced1a0d38d8569c5cdbe00c9a31 Mon Sep 17 00:00:00 2001 From: Tian Gao Date: Sat, 17 Jan 2026 18:09:27 -0800 Subject: [PATCH 5/7] Remove unused import --- python/pyspark/mllib/linalg/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index 15de94254855c..1613e3d2cc8d0 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -61,7 +61,6 @@ if TYPE_CHECKING: from pyspark.mllib._typing import VectorLike, NormType - from scipy.sparse import spmatrix from numpy.typing import ArrayLike From 1e954f66d06936df722fbb90a0f36cbe17852199 Mon Sep 17 00:00:00 2001 From: Tian Gao Date: Sat, 17 Jan 2026 20:32:06 -0800 Subject: [PATCH 6/7] Update type hint --- python/pyspark/ml/_typing.pyi | 10 ++++------ python/pyspark/mllib/_typing.pyi | 10 ++++------ 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/python/pyspark/ml/_typing.pyi b/python/pyspark/ml/_typing.pyi index dbcb38ea5694e..c24dfe577350e 100644 --- a/python/pyspark/ml/_typing.pyi +++ b/python/pyspark/ml/_typing.pyi @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -from typing import Any, Dict, List, TypeVar, Tuple, Union +from typing import Any, Dict, List, TYPE_CHECKING, TypeVar, Tuple, Union from typing_extensions import Literal from numpy import ndarray @@ -27,13 +27,9 @@ import pyspark.ml.param from pyspark.ml.linalg import Vector import pyspark.ml.wrapper -try: +if TYPE_CHECKING: from scipy.sparse import spmatrix, sparray - VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], spmatrix, sparray, range] -except ImportError: - VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], range] - ParamMap = Dict[pyspark.ml.param.Param, Any] PipelineStage = Union[pyspark.ml.base.Estimator, pyspark.ml.base.Transformer] @@ -86,3 +82,5 @@ RankingEvaluatorMetricType = Union[ Literal["ndcgAtK"], Literal["recallAtK"], ] + +VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], "spmatrix", "sparray", range] diff --git a/python/pyspark/mllib/_typing.pyi b/python/pyspark/mllib/_typing.pyi index 009635fa2c22b..d34cfc84c7ae9 100644 --- a/python/pyspark/mllib/_typing.pyi +++ b/python/pyspark/mllib/_typing.pyi @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -from typing import List, Tuple, TypeVar, Union +from typing import List, Tuple, TYPE_CHECKING, TypeVar, Union from typing_extensions import Literal from numpy import ndarray # noqa: F401 @@ -24,16 +24,14 @@ from py4j.java_gateway import JavaObject from pyspark.mllib.linalg import Vector -try: +if TYPE_CHECKING: from scipy.sparse import spmatrix, sparray - VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], spmatrix, sparray, range] -except ImportError: - VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], range] - C = TypeVar("C", bound=type) JavaObjectOrPickleDump = Union[JavaObject, bytearray, bytes] CorrMethodType = Union[Literal["spearman"], Literal["pearson"]] KolmogorovSmirnovTestDistNameType = Literal["norm"] NormType = Union[None, float, Literal["fro"], Literal["nuc"]] + +VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], "spmatrix", "sparray", range] From 44e63be14c468a4769058ecf500bc490a563a2f2 Mon Sep 17 00:00:00 2001 From: Tian Gao Date: Sat, 17 Jan 2026 23:17:36 -0800 Subject: [PATCH 7/7] Update scipy stubs --- dev/spark-test-image/lint/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/spark-test-image/lint/Dockerfile b/dev/spark-test-image/lint/Dockerfile index e702745bb9351..9fd4bcd77e607 100644 --- a/dev/spark-test-image/lint/Dockerfile +++ b/dev/spark-test-image/lint/Dockerfile @@ -100,6 +100,7 @@ RUN python3.11 -m pip install \ 'pytest-mypy-plugins==1.9.3' \ 'pytest==7.1.3' \ 'scipy>=1.8.0' \ + 'scipy-stubs' \ && python3.11 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu \ && python3.11 -m pip install torcheval \ && python3.11 -m pip cache purge