From c390b4af0c20f1694a1e8392c0148c4b496e576c Mon Sep 17 00:00:00 2001
From: Tian Gao <gaogaotiantian@hotmail.com>
Date: Fri, 16 Jan 2026 14:51:02 -0800
Subject: [PATCH 1/7] Fix the type hint issue in ml/mllib and add scipy
 requirement

---
 dev/requirements.txt                    |  2 +-
 dev/spark-test-image/lint/Dockerfile    |  1 +
 python/pyspark/ml/_typing.pyi           |  8 +++-
 python/pyspark/ml/linalg/__init__.py    | 51 ++++++++++++++-----------
 python/pyspark/mllib/_typing.pyi        |  9 ++++-
 python/pyspark/mllib/linalg/__init__.py | 49 +++++++++++++-----------
 6 files changed, 70 insertions(+), 50 deletions(-)

diff --git a/dev/requirements.txt b/dev/requirements.txt
index a64f9c4cc50a8..7153a8d71dc85 100644
--- a/dev/requirements.txt
+++ b/dev/requirements.txt
@@ -6,7 +6,7 @@ numpy>=1.22
 pyarrow>=18.0.0
 six==1.16.0
 pandas>=2.2.0
-scipy
+scipy>=1.8.0
 plotly<6.0.0
 mlflow>=2.3.1
 scikit-learn
diff --git a/dev/spark-test-image/lint/Dockerfile b/dev/spark-test-image/lint/Dockerfile
index 4984f56fc763a..e702745bb9351 100644
--- a/dev/spark-test-image/lint/Dockerfile
+++ b/dev/spark-test-image/lint/Dockerfile
@@ -99,6 +99,7 @@ RUN python3.11 -m pip install \
     'pyarrow>=22.0.0' \
     'pytest-mypy-plugins==1.9.3' \
     'pytest==7.1.3' \
+    'scipy>=1.8.0' \
     && python3.11 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu \
     && python3.11 -m pip install torcheval \
     && python3.11 -m pip cache purge
diff --git a/python/pyspark/ml/_typing.pyi b/python/pyspark/ml/_typing.pyi
index a5237dad7521c..2f22b6de0f5ed 100644
--- a/python/pyspark/ml/_typing.pyi
+++ b/python/pyspark/ml/_typing.pyi
@@ -16,7 +16,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from typing import Any, Dict, List, TypeVar, Tuple, Union
+from typing import Any, Dict, List, TypeVar, Tuple, TYPE_CHECKING, Union
 from typing_extensions import Literal
 
 from numpy import ndarray
@@ -28,6 +28,9 @@ import pyspark.ml.util
 from pyspark.ml.linalg import Vector
 import pyspark.ml.wrapper
 
+if TYPE_CHECKING:
+    from scipy.sparse import spmatrix, sparray
+
 ParamMap = Dict[pyspark.ml.param.Param, Any]
 PipelineStage = Union[pyspark.ml.base.Estimator, pyspark.ml.base.Transformer]
 
@@ -81,4 +84,5 @@ RankingEvaluatorMetricType = Union[
     Literal["recallAtK"],
 ]
 
-VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...]]
+if TYPE_CHECKING:
+    VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], spmatrix, sparray, range]
diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py
index cedd3b04564ec..ee6fc8d21818a 100644
--- a/python/pyspark/ml/linalg/__init__.py
+++ b/python/pyspark/ml/linalg/__init__.py
@@ -70,7 +70,6 @@
 if TYPE_CHECKING:
     from pyspark.mllib._typing import NormType
     from pyspark.ml._typing import VectorLike
-    from scipy.sparse import spmatrix
 
 
 # Check whether we have SciPy. MLlib works without it too, but if we have it, some methods,
@@ -85,23 +84,25 @@
     _have_scipy = False
 
 
-def _convert_to_vector(d: Union["VectorLike", "spmatrix", range]) -> "Vector":
+def _convert_to_vector(d: "VectorLike") -> "Vector":
     if isinstance(d, Vector):
         return d
-    elif type(d) in (array.array, np.array, np.ndarray, list, tuple, range):
+    elif isinstance(d, (array.array, np.ndarray, list, tuple, range)):
         return DenseVector(d)
     elif _have_scipy and scipy.sparse.issparse(d):
-        assert cast("spmatrix", d).shape[1] == 1, "Expected column vector"
+        assert hasattr(d, "shape")
+        assert d.shape[1] == 1, "Expected column vector"
         # Make sure the converted csc_matrix has sorted indices.
-        csc = cast("spmatrix", d).tocsc()
+        assert hasattr(d, "tocsc")
+        csc = d.tocsc()
         if not csc.has_sorted_indices:
             csc.sort_indices()
-        return SparseVector(cast("spmatrix", d).shape[0], csc.indices, csc.data)
+        return SparseVector(d.shape[0], csc.indices, csc.data)
     else:
         raise TypeError("Cannot convert type %s into Vector" % type(d))
 
 
-def _vector_size(v: Union["VectorLike", "spmatrix", range]) -> int:
+def _vector_size(v: "VectorLike") -> int:
     """
     Returns the size of the vector.
 
@@ -124,16 +125,17 @@ def _vector_size(v: Union["VectorLike", "spmatrix", range]) -> int:
     """
     if isinstance(v, Vector):
         return len(v)
-    elif type(v) in (array.array, list, tuple, range):
+    elif isinstance(v, (array.array, list, tuple, range)):
         return len(v)
-    elif type(v) == np.ndarray:
+    elif isinstance(v, np.ndarray):
         if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1):
             return len(v)
         else:
             raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape))
     elif _have_scipy and scipy.sparse.issparse(v):
-        assert cast("spmatrix", v).shape[1] == 1, "Expected column vector"
-        return cast("spmatrix", v).shape[0]
+        assert hasattr(v, "shape")
+        assert v.shape[1] == 1, "Expected column vector"
+        return v.shape[0]
     else:
         raise TypeError("Cannot treat type %s as a vector" % type(v))
 
@@ -337,13 +339,13 @@ def __init__(self, ar: Union[bytes, np.ndarray, Iterable[float]]):
     def __reduce__(self) -> Tuple[Type["DenseVector"], Tuple[bytes]]:
         return DenseVector, (self.array.tobytes(),)
 
-    def numNonzeros(self) -> int:
+    def numNonzeros(self) -> Union[int, np.intp]:
         """
         Number of nonzero elements. This scans all active values and count non zeros
         """
         return np.count_nonzero(self.array)
 
-    def norm(self, p: "NormType") -> np.float64:
+    def norm(self, p: "NormType") -> np.floating[Any]:
         """
         Calculates the norm of a DenseVector.
 
@@ -386,15 +388,17 @@ def dot(self, other: Iterable[float]) -> np.float64:
             ...
         AssertionError: dimension mismatch
         """
-        if type(other) == np.ndarray:
+        if isinstance(other, np.ndarray):
             if other.ndim > 1:
                 assert len(self) == other.shape[0], "dimension mismatch"
             return np.dot(self.array, other)
         elif _have_scipy and scipy.sparse.issparse(other):
-            assert len(self) == cast("spmatrix", other).shape[0], "dimension mismatch"
-            return cast("spmatrix", other).transpose().dot(self.toArray())
+            assert hasattr(other, "shape")
+            assert len(self) == other.shape[0], "dimension mismatch"
+            assert hasattr(other, "transpose")
+            return other.transpose().dot(self.toArray())
         else:
-            assert len(self) == _vector_size(other), "dimension mismatch"
+            assert len(self) == _vector_size(other), "dimension mismatch"  # type: ignore[arg-type]
             if isinstance(other, SparseVector):
                 return other.dot(self)
             elif isinstance(other, Vector):
@@ -429,10 +433,11 @@ def squared_distance(self, other: Iterable[float]) -> np.float64:
             ...
         AssertionError: dimension mismatch
         """
-        assert len(self) == _vector_size(other), "dimension mismatch"
+        assert len(self) == _vector_size(other), "dimension mismatch"  # type: ignore[arg-type]
         if isinstance(other, SparseVector):
             return other.squared_distance(self)
         elif _have_scipy and scipy.sparse.issparse(other):
+            assert isinstance(other, scipy.sparse.spmatrix), "other must be a scipy.sparse.spmatrix"
             return _convert_to_vector(other).squared_distance(self)  # type: ignore[attr-defined]
 
         if isinstance(other, Vector):
@@ -636,13 +641,13 @@ def __init__(
             )
             assert np.min(self.indices) >= 0, "Contains negative index %d" % (np.min(self.indices))
 
-    def numNonzeros(self) -> int:
+    def numNonzeros(self) -> Union[int, np.intp]:
         """
         Number of nonzero elements. This scans all active values and count non zeros.
         """
         return np.count_nonzero(self.values)
 
-    def norm(self, p: "NormType") -> np.float64:
+    def norm(self, p: "NormType") -> np.floating[Any]:
         """
         Calculates the norm of a SparseVector.
 
@@ -699,7 +704,7 @@ def dot(self, other: Iterable[float]) -> np.float64:
             assert len(self) == other.shape[0], "dimension mismatch"
             return np.dot(self.values, other[self.indices])
 
-        assert len(self) == _vector_size(other), "dimension mismatch"
+        assert len(self) == _vector_size(other), "dimension mismatch"  # type: ignore[arg-type]
 
         if isinstance(other, DenseVector):
             return np.dot(other.array[self.indices], self.values)
@@ -717,7 +722,7 @@ def dot(self, other: Iterable[float]) -> np.float64:
         else:
             return self.dot(_convert_to_vector(other))  # type: ignore[arg-type]
 
-    def squared_distance(self, other: Iterable[float]) -> np.float64:
+    def squared_distance(self, other: "VectorLike") -> np.float64:
         """
         Squared distance from a SparseVector or 1-dimensional NumPy array.
 
@@ -785,7 +790,7 @@ def squared_distance(self, other: Iterable[float]) -> np.float64:
                 j += 1
             return result
         else:
-            return self.squared_distance(_convert_to_vector(other))  # type: ignore[arg-type]
+            return self.squared_distance(_convert_to_vector(other))
 
     def toArray(self) -> np.ndarray:
         """
diff --git a/python/pyspark/mllib/_typing.pyi b/python/pyspark/mllib/_typing.pyi
index ff90cb639f4c3..faffa669cb26a 100644
--- a/python/pyspark/mllib/_typing.pyi
+++ b/python/pyspark/mllib/_typing.pyi
@@ -16,7 +16,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from typing import List, Tuple, TypeVar, Union
+from typing import List, Tuple, TYPE_CHECKING, TypeVar, Union
 
 from typing_extensions import Literal
 from numpy import ndarray  # noqa: F401
@@ -24,7 +24,12 @@ from py4j.java_gateway import JavaObject
 
 from pyspark.mllib.linalg import Vector
 
-VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...]]
+if TYPE_CHECKING:
+    from scipy.sparse import spmatrix, sparray
+
+
+if TYPE_CHECKING:
+    VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], spmatrix, sparray, range]
 C = TypeVar("C", bound=type)
 JavaObjectOrPickleDump = Union[JavaObject, bytearray, bytes]
 
diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py
index 40f0255a91bbe..15de94254855c 100644
--- a/python/pyspark/mllib/linalg/__init__.py
+++ b/python/pyspark/mllib/linalg/__init__.py
@@ -94,23 +94,25 @@
     _have_scipy = False
 
 
-def _convert_to_vector(d: Union["VectorLike", "spmatrix", range]) -> "Vector":
+def _convert_to_vector(d: "VectorLike") -> "Vector":
     if isinstance(d, Vector):
         return d
-    elif type(d) in (array.array, np.array, np.ndarray, list, tuple, range):
+    elif isinstance(d, (array.array, np.ndarray, list, tuple, range)):
         return DenseVector(d)
     elif _have_scipy and scipy.sparse.issparse(d):
-        assert cast("spmatrix", d).shape[1] == 1, "Expected column vector"
+        assert hasattr(d, "shape")
+        assert d.shape[1] == 1, "Expected column vector"
         # Make sure the converted csc_matrix has sorted indices.
-        csc = cast("spmatrix", d).tocsc()
+        assert hasattr(d, "tocsc")
+        csc = d.tocsc()
         if not csc.has_sorted_indices:
             csc.sort_indices()
-        return SparseVector(cast("spmatrix", d).shape[0], csc.indices, csc.data)
+        return SparseVector(d.shape[0], csc.indices, csc.data)
     else:
         raise TypeError("Cannot convert type %s into Vector" % type(d))
 
 
-def _vector_size(v: Union["VectorLike", "spmatrix", range]) -> int:
+def _vector_size(v: "VectorLike") -> int:
     """
     Returns the size of the vector.
 
@@ -133,16 +135,17 @@ def _vector_size(v: Union["VectorLike", "spmatrix", range]) -> int:
     """
     if isinstance(v, Vector):
         return len(v)
-    elif type(v) in (array.array, list, tuple, range):
+    elif isinstance(v, (array.array, list, tuple, range)):
         return len(v)
-    elif type(v) == np.ndarray:
+    elif isinstance(v, np.ndarray):
         if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1):
             return len(v)
         else:
             raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape))
     elif _have_scipy and scipy.sparse.issparse(v):
-        assert cast("spmatrix", v).shape[1] == 1, "Expected column vector"
-        return cast("spmatrix", v).shape[0]
+        assert hasattr(v, "shape")
+        assert v.shape[1] == 1, "Expected column vector"
+        return v.shape[0]
     else:
         raise TypeError("Cannot treat type %s as a vector" % type(v))
 
@@ -390,13 +393,13 @@ def parse(s: str) -> "DenseVector":
     def __reduce__(self) -> Tuple[Type["DenseVector"], Tuple[bytes]]:
         return DenseVector, (self.array.tobytes(),)
 
-    def numNonzeros(self) -> int:
+    def numNonzeros(self) -> Union[int, np.intp]:
         """
         Number of nonzero elements. This scans all active values and count non zeros
         """
         return np.count_nonzero(self.array)
 
-    def norm(self, p: "NormType") -> np.float64:
+    def norm(self, p: "NormType") -> np.floating[Any]:
         """
         Calculates the norm of a DenseVector.
 
@@ -410,7 +413,7 @@ def norm(self, p: "NormType") -> np.float64:
         """
         return np.linalg.norm(self.array, p)
 
-    def dot(self, other: Iterable[float]) -> np.float64:
+    def dot(self, other: "VectorLike") -> np.float64:
         """
         Compute the dot product of two Vectors. We support
         (Numpy array, list, SparseVector, or SciPy sparse)
@@ -444,8 +447,10 @@ def dot(self, other: Iterable[float]) -> np.float64:
                 assert len(self) == other.shape[0], "dimension mismatch"
             return np.dot(self.array, other)
         elif _have_scipy and scipy.sparse.issparse(other):
-            assert len(self) == cast("spmatrix", other).shape[0], "dimension mismatch"
-            return cast("spmatrix", other).transpose().dot(self.toArray())
+            assert hasattr(other, "shape")
+            assert len(self) == other.shape[0], "dimension mismatch"
+            assert hasattr(other, "transpose")
+            return other.transpose().dot(self.toArray())
         else:
             assert len(self) == _vector_size(other), "dimension mismatch"
             if isinstance(other, SparseVector):
@@ -455,7 +460,7 @@ def dot(self, other: Iterable[float]) -> np.float64:
             else:
                 return np.dot(self.toArray(), cast("ArrayLike", other))
 
-    def squared_distance(self, other: Iterable[float]) -> np.float64:
+    def squared_distance(self, other: "VectorLike") -> np.float64:
         """
         Squared distance of two Vectors.
 
@@ -685,13 +690,13 @@ def __init__(
                         % (self.indices[i], self.indices[i + 1])
                     )
 
-    def numNonzeros(self) -> int:
+    def numNonzeros(self) -> Union[int, np.intp]:
         """
         Number of nonzero elements. This scans all active values and count non zeros.
         """
         return np.count_nonzero(self.values)
 
-    def norm(self, p: "NormType") -> np.float64:
+    def norm(self, p: "NormType") -> np.floating[Any]:
         """
         Calculates the norm of a SparseVector.
 
@@ -766,7 +771,7 @@ def parse(s: str) -> "SparseVector":
             raise ValueError("Unable to parse values from %s." % s)
         return SparseVector(cast(int, size), indices, values)
 
-    def dot(self, other: Iterable[float]) -> np.float64:
+    def dot(self, other: "VectorLike") -> np.float64:
         """
         Dot product with a SparseVector or 1- or 2-dimensional Numpy array.
 
@@ -822,9 +827,9 @@ def dot(self, other: Iterable[float]) -> np.float64:
                 return np.dot(self_values, other.values[other_cmind])
 
         else:
-            return self.dot(_convert_to_vector(other))  # type: ignore[arg-type]
+            return self.dot(_convert_to_vector(other))
 
-    def squared_distance(self, other: Iterable[float]) -> np.float64:
+    def squared_distance(self, other: "VectorLike") -> np.float64:
         """
         Squared distance from a SparseVector or 1-dimensional NumPy array.
 
@@ -892,7 +897,7 @@ def squared_distance(self, other: Iterable[float]) -> np.float64:
                 j += 1
             return result
         else:
-            return self.squared_distance(_convert_to_vector(other))  # type: ignore[arg-type]
+            return self.squared_distance(_convert_to_vector(other))
 
     def toArray(self) -> np.ndarray:
         """

From 07273acf77445738a5c29484a25a8b80fd73f1a6 Mon Sep 17 00:00:00 2001
From: Tian Gao <gaogaotiantian@hotmail.com>
Date: Fri, 16 Jan 2026 15:15:43 -0800
Subject: [PATCH 2/7] Fix remaining VectorLike type hint issues in mllib

---
 python/pyspark/mllib/_typing.pyi           | 1 -
 python/pyspark/mllib/linalg/distributed.py | 2 +-
 python/pyspark/mllib/regression.py         | 2 +-
 python/pyspark/mllib/stat/_statistics.py   | 3 ++-
 python/pyspark/mllib/util.py               | 6 +-----
 5 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/python/pyspark/mllib/_typing.pyi b/python/pyspark/mllib/_typing.pyi
index faffa669cb26a..366efecf315c6 100644
--- a/python/pyspark/mllib/_typing.pyi
+++ b/python/pyspark/mllib/_typing.pyi
@@ -27,7 +27,6 @@ from pyspark.mllib.linalg import Vector
 if TYPE_CHECKING:
     from scipy.sparse import spmatrix, sparray
 
-
 if TYPE_CHECKING:
     VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], spmatrix, sparray, range]
 C = TypeVar("C", bound=type)
diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py
index ecdb4e75ed490..aa98f959798ca 100644
--- a/python/pyspark/mllib/linalg/distributed.py
+++ b/python/pyspark/mllib/linalg/distributed.py
@@ -35,7 +35,7 @@
 VT = TypeVar("VT", bound="Matrix")
 
 if TYPE_CHECKING:
-    from pyspark.ml._typing import VectorLike
+    from pyspark.mllib._typing import VectorLike
 
 __all__ = [
     "BlockMatrix",
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index b384e0fa608e8..a69f8c00b221f 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -84,7 +84,7 @@ class LabeledPoint:
     'label' and 'features' are accessible as class attributes.
     """
 
-    def __init__(self, label: float, features: Iterable[float]):
+    def __init__(self, label: float, features: "VectorLike"):
         self.label = float(label)
         self.features = _convert_to_vector(features)
 
diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py
index c638fb8195067..e993ced1a419c 100644
--- a/python/pyspark/mllib/stat/_statistics.py
+++ b/python/pyspark/mllib/stat/_statistics.py
@@ -189,7 +189,8 @@ def corr(
 
         if not y:
             return cast(
-                JavaObject, callMLlibFunc("corr", x.map(_convert_to_vector), method)
+                JavaObject,
+                callMLlibFunc("corr", cast(RDD[Vector], x).map(_convert_to_vector), method),
             ).toArray()
         else:
             return cast(
diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py
index caa2c9338a959..65b25f8add6d7 100644
--- a/python/pyspark/mllib/util.py
+++ b/python/pyspark/mllib/util.py
@@ -145,11 +145,7 @@ def loadLibSVMFile(
         if numFeatures <= 0:
             parsed.cache()
             numFeatures = parsed.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
-        return parsed.map(
-            lambda x: LabeledPoint(
-                x[0], Vectors.sparse(numFeatures, x[1], x[2])  # type: ignore[arg-type]
-            )
-        )
+        return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2])))
 
     @staticmethod
     def saveAsLibSVMFile(data: RDD["LabeledPoint"], dir: str) -> None:

From 2cce30cc90f4fbddbaa821ad25bee46c27053bd3 Mon Sep 17 00:00:00 2001
From: Tian Gao <gaogaotiantian@hotmail.com>
Date: Sat, 17 Jan 2026 11:02:21 -0800
Subject: [PATCH 3/7] Always make VectorLike available

---
 python/pyspark/ml/_typing.pyi    | 11 +++++------
 python/pyspark/mllib/_typing.pyi |  9 +++++----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/python/pyspark/ml/_typing.pyi b/python/pyspark/ml/_typing.pyi
index 2f22b6de0f5ed..fe35a83f633ea 100644
--- a/python/pyspark/ml/_typing.pyi
+++ b/python/pyspark/ml/_typing.pyi
@@ -16,7 +16,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from typing import Any, Dict, List, TypeVar, Tuple, TYPE_CHECKING, Union
+from typing import Any, Dict, List, TypeVar, Tuple, Union
 from typing_extensions import Literal
 
 from numpy import ndarray
@@ -24,12 +24,14 @@ from py4j.java_gateway import JavaObject
 
 import pyspark.ml.base
 import pyspark.ml.param
-import pyspark.ml.util
 from pyspark.ml.linalg import Vector
 import pyspark.ml.wrapper
 
-if TYPE_CHECKING:
+try:
     from scipy.sparse import spmatrix, sparray
+    VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], spmatrix, sparray, range]
+except ImportError:
+    VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], range]
 
 ParamMap = Dict[pyspark.ml.param.Param, Any]
 PipelineStage = Union[pyspark.ml.base.Estimator, pyspark.ml.base.Transformer]
@@ -83,6 +85,3 @@ RankingEvaluatorMetricType = Union[
     Literal["ndcgAtK"],
     Literal["recallAtK"],
 ]
-
-if TYPE_CHECKING:
-    VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], spmatrix, sparray, range]
diff --git a/python/pyspark/mllib/_typing.pyi b/python/pyspark/mllib/_typing.pyi
index 366efecf315c6..2e70a4c59df59 100644
--- a/python/pyspark/mllib/_typing.pyi
+++ b/python/pyspark/mllib/_typing.pyi
@@ -16,7 +16,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from typing import List, Tuple, TYPE_CHECKING, TypeVar, Union
+from typing import List, Tuple, TypeVar, Union
 
 from typing_extensions import Literal
 from numpy import ndarray  # noqa: F401
@@ -24,11 +24,12 @@ from py4j.java_gateway import JavaObject
 
 from pyspark.mllib.linalg import Vector
 
-if TYPE_CHECKING:
+try:
     from scipy.sparse import spmatrix, sparray
-
-if TYPE_CHECKING:
     VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], spmatrix, sparray, range]
+except ImportError:
+    VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], range]
+
 C = TypeVar("C", bound=type)
 JavaObjectOrPickleDump = Union[JavaObject, bytearray, bytes]
 

From 9c01624c4e52c047076b50c4713d754d7054087d Mon Sep 17 00:00:00 2001
From: Tian Gao <gaogaotiantian@hotmail.com>
Date: Sat, 17 Jan 2026 13:19:31 -0800
Subject: [PATCH 4/7] Reformat scipy import block in _typing.pyi stubs

---
 python/pyspark/ml/_typing.pyi    | 1 +
 python/pyspark/mllib/_typing.pyi | 1 +
 2 files changed, 2 insertions(+)

diff --git a/python/pyspark/ml/_typing.pyi b/python/pyspark/ml/_typing.pyi
index fe35a83f633ea..dbcb38ea5694e 100644
--- a/python/pyspark/ml/_typing.pyi
+++ b/python/pyspark/ml/_typing.pyi
@@ -29,6 +29,7 @@ import pyspark.ml.wrapper
 
 try:
     from scipy.sparse import spmatrix, sparray
+
     VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], spmatrix, sparray, range]
 except ImportError:
     VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], range]
diff --git a/python/pyspark/mllib/_typing.pyi b/python/pyspark/mllib/_typing.pyi
index 2e70a4c59df59..009635fa2c22b 100644
--- a/python/pyspark/mllib/_typing.pyi
+++ b/python/pyspark/mllib/_typing.pyi
@@ -26,6 +26,7 @@ from pyspark.mllib.linalg import Vector
 
 try:
     from scipy.sparse import spmatrix, sparray
+
     VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], spmatrix, sparray, range]
 except ImportError:
     VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], range]

From d86072e112008ced1a0d38d8569c5cdbe00c9a31 Mon Sep 17 00:00:00 2001
From: Tian Gao <gaogaotiantian@hotmail.com>
Date: Sat, 17 Jan 2026 18:09:27 -0800
Subject: [PATCH 5/7] Remove unused import

---
 python/pyspark/mllib/linalg/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py
index 15de94254855c..1613e3d2cc8d0 100644
--- a/python/pyspark/mllib/linalg/__init__.py
+++ b/python/pyspark/mllib/linalg/__init__.py
@@ -61,7 +61,6 @@
 
 if TYPE_CHECKING:
     from pyspark.mllib._typing import VectorLike, NormType
-    from scipy.sparse import spmatrix
     from numpy.typing import ArrayLike
 
 

From 1e954f66d06936df722fbb90a0f36cbe17852199 Mon Sep 17 00:00:00 2001
From: Tian Gao <gaogaotiantian@hotmail.com>
Date: Sat, 17 Jan 2026 20:32:06 -0800
Subject: [PATCH 6/7] Use forward references for scipy types in VectorLike

---
 python/pyspark/ml/_typing.pyi    | 10 ++++------
 python/pyspark/mllib/_typing.pyi | 10 ++++------
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/python/pyspark/ml/_typing.pyi b/python/pyspark/ml/_typing.pyi
index dbcb38ea5694e..c24dfe577350e 100644
--- a/python/pyspark/ml/_typing.pyi
+++ b/python/pyspark/ml/_typing.pyi
@@ -16,7 +16,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from typing import Any, Dict, List, TypeVar, Tuple, Union
+from typing import Any, Dict, List, TYPE_CHECKING, TypeVar, Tuple, Union
 from typing_extensions import Literal
 
 from numpy import ndarray
@@ -27,13 +27,9 @@ import pyspark.ml.param
 from pyspark.ml.linalg import Vector
 import pyspark.ml.wrapper
 
-try:
+if TYPE_CHECKING:
     from scipy.sparse import spmatrix, sparray
 
-    VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], spmatrix, sparray, range]
-except ImportError:
-    VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], range]
-
 ParamMap = Dict[pyspark.ml.param.Param, Any]
 PipelineStage = Union[pyspark.ml.base.Estimator, pyspark.ml.base.Transformer]
 
@@ -86,3 +82,5 @@ RankingEvaluatorMetricType = Union[
     Literal["ndcgAtK"],
     Literal["recallAtK"],
 ]
+
+VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], "spmatrix", "sparray", range]
diff --git a/python/pyspark/mllib/_typing.pyi b/python/pyspark/mllib/_typing.pyi
index 009635fa2c22b..d34cfc84c7ae9 100644
--- a/python/pyspark/mllib/_typing.pyi
+++ b/python/pyspark/mllib/_typing.pyi
@@ -16,7 +16,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from typing import List, Tuple, TypeVar, Union
+from typing import List, Tuple, TYPE_CHECKING, TypeVar, Union
 
 from typing_extensions import Literal
 from numpy import ndarray  # noqa: F401
@@ -24,16 +24,14 @@ from py4j.java_gateway import JavaObject
 
 from pyspark.mllib.linalg import Vector
 
-try:
+if TYPE_CHECKING:
     from scipy.sparse import spmatrix, sparray
 
-    VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], spmatrix, sparray, range]
-except ImportError:
-    VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], range]
-
 C = TypeVar("C", bound=type)
 JavaObjectOrPickleDump = Union[JavaObject, bytearray, bytes]
 
 CorrMethodType = Union[Literal["spearman"], Literal["pearson"]]
 KolmogorovSmirnovTestDistNameType = Literal["norm"]
 NormType = Union[None, float, Literal["fro"], Literal["nuc"]]
+
+VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...], "spmatrix", "sparray", range]

From 44e63be14c468a4769058ecf500bc490a563a2f2 Mon Sep 17 00:00:00 2001
From: Tian Gao <gaogaotiantian@hotmail.com>
Date: Sat, 17 Jan 2026 23:17:36 -0800
Subject: [PATCH 7/7] Add scipy-stubs to the lint image

---
 dev/spark-test-image/lint/Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dev/spark-test-image/lint/Dockerfile b/dev/spark-test-image/lint/Dockerfile
index e702745bb9351..9fd4bcd77e607 100644
--- a/dev/spark-test-image/lint/Dockerfile
+++ b/dev/spark-test-image/lint/Dockerfile
@@ -100,6 +100,7 @@ RUN python3.11 -m pip install \
     'pytest-mypy-plugins==1.9.3' \
     'pytest==7.1.3' \
     'scipy>=1.8.0' \
+    'scipy-stubs' \
     && python3.11 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu \
     && python3.11 -m pip install torcheval \
     && python3.11 -m pip cache purge