diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index e5d367958dd..4ca0f9b6dc6 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -239,6 +239,11 @@ jobs:
- name: Test
shell: bash
run: ci/scripts/python_test.sh $(pwd) $(pwd)/build
+ - name: Test annotations
+ shell: bash
+ env:
+ PYARROW_TEST_ANNOTATIONS: "ON"
+ run: ci/scripts/python_test_type_annotations.sh $(pwd)/python
windows:
name: AMD64 Windows 2022 Python 3.13
@@ -296,3 +301,7 @@ jobs:
shell: cmd
run: |
call "ci\scripts\python_test.bat" %cd%
+ - name: Test annotations
+ shell: cmd
+ run: |
+ call "ci\scripts\python_test_type_annotations.bat" %cd%\python
diff --git a/ci/scripts/python_test_type_annotations.bat b/ci/scripts/python_test_type_annotations.bat
new file mode 100644
index 00000000000..3446e329a89
--- /dev/null
+++ b/ci/scripts/python_test_type_annotations.bat
@@ -0,0 +1,38 @@
+@rem Licensed to the Apache Software Foundation (ASF) under one
+@rem or more contributor license agreements. See the NOTICE file
+@rem distributed with this work for additional information
+@rem regarding copyright ownership. The ASF licenses this file
+@rem to you under the Apache License, Version 2.0 (the
+@rem "License"); you may not use this file except in compliance
+@rem with the License. You may obtain a copy of the License at
+@rem
+@rem http://www.apache.org/licenses/LICENSE-2.0
+@rem
+@rem Unless required by applicable law or agreed to in writing,
+@rem software distributed under the License is distributed on an
+@rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+@rem KIND, either express or implied. See the License for the
+@rem specific language governing permissions and limitations
+@rem under the License.
+
+@echo on
+
+set PYARROW_DIR=%1
+
+echo Annotation testing on Windows ...
+
+@REM Install library stubs
+%PYTHON_CMD% -m pip install pandas-stubs scipy-stubs sphinx types-cffi types-psutil types-requests types-python-dateutil || exit /B 1
+
+@REM Install other dependencies for type checking
+%PYTHON_CMD% -m pip install fsspec || exit /B 1
+
+@REM Install type checkers
+%PYTHON_CMD% -m pip install mypy pyright ty || exit /B 1
+
+@REM Run type checkers
+pushd %PYARROW_DIR%
+
+mypy || exit /B 1
+pyright || exit /B 1
+ty check || exit /B 1
diff --git a/ci/scripts/python_test_type_annotations.sh b/ci/scripts/python_test_type_annotations.sh
new file mode 100755
index 00000000000..82610ce6630
--- /dev/null
+++ b/ci/scripts/python_test_type_annotations.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -ex
+pyarrow_dir=${1}
+
+if [ "${PYARROW_TEST_ANNOTATIONS}" == "ON" ]; then
+ # Install library stubs
+ pip install pandas-stubs scipy-stubs sphinx types-cffi types-psutil types-requests types-python-dateutil
+
+ # Install type checkers
+ pip install mypy pyright ty
+
+ # Install other dependencies for type checking
+ pip install fsspec
+
+ # Run type checkers
+  pushd "${pyarrow_dir}"
+  mypy
+  pyright
+  ty check
+else
+  echo "Skipping type annotation tests"
+fi
diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh
index 8d113312927..8d63679de08 100755
--- a/ci/scripts/python_wheel_macos_build.sh
+++ b/ci/scripts/python_wheel_macos_build.sh
@@ -175,6 +175,11 @@ export CMAKE_PREFIX_PATH=${build_dir}/install
export SETUPTOOLS_SCM_PRETEND_VERSION=${PYARROW_VERSION}
pushd ${source_dir}/python
+# We first populate stub docstrings and then build the wheel
+python setup.py build_ext --inplace
+python -m pip install griffe libcst
+python ../dev/update_stub_docstrings.py pyarrow-stubs
+
python setup.py bdist_wheel
popd
diff --git a/ci/scripts/python_wheel_validate_contents.py b/ci/scripts/python_wheel_validate_contents.py
index 84fcaba42e6..ee4a31aedb8 100644
--- a/ci/scripts/python_wheel_validate_contents.py
+++ b/ci/scripts/python_wheel_validate_contents.py
@@ -35,6 +35,11 @@ def validate_wheel(path):
assert not outliers, f"Unexpected contents in wheel: {sorted(outliers)}"
print(f"The wheel: {wheels[0]} seems valid.")
+ candidates = [info for info in f.filelist if info.filename.endswith('compute.pyi')]
+ assert candidates, "compute.pyi not found in wheel"
+ content = f.read(candidates[0]).decode('utf-8', errors='replace')
+ assert '"""' in content, "compute.pyi missing docstrings (no triple quotes found)"
+
def main():
parser = argparse.ArgumentParser()
diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat
index b4b7fed99fd..3da7f60f182 100644
--- a/ci/scripts/python_wheel_windows_build.bat
+++ b/ci/scripts/python_wheel_windows_build.bat
@@ -135,6 +135,11 @@ pushd C:\arrow\python
+@REM We first populate stub docstrings and then build the wheel
+%PYTHON_CMD% setup.py build_ext --inplace || exit /B 1
+%PYTHON_CMD% -m pip install griffe libcst || exit /B 1
+%PYTHON_CMD% ..\dev\update_stub_docstrings.py pyarrow-stubs || exit /B 1
+
@REM Build wheel
%PYTHON_CMD% setup.py bdist_wheel || exit /B 1
@REM Repair the wheel with delvewheel
@REM
@REM Since we bundled the Arrow C++ libraries ourselves, we only need to
diff --git a/ci/scripts/python_wheel_xlinux_build.sh b/ci/scripts/python_wheel_xlinux_build.sh
index a3fbeb3c0b3..977ef64e008 100755
--- a/ci/scripts/python_wheel_xlinux_build.sh
+++ b/ci/scripts/python_wheel_xlinux_build.sh
@@ -167,6 +167,11 @@ export ARROW_HOME=/tmp/arrow-dist
export CMAKE_PREFIX_PATH=/tmp/arrow-dist
pushd /arrow/python
+# We first populate stub docstrings and then build the wheel
+python setup.py build_ext --inplace
+python -m pip install griffe libcst
+python ../dev/update_stub_docstrings.py pyarrow-stubs
+
python setup.py bdist_wheel
echo "=== Strip symbols from wheel ==="
diff --git a/compose.yaml b/compose.yaml
index 84481e1af76..1d368d4df08 100644
--- a/compose.yaml
+++ b/compose.yaml
@@ -919,12 +919,14 @@ services:
environment:
<<: [*common, *ccache, *sccache]
PYTEST_ARGS: # inherit
+ PYARROW_TEST_ANNOTATIONS: "ON"
volumes: *conda-volumes
command: &python-conda-command
["
/arrow/ci/scripts/cpp_build.sh /arrow /build &&
/arrow/ci/scripts/python_build.sh /arrow /build &&
- /arrow/ci/scripts/python_test.sh /arrow"]
+ /arrow/ci/scripts/python_test.sh /arrow &&
+ /arrow/ci/scripts/python_test_type_annotations.sh /arrow/python"]
conda-python-emscripten:
# Usage:
@@ -1001,6 +1003,7 @@ services:
ARROW_S3: "OFF"
ARROW_SUBSTRAIT: "OFF"
ARROW_WITH_OPENTELEMETRY: "OFF"
+ PYARROW_TEST_ANNOTATIONS: "ON"
SETUPTOOLS_SCM_PRETEND_VERSION:
volumes: *ubuntu-volumes
deploy: *cuda-deploy
@@ -1008,7 +1011,8 @@ services:
/bin/bash -c "
/arrow/ci/scripts/cpp_build.sh /arrow /build &&
/arrow/ci/scripts/python_build.sh /arrow /build &&
- /arrow/ci/scripts/python_test.sh /arrow"
+ /arrow/ci/scripts/python_test.sh /arrow &&
+ /arrow/ci/scripts/python_test_type_annotations.sh /arrow/python"
debian-python:
# Usage:
@@ -1500,6 +1504,7 @@ services:
python: ${PYTHON}
shm_size: *shm-size
environment:
+ PYARROW_TEST_ANNOTATIONS: "ON"
<<: [*common, *ccache, *sccache]
PARQUET_REQUIRE_ENCRYPTION: # inherit
HYPOTHESIS_PROFILE: # inherit
@@ -1510,7 +1515,8 @@ services:
/arrow/ci/scripts/cpp_build.sh /arrow /build &&
/arrow/ci/scripts/python_build.sh /arrow /build &&
mamba uninstall -y numpy &&
- /arrow/ci/scripts/python_test.sh /arrow"]
+ /arrow/ci/scripts/python_test.sh /arrow &&
+ /arrow/ci/scripts/python_test_type_annotations.sh /arrow/python"]
conda-python-docs:
# Usage:
@@ -1530,13 +1536,15 @@ services:
BUILD_DOCS_CPP: "ON"
BUILD_DOCS_PYTHON: "ON"
PYTEST_ARGS: "--doctest-modules --doctest-cython"
+ PYARROW_TEST_ANNOTATIONS: "ON"
volumes: *conda-volumes
command:
["/arrow/ci/scripts/cpp_build.sh /arrow /build &&
/arrow/ci/scripts/python_build.sh /arrow /build &&
pip install -e /arrow/dev/archery[numpydoc] &&
archery numpydoc --allow-rule GL10,PR01,PR03,PR04,PR05,PR10,RT03,YD01 &&
- /arrow/ci/scripts/python_test.sh /arrow"]
+ /arrow/ci/scripts/python_test.sh /arrow &&
+ /arrow/ci/scripts/python_test_type_annotations.sh /arrow/python"]
conda-python-dask:
# Possible $DASK parameters:
diff --git a/dev/update_stub_docstrings.py b/dev/update_stub_docstrings.py
new file mode 100644
index 00000000000..eaeb2a510eb
--- /dev/null
+++ b/dev/update_stub_docstrings.py
@@ -0,0 +1,214 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Utility to extract docstrings from pyarrow and update
+# docstrings in stub files.
+#
+# Usage
+# =====
+#
+# python ../dev/update_stub_docstrings.py pyarrow-stubs
+
+
+from pathlib import Path
+from textwrap import indent
+
+import click
+# TODO: perhaps replace griffe with importlib
+import griffe
+from griffe import AliasResolutionError
+import libcst
+from libcst import matchers as m
+
+
+def _get_docstring(name, package, indentation):
+ # print("extract_docstrings", name)
+ try:
+ obj = package.get_member(name)
+ except (KeyError, ValueError, AliasResolutionError):
+ # Some cython __init__ symbols can't be found
+ # e.g. pyarrow.lib.OSFile.__init__
+ stack = name.split(".")
+ parent_name = ".".join(stack[:-1])
+
+ try:
+ obj = package.get_member(parent_name).all_members[stack[-1]]
+ except (KeyError, ValueError, AliasResolutionError):
+ print(f"{name} not found in {package.name}, it's probably ok.")
+ return None
+
+ if obj.has_docstring:
+ docstring = obj.docstring.value
+ # Remove signature if present in docstring
+ if docstring.startswith(obj.name) or (
+ (hasattr(obj.parent, "name") and
+ docstring.startswith(f"{obj.parent.name}.{obj.name}"))):
+ docstring = "\n".join(docstring.splitlines()[2:])
+ # Skip empty docstrings
+ if docstring.strip() == "":
+ return None
+ # Indent docstring
+ indentation_prefix = indentation * " "
+ docstring = indent(docstring + '\n"""', indentation_prefix)
+ docstring = '"""\n' + docstring
+ return docstring
+ return None
+
+
+class ReplaceEllipsis(libcst.CSTTransformer):
+ def __init__(self, package, namespace):
+ self.package = package
+ self.base_namespace = namespace
+ self.stack = []
+ self.indentation = 0
+
+ # Insert module level docstring if _clone_signature is used
+ def leave_Module(self, original_node, updated_node):
+ new_body = []
+ clone_matcher = m.SimpleStatementLine(
+ body=[m.Assign(
+ value=m.Call(func=m.Name(value="_clone_signature"))
+ ), m.ZeroOrMore()]
+ )
+ for statement in updated_node.body:
+ new_body.append(statement)
+ if m.matches(statement, clone_matcher):
+ name = statement.body[0].targets[0].target.value
+ if self.base_namespace:
+ name = f"{self.base_namespace}.{name}"
+ docstring = _get_docstring(name, self.package, 0)
+ if docstring is not None:
+ new_expr = libcst.Expr(value=libcst.SimpleString(docstring))
+ new_line = libcst.SimpleStatementLine(body=[new_expr])
+ new_body.append(new_line)
+
+ return updated_node.with_changes(body=new_body)
+
+ def visit_ClassDef(self, node):
+ self.stack.append(node.name.value)
+ self.indentation += 1
+
+ def leave_ClassDef(self, original_node, updated_node):
+ name = ".".join(self.stack)
+ if self.base_namespace:
+ name = self.base_namespace + "." + name
+
+ class_matcher_1 = m.ClassDef(
+ name=m.Name(),
+ body=m.IndentedBlock(
+ body=[m.SimpleStatementLine(
+ body=[m.Expr(m.Ellipsis()), m.ZeroOrMore()]
+ ), m.ZeroOrMore()]
+ )
+ )
+ class_matcher_2 = m.ClassDef(
+ name=m.Name(),
+ body=m.IndentedBlock(
+ body=[m.FunctionDef(), m.ZeroOrMore()]
+ )
+ )
+
+ if m.matches(updated_node, class_matcher_1):
+ docstring = _get_docstring(name, self.package, self.indentation)
+ if docstring is not None:
+ new_node = libcst.SimpleString(value=docstring)
+ updated_node = updated_node.deep_replace(
+ updated_node.body.body[0].body[0].value, new_node)
+
+ if m.matches(updated_node, class_matcher_2):
+ docstring = _get_docstring(name, self.package, self.indentation)
+ if docstring is not None:
+ new_docstring = libcst.SimpleString(value=docstring)
+ new_body = [
+ libcst.SimpleWhitespace(self.indentation * " "),
+ libcst.Expr(value=new_docstring),
+ libcst.Newline()
+ ] + list(updated_node.body.body)
+ new_body = libcst.IndentedBlock(body=new_body)
+ updated_node = updated_node.with_changes(body=new_body)
+
+ self.stack.pop()
+ self.indentation -= 1
+ return updated_node
+
+ def visit_FunctionDef(self, node):
+ self.stack.append(node.name.value)
+ self.indentation += 1
+
+ def leave_FunctionDef(self, original_node, updated_node):
+ name = ".".join(self.stack)
+ if self.base_namespace:
+ name = self.base_namespace + "." + name
+
+ function_matcher = m.FunctionDef(
+ name=m.Name(),
+ body=m.SimpleStatementSuite(
+ body=[m.Expr(
+ m.Ellipsis()
+ )]))
+ if m.matches(original_node, function_matcher):
+ docstring = _get_docstring(name, self.package, self.indentation)
+ if docstring is not None:
+ new_docstring = libcst.SimpleString(value=docstring)
+ new_body = [
+ libcst.SimpleWhitespace(self.indentation * " "),
+ libcst.Expr(value=new_docstring),
+ libcst.Newline()
+ ]
+ new_body = libcst.IndentedBlock(body=new_body)
+ updated_node = updated_node.with_changes(body=new_body)
+
+ self.stack.pop()
+ self.indentation -= 1
+ return updated_node
+
+
+@click.command()
+@click.argument('pyarrow_folder', type=click.Path(resolve_path=True))
+def add_docs_to_stub_files(pyarrow_folder):
+ print("Updating docstrings of stub files in:", pyarrow_folder)
+ package = griffe.load("pyarrow", try_relative_path=True,
+ force_inspection=True, resolve_aliases=True)
+ lib_modules = ["array", "builder", "compat", "config", "device", "error", "io",
+ "_ipc", "memory", "pandas_shim", "scalar", "table", "tensor",
+ "_types"]
+
+ for stub_file in Path(pyarrow_folder).rglob('*.pyi'):
+ if stub_file.name == "_stubs_typing.pyi":
+ continue
+ module = stub_file.with_suffix('').name
+ print(f"[{stub_file} {module}]")
+
+ with open(stub_file, 'r') as f:
+ tree = libcst.parse_module(f.read())
+
+ if module in lib_modules:
+ module = "lib"
+ elif stub_file.parent.name in ["parquet", "interchange"]:
+ module = f"{stub_file.parent.name}.{module}"
+ elif module == "__init__":
+ module = ""
+
+ modified_tree = tree.visit(ReplaceEllipsis(package, module))
+ with open(stub_file, "w") as f:
+ f.write(modified_tree.code)
+ print("\n")
+
+
+if __name__ == "__main__":
+    add_docs_to_stub_files(obj={})
diff --git a/docs/source/developers/python/development.rst b/docs/source/developers/python/development.rst
index d03b2439b10..c23891e94d0 100644
--- a/docs/source/developers/python/development.rst
+++ b/docs/source/developers/python/development.rst
@@ -42,7 +42,7 @@ Unit Testing
============
We are using `pytest `_ to develop our unit
-test suite. After `building the project `_ you can run its unit tests
+test suite. After `building the project `_ you can run its unit tests
like so:
.. code-block::
@@ -101,6 +101,74 @@ The test groups currently include:
* ``s3``: Tests for Amazon S3
* ``tensorflow``: Tests that involve TensorFlow
+Type Checking
+=============
+
+PyArrow provides type stubs (``*.pyi`` files) for static type checking. These
+stubs are located in the ``pyarrow-stubs/`` directory and are automatically
+included in the distributed wheel packages.
+
+Running Type Checkers
+---------------------
+
+We support multiple type checkers. Their configurations are in
+``pyproject.toml``.
+
+**mypy**
+
+To run mypy on the PyArrow codebase:
+
+.. code-block::
+
+ $ cd arrow/python
+ $ mypy
+
+The mypy configuration is in the ``[tool.mypy]`` section of ``pyproject.toml``.
+
+**pyright**
+
+To run pyright:
+
+.. code-block::
+
+ $ cd arrow/python
+ $ pyright
+
+The pyright configuration is in the ``[tool.pyright]`` section of ``pyproject.toml``.
+
+**ty**
+
+To run ty (note: currently only partially configured):
+
+.. code-block::
+
+ $ cd arrow/python
+ $ ty check
+
+Maintaining Type Stubs
+-----------------------
+
+Type stubs for PyArrow are maintained in the ``pyarrow-stubs/``
+directory. These stubs mirror the structure of the main ``pyarrow/`` package.
+
+When adding or modifying public APIs:
+
+1. **Update the corresponding stub file** (``.pyi``) in ``pyarrow-stubs/``
+   to reflect the new or changed function or class signatures.
+
+2. **Include type annotations** where possible. For Cython modules or
+   dynamically generated APIs, such as compute kernels, add the corresponding
+   stub in ``pyarrow-stubs/`` (see the sketch after this list).
+
+3. **Run type checkers** to ensure the stubs are correct and complete.
+
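+For illustration, a stub entry for a hypothetical helper (``example_function``
+is a made-up name, not an actual PyArrow API) looks like this:
+
+.. code-block:: python
+
+   # Hypothetical stub signature, shown only to illustrate the ``.pyi`` style;
+   # real entries mirror the signature of the pyarrow function being annotated.
+   def example_function(values: list[int], *, skip_nulls: bool = True) -> int: ...
+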
+The stub files are automatically copied into the built wheel during the build
+process and will be included when users install PyArrow, enabling type checking
+in downstream projects and in users' IDEs.
+
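+As a minimal sketch of what this enables downstream (assuming an installed
+PyArrow wheel and any of the checkers above), annotations in user code can be
+verified against the bundled stubs:
+
+.. code-block:: python
+
+   import pyarrow as pa
+
+   # A type checker validates these annotations against the shipped stubs
+   # and reports a mismatch if an unrelated type is used instead.
+   table: pa.Table = pa.table({"x": [1, 2, 3]})
+   column: pa.ChunkedArray = table["x"]
+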
+Note: The ``py.typed`` marker file in the ``pyarrow/`` directory indicates to type
+checkers that PyArrow supports type checking according to :pep:`561`.
+
Doctest
=======
diff --git a/python/MANIFEST.in b/python/MANIFEST.in
index ed7012e4b70..2840ba74128 100644
--- a/python/MANIFEST.in
+++ b/python/MANIFEST.in
@@ -4,6 +4,7 @@ include ../NOTICE.txt
global-include CMakeLists.txt
graft pyarrow
+graft pyarrow-stubs
graft cmake_modules
global-exclude *.so
diff --git a/python/pyarrow-stubs/pyarrow/__init__.pyi b/python/pyarrow-stubs/pyarrow/__init__.pyi
new file mode 100644
index 00000000000..ff0bd7fd5b8
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/__init__.pyi
@@ -0,0 +1,694 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pyarrow.lib as _lib
+
+from pyarrow.lib import (
+ BuildInfo,
+ CppBuildInfo,
+ RuntimeInfo,
+ set_timezone_db_path,
+ MonthDayNano,
+ VersionInfo,
+ build_info,
+ cpp_build_info,
+ cpp_version,
+ cpp_version_info,
+ runtime_info,
+ cpu_count,
+ set_cpu_count,
+ enable_signal_handlers,
+ io_thread_count,
+ set_io_thread_count,
+)
+
+from pyarrow.lib import (
+ null,
+ bool_,
+ int8,
+ int16,
+ int32,
+ int64,
+ uint8,
+ uint16,
+ uint32,
+ uint64,
+ time32,
+ time64,
+ timestamp,
+ date32,
+ date64,
+ duration,
+ month_day_nano_interval,
+ float16,
+ float32,
+ float64,
+ binary,
+ string,
+ utf8,
+ binary_view,
+ string_view,
+ large_binary,
+ large_string,
+ large_utf8,
+ decimal32,
+ decimal64,
+ decimal128,
+ decimal256,
+ list_,
+ large_list,
+ list_view,
+ large_list_view,
+ map_,
+ struct,
+ union,
+ sparse_union,
+ dense_union,
+ dictionary,
+ run_end_encoded,
+ json_,
+ uuid,
+ fixed_shape_tensor,
+ bool8,
+ opaque,
+ field,
+ type_for_alias,
+ DataType,
+ DictionaryType,
+ StructType,
+ ListType,
+ LargeListType,
+ FixedSizeListType,
+ ListViewType,
+ LargeListViewType,
+ MapType,
+ UnionType,
+ SparseUnionType,
+ DenseUnionType,
+ TimestampType,
+ Time32Type,
+ Time64Type,
+ DurationType,
+ FixedSizeBinaryType,
+ Decimal32Type,
+ Decimal64Type,
+ Decimal128Type,
+ Decimal256Type,
+ BaseExtensionType,
+ ExtensionType,
+ RunEndEncodedType,
+ FixedShapeTensorType,
+ Bool8Type,
+ UuidType,
+ JsonType,
+ OpaqueType,
+ UnknownExtensionType,
+ register_extension_type,
+ unregister_extension_type,
+ DictionaryMemo,
+ KeyValueMetadata,
+ Field,
+ Schema,
+ schema,
+ unify_schemas,
+ Array,
+ Tensor,
+ array,
+ arange,
+ chunked_array,
+ record_batch,
+ nulls,
+ repeat,
+ SparseCOOTensor,
+ SparseCSRMatrix,
+ SparseCSCMatrix,
+ SparseCSFTensor,
+ infer_type,
+ from_numpy_dtype,
+ NullArray,
+ NumericArray,
+ IntegerArray,
+ FloatingPointArray,
+ BooleanArray,
+ Int8Array,
+ UInt8Array,
+ Int16Array,
+ UInt16Array,
+ Int32Array,
+ UInt32Array,
+ Int64Array,
+ UInt64Array,
+ HalfFloatArray,
+ FloatArray,
+ DoubleArray,
+ ListArray,
+ LargeListArray,
+ FixedSizeListArray,
+ ListViewArray,
+ LargeListViewArray,
+ MapArray,
+ UnionArray,
+ BinaryArray,
+ StringArray,
+ LargeBinaryArray,
+ LargeStringArray,
+ BinaryViewArray,
+ StringViewArray,
+ FixedSizeBinaryArray,
+ DictionaryArray,
+ Date32Array,
+ Date64Array,
+ TimestampArray,
+ Time32Array,
+ Time64Array,
+ DurationArray,
+ MonthDayNanoIntervalArray,
+ Decimal32Array,
+ Decimal64Array,
+ Decimal128Array,
+ Decimal256Array,
+ StructArray,
+ ExtensionArray,
+ RunEndEncodedArray,
+ FixedShapeTensorArray,
+ Bool8Array,
+ UuidArray,
+ JsonArray,
+ OpaqueArray,
+ scalar,
+ NA,
+ _NULL as NULL,
+ Scalar,
+ NullScalar,
+ BooleanScalar,
+ Int8Scalar,
+ Int16Scalar,
+ Int32Scalar,
+ Int64Scalar,
+ UInt8Scalar,
+ UInt16Scalar,
+ UInt32Scalar,
+ UInt64Scalar,
+ HalfFloatScalar,
+ FloatScalar,
+ DoubleScalar,
+ Decimal32Scalar,
+ Decimal64Scalar,
+ Decimal128Scalar,
+ Decimal256Scalar,
+ ListScalar,
+ LargeListScalar,
+ FixedSizeListScalar,
+ ListViewScalar,
+ LargeListViewScalar,
+ Date32Scalar,
+ Date64Scalar,
+ Time32Scalar,
+ Time64Scalar,
+ TimestampScalar,
+ DurationScalar,
+ MonthDayNanoIntervalScalar,
+ BinaryScalar,
+ LargeBinaryScalar,
+ BinaryViewScalar,
+ StringScalar,
+ LargeStringScalar,
+ StringViewScalar,
+ FixedSizeBinaryScalar,
+ DictionaryScalar,
+ MapScalar,
+ StructScalar,
+ UnionScalar,
+ RunEndEncodedScalar,
+ ExtensionScalar,
+ Bool8Scalar,
+ UuidScalar,
+ JsonScalar,
+ OpaqueScalar,
+)
+
+# Buffers, allocation
+from pyarrow.lib import (
+ DeviceAllocationType,
+ Device,
+ MemoryManager,
+ default_cpu_memory_manager
+)
+
+from pyarrow.lib import (
+ Buffer,
+ ResizableBuffer,
+ foreign_buffer,
+ py_buffer,
+ Codec,
+ compress,
+ decompress,
+ allocate_buffer,
+)
+
+from pyarrow.lib import (
+ MemoryPool,
+ LoggingMemoryPool,
+ ProxyMemoryPool,
+ total_allocated_bytes,
+ set_memory_pool,
+ default_memory_pool,
+ system_memory_pool,
+ jemalloc_memory_pool,
+ mimalloc_memory_pool,
+ logging_memory_pool,
+ proxy_memory_pool,
+ log_memory_allocations,
+ jemalloc_set_decay_ms,
+ supported_memory_backends,
+)
+
+# I/O
+from pyarrow.lib import (
+ NativeFile,
+ PythonFile,
+ BufferedInputStream,
+ BufferedOutputStream,
+ CacheOptions,
+ CompressedInputStream,
+ CompressedOutputStream,
+ TransformInputStream,
+ transcoding_input_stream,
+ FixedSizeBufferWriter,
+ BufferReader,
+ BufferOutputStream,
+ OSFile,
+ MemoryMappedFile,
+ memory_map,
+ create_memory_map,
+ MockOutputStream,
+ input_stream,
+ output_stream,
+ have_libhdfs,
+)
+
+from pyarrow.lib import (
+ ChunkedArray,
+ RecordBatch,
+ Table,
+ table,
+ concat_arrays,
+ concat_batches,
+ concat_tables,
+ TableGroupBy,
+ RecordBatchReader,
+)
+
+# Exceptions
+from pyarrow.lib import (
+ ArrowCancelled,
+ ArrowCapacityError,
+ ArrowException,
+ ArrowKeyError,
+ ArrowIndexError,
+ ArrowInvalid,
+ ArrowIOError,
+ ArrowMemoryError,
+ ArrowNotImplementedError,
+ ArrowTypeError,
+ ArrowSerializationError,
+)
+
+from pyarrow.ipc import serialize_pandas, deserialize_pandas
+import pyarrow.ipc as ipc
+import pyarrow.lib as lib
+import pyarrow.types as types
+import pyarrow.feather as feather
+import pyarrow.compute as compute
+import pyarrow.csv as csv
+import pyarrow.json as json
+import pyarrow.dataset as dataset
+
+# ----------------------------------------------------------------------
+# Deprecations
+
+from pyarrow.util import _deprecate_api, _deprecate_class
+
+from pyarrow.ipc import (
+ Message,
+ MessageReader,
+ MetadataVersion,
+ RecordBatchFileReader,
+ RecordBatchFileWriter,
+ RecordBatchStreamReader,
+ RecordBatchStreamWriter,
+)
+
+
+__version__: str
+_gc_enabled: bool
+
+
+def show_versions() -> None: ...
+def show_info() -> None: ...
+def _module_is_available(module: str) -> bool: ...
+def _filesystem_is_available(fs: str) -> bool: ...
+
+
+def get_include() -> str: ...
+def _get_pkg_config_executable() -> str: ...
+def _has_pkg_config(pkgname: str) -> bool: ...
+def _read_pkg_config_variable(pkgname: str, cli_args: list[str]) -> str: ...
+def get_libraries() -> list[str]: ...
+def create_library_symlinks() -> None: ...
+def get_library_dirs() -> list[str]: ...
+
+
+__all__ = [
+ "__version__",
+ "_lib",
+ "_gc_enabled",
+ "BuildInfo",
+ "CppBuildInfo",
+ "RuntimeInfo",
+ "set_timezone_db_path",
+ "MonthDayNano",
+ "VersionInfo",
+ "build_info",
+ "cpp_build_info",
+ "cpp_version",
+ "cpp_version_info",
+ "runtime_info",
+ "cpu_count",
+ "set_cpu_count",
+ "enable_signal_handlers",
+ "io_thread_count",
+ "set_io_thread_count",
+ "show_versions",
+ "show_info",
+ "_module_is_available",
+ "_filesystem_is_available",
+ "null",
+ "bool_",
+ "int8",
+ "int16",
+ "int32",
+ "int64",
+ "uint8",
+ "uint16",
+ "uint32",
+ "uint64",
+ "time32",
+ "time64",
+ "timestamp",
+ "date32",
+ "date64",
+ "duration",
+ "month_day_nano_interval",
+ "float16",
+ "float32",
+ "float64",
+ "binary",
+ "string",
+ "utf8",
+ "binary_view",
+ "string_view",
+ "large_binary",
+ "large_string",
+ "large_utf8",
+ "decimal32",
+ "decimal64",
+ "decimal128",
+ "decimal256",
+ "list_",
+ "large_list",
+ "list_view",
+ "large_list_view",
+ "map_",
+ "struct",
+ "union",
+ "sparse_union",
+ "dense_union",
+ "dictionary",
+ "run_end_encoded",
+ "json_",
+ "uuid",
+ "fixed_shape_tensor",
+ "bool8",
+ "opaque",
+ "field",
+ "type_for_alias",
+ "DataType",
+ "DictionaryType",
+ "StructType",
+ "ListType",
+ "LargeListType",
+ "FixedSizeListType",
+ "ListViewType",
+ "LargeListViewType",
+ "MapType",
+ "UnionType",
+ "SparseUnionType",
+ "DenseUnionType",
+ "TimestampType",
+ "Time32Type",
+ "Time64Type",
+ "DurationType",
+ "FixedSizeBinaryType",
+ "Decimal32Type",
+ "Decimal64Type",
+ "Decimal128Type",
+ "Decimal256Type",
+ "BaseExtensionType",
+ "ExtensionType",
+ "RunEndEncodedType",
+ "FixedShapeTensorType",
+ "Bool8Type",
+ "UuidType",
+ "JsonType",
+ "OpaqueType",
+ "UnknownExtensionType",
+ "register_extension_type",
+ "unregister_extension_type",
+ "DictionaryMemo",
+ "KeyValueMetadata",
+ "Field",
+ "Schema",
+ "schema",
+ "unify_schemas",
+ "Array",
+ "Tensor",
+ "array",
+ "arange",
+ "chunked_array",
+ "record_batch",
+ "nulls",
+ "repeat",
+ "SparseCOOTensor",
+ "SparseCSRMatrix",
+ "SparseCSCMatrix",
+ "SparseCSFTensor",
+ "infer_type",
+ "from_numpy_dtype",
+ "NullArray",
+ "NumericArray",
+ "IntegerArray",
+ "FloatingPointArray",
+ "BooleanArray",
+ "Int8Array",
+ "UInt8Array",
+ "Int16Array",
+ "UInt16Array",
+ "Int32Array",
+ "UInt32Array",
+ "Int64Array",
+ "UInt64Array",
+ "HalfFloatArray",
+ "FloatArray",
+ "DoubleArray",
+ "ListArray",
+ "LargeListArray",
+ "FixedSizeListArray",
+ "ListViewArray",
+ "LargeListViewArray",
+ "MapArray",
+ "UnionArray",
+ "BinaryArray",
+ "StringArray",
+ "LargeBinaryArray",
+ "LargeStringArray",
+ "BinaryViewArray",
+ "StringViewArray",
+ "FixedSizeBinaryArray",
+ "DictionaryArray",
+ "Date32Array",
+ "Date64Array",
+ "TimestampArray",
+ "Time32Array",
+ "Time64Array",
+ "DurationArray",
+ "MonthDayNanoIntervalArray",
+ "Decimal32Array",
+ "Decimal64Array",
+ "Decimal128Array",
+ "Decimal256Array",
+ "StructArray",
+ "ExtensionArray",
+ "Bool8Array",
+ "UuidArray",
+ "JsonArray",
+ "OpaqueArray",
+ "RunEndEncodedArray",
+ "FixedShapeTensorArray",
+ "scalar",
+ "NA",
+ "NULL",
+ "Scalar",
+ "NullScalar",
+ "BooleanScalar",
+ "Int8Scalar",
+ "Int16Scalar",
+ "Int32Scalar",
+ "Int64Scalar",
+ "UInt8Scalar",
+ "UInt16Scalar",
+ "UInt32Scalar",
+ "UInt64Scalar",
+ "HalfFloatScalar",
+ "FloatScalar",
+ "DoubleScalar",
+ "Decimal32Scalar",
+ "Decimal64Scalar",
+ "Decimal128Scalar",
+ "Decimal256Scalar",
+ "ListScalar",
+ "LargeListScalar",
+ "FixedSizeListScalar",
+ "ListViewScalar",
+ "LargeListViewScalar",
+ "Date32Scalar",
+ "Date64Scalar",
+ "Time32Scalar",
+ "Time64Scalar",
+ "TimestampScalar",
+ "DurationScalar",
+ "MonthDayNanoIntervalScalar",
+ "BinaryScalar",
+ "LargeBinaryScalar",
+ "BinaryViewScalar",
+ "StringScalar",
+ "LargeStringScalar",
+ "StringViewScalar",
+ "FixedSizeBinaryScalar",
+ "DictionaryScalar",
+ "MapScalar",
+ "StructScalar",
+ "UnionScalar",
+ "RunEndEncodedScalar",
+ "ExtensionScalar",
+ "Bool8Scalar",
+ "UuidScalar",
+ "JsonScalar",
+ "OpaqueScalar",
+ "DeviceAllocationType",
+ "Device",
+ "MemoryManager",
+ "default_cpu_memory_manager",
+ "Buffer",
+ "ResizableBuffer",
+ "foreign_buffer",
+ "py_buffer",
+ "Codec",
+ "compress",
+ "decompress",
+ "allocate_buffer",
+ "MemoryPool",
+ "LoggingMemoryPool",
+ "ProxyMemoryPool",
+ "total_allocated_bytes",
+ "set_memory_pool",
+ "default_memory_pool",
+ "system_memory_pool",
+ "jemalloc_memory_pool",
+ "mimalloc_memory_pool",
+ "logging_memory_pool",
+ "proxy_memory_pool",
+ "log_memory_allocations",
+ "jemalloc_set_decay_ms",
+ "supported_memory_backends",
+ "NativeFile",
+ "PythonFile",
+ "BufferedInputStream",
+ "BufferedOutputStream",
+ "CacheOptions",
+ "CompressedInputStream",
+ "CompressedOutputStream",
+ "TransformInputStream",
+ "transcoding_input_stream",
+ "FixedSizeBufferWriter",
+ "BufferReader",
+ "BufferOutputStream",
+ "OSFile",
+ "MemoryMappedFile",
+ "memory_map",
+ "create_memory_map",
+ "MockOutputStream",
+ "input_stream",
+ "output_stream",
+ "have_libhdfs",
+ "ChunkedArray",
+ "RecordBatch",
+ "Table",
+ "table",
+ "concat_arrays",
+ "concat_batches",
+ "concat_tables",
+ "TableGroupBy",
+ "RecordBatchReader",
+ "ArrowCancelled",
+ "ArrowCapacityError",
+ "ArrowException",
+ "ArrowKeyError",
+ "ArrowIndexError",
+ "ArrowInvalid",
+ "ArrowIOError",
+ "ArrowMemoryError",
+ "ArrowNotImplementedError",
+ "ArrowTypeError",
+ "ArrowSerializationError",
+ "serialize_pandas",
+ "deserialize_pandas",
+ "lib",
+ "ipc",
+ "types",
+ "_deprecate_api",
+ "_deprecate_class",
+ "Message",
+ "MessageReader",
+ "MetadataVersion",
+ "RecordBatchFileReader",
+ "RecordBatchFileWriter",
+ "RecordBatchStreamReader",
+ "RecordBatchStreamWriter",
+ "get_include",
+ "_get_pkg_config_executable",
+ "compute",
+ "feather",
+ "csv",
+ "json",
+ "_has_pkg_config",
+ "_read_pkg_config_variable",
+ "get_libraries",
+ "create_library_symlinks",
+ "dataset",
+ "get_library_dirs",
+]
diff --git a/python/pyarrow-stubs/pyarrow/_acero.pyi b/python/pyarrow-stubs/pyarrow/_acero.pyi
new file mode 100644
index 00000000000..85ed9683e7e
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_acero.pyi
@@ -0,0 +1,163 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+from collections.abc import Iterable, Collection, Sequence
+
+if sys.version_info >= (3, 11):
+ from typing import Self, LiteralString
+else:
+ from typing_extensions import Self, LiteralString
+if sys.version_info >= (3, 10):
+ from typing import TypeAlias
+else:
+ from typing_extensions import TypeAlias
+from typing import Literal
+
+from . import lib
+from .compute import Expression
+from .dataset import InMemoryDataset, Dataset
+from .table import Aggregation, AggregateOptions
+
+_StrOrExpr: TypeAlias = str | Expression
+
+IntoField: TypeAlias = str | int | Expression
+Target: TypeAlias = (
+ IntoField
+ | tuple[IntoField, ...]
+ | list[str]
+ | list[int]
+ | list[Expression]
+ | list[IntoField]
+)
+
+UserDefinedAggregation: TypeAlias = LiteralString
+OutputName: TypeAlias = str
+AggregationSpec: TypeAlias = tuple[
+ Target, Aggregation | UserDefinedAggregation, AggregateOptions | None, OutputName
+]
+
+
+class Declaration(lib._Weakrefable):
+ def __init__(
+ self,
+ factory_name: str,
+ options: ExecNodeOptions,
+ inputs: list[Declaration] | None = None,
+ ) -> None: ...
+ @classmethod
+ def from_sequence(cls, decls: Iterable[Declaration]) -> Self: ...
+ def to_reader(self, use_threads: bool = True) -> lib.RecordBatchReader: ...
+ def to_table(self, use_threads: bool = True) -> lib.Table: ...
+
+
+class ExecNodeOptions(lib._Weakrefable):
+ ...
+
+
+class TableSourceNodeOptions(ExecNodeOptions):
+ def __init__(self, table: lib.Table | lib.RecordBatch | None) -> None: ...
+
+
+class FilterNodeOptions(ExecNodeOptions):
+ def __init__(self, filter_expression: Expression | None) -> None: ...
+
+
+class ProjectNodeOptions(ExecNodeOptions):
+ def __init__(self, expressions: Collection[Expression],
+ names: Collection[str] | None = None) -> None: ...
+
+
+class AggregateNodeOptions(ExecNodeOptions):
+ def __init__(
+ self,
+ aggregates: Iterable[
+ tuple[
+ Target,
+ Aggregation | UserDefinedAggregation,
+ AggregateOptions | None,
+ OutputName,
+ ]
+ ],
+ keys: Iterable[str | Expression] | None = None,
+ ) -> None: ...
+
+
+class OrderByNodeOptions(ExecNodeOptions):
+ def __init__(
+ self,
+ sort_keys:
+ Iterable[tuple[str | Expression | int, Literal["ascending", "descending"]]]
+ = (),
+ *,
+ null_placement: Literal["at_start", "at_end"] = "at_end",
+ ) -> None: ...
+
+
+class HashJoinNodeOptions(ExecNodeOptions):
+ def __init__(
+ self,
+ join_type: Literal[
+ "left semi",
+ "right semi",
+ "left anti",
+ "right anti",
+ "inner",
+ "left outer",
+ "right outer",
+ "full outer",
+ ],
+ left_keys: _StrOrExpr | Sequence[_StrOrExpr],
+ right_keys: _StrOrExpr | Sequence[_StrOrExpr],
+ left_output: Sequence[_StrOrExpr] | None = None,
+ right_output: Sequence[_StrOrExpr] | None = None,
+ output_suffix_for_left: str = "",
+ output_suffix_for_right: str = "",
+ filter_expression:
+ lib.BooleanScalar | lib.BooleanArray | Expression | None = None,
+ ) -> None: ...
+
+
+class AsofJoinNodeOptions(ExecNodeOptions):
+ def __init__(
+ self,
+ left_on: _StrOrExpr,
+ left_by: _StrOrExpr | Sequence[_StrOrExpr],
+ right_on: _StrOrExpr,
+ right_by: _StrOrExpr | Sequence[_StrOrExpr],
+ tolerance: int,
+ ) -> None: ...
+
+
+def _perform_join(
+ join_type: str,
+ left_operand: lib.Table | Dataset,
+ left_keys: str | list[str],
+ right_operand: lib.Table | Dataset,
+ right_keys: str | list[str],
+ left_suffix: str,
+ right_suffix: str,
+ use_threads: bool,
+ coalesce_keys: bool,
+ output_type: type[lib.Table | InMemoryDataset] = lib.Table,
+ filter_expression: Expression | None = None,
+) -> lib.Table | InMemoryDataset: ...
+
+
+def _filter_table(
+ table: lib.Table | lib.RecordBatch, filter_expression: Expression,
+ use_threads: bool = True) -> lib.Table | lib.RecordBatch: ...
diff --git a/python/pyarrow-stubs/pyarrow/_azurefs.pyi b/python/pyarrow-stubs/pyarrow/_azurefs.pyi
new file mode 100644
index 00000000000..5872de03825
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_azurefs.pyi
@@ -0,0 +1,36 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from typing import Literal
+
+from ._fs import FileSystem
+
+
+class AzureFileSystem(FileSystem):
+ def __init__(
+ self,
+ account_name: str | None = None,
+ account_key: str | None = None,
+ blob_storage_authority: str | None = None,
+ dfs_storage_authority: str | None = None,
+ blob_storage_scheme: Literal["http", "https"] = "https",
+ dfs_storage_scheme: Literal["http", "https"] = "https",
+ sas_token: str | None = None,
+ tenant_id: str | None = None,
+ client_id: str | None = None,
+ client_secret: str | None = None,
+ ) -> None: ...
diff --git a/python/pyarrow-stubs/pyarrow/_compute.pyi b/python/pyarrow-stubs/pyarrow/_compute.pyi
new file mode 100644
index 00000000000..dfe46908c08
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_compute.pyi
@@ -0,0 +1,671 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import types as stdlib_types
+from collections.abc import (
+ Callable,
+ Iterable,
+ Mapping,
+ Sequence,
+)
+
+from typing import (
+ Any,
+ Literal,
+ TypeAlias,
+ TypedDict,
+)
+
+from . import lib
+
+_Order: TypeAlias = Literal["ascending", "descending"]
+_Placement: TypeAlias = Literal["at_start", "at_end"]
+
+
+class Kernel(lib._Weakrefable):
+ ...
+
+
+class Function(lib._Weakrefable):
+ @property
+ def arity(self) -> int | stdlib_types.EllipsisType: ...
+
+ @property
+ def kind(
+ self,
+ ) -> Literal["scalar", "vector", "scalar_aggregate", "hash_aggregate", "meta"]: ...
+ @property
+ def name(self) -> str: ...
+ @property
+ def num_kernels(self) -> int: ...
+
+ @property
+ def kernels(
+ self,
+ ) -> list[
+ ScalarKernel | VectorKernel | ScalarAggregateKernel | HashAggregateKernel
+ ]: ...
+
+ def call(
+ self,
+ args: Iterable,
+ options: FunctionOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+ length: int | None = None,
+ ) -> Any: ...
+
+
+class FunctionOptions(lib._Weakrefable):
+ def serialize(self) -> lib.Buffer: ...
+ @classmethod
+ def deserialize(cls, buf: lib.Buffer) -> FunctionOptions: ...
+
+
+class FunctionRegistry(lib._Weakrefable):
+ def get_function(self, name: str) -> Function: ...
+ def list_functions(self) -> list[str]: ...
+
+
+class HashAggregateFunction(Function):
+ ...
+
+
+class HashAggregateKernel(Kernel):
+ ...
+
+
+class ScalarAggregateFunction(Function):
+ ...
+
+
+class ScalarAggregateKernel(Kernel):
+ ...
+
+
+class ScalarFunction(Function):
+ ...
+
+
+class ScalarKernel(Kernel):
+ ...
+
+
+class VectorFunction(Function):
+ ...
+
+
+class VectorKernel(Kernel):
+ ...
+
+# ==================== _compute.pyx Option classes ====================
+
+
+class ArraySortOptions(FunctionOptions):
+ def __init__(
+ self,
+ order: _Order = "ascending",
+ null_placement: _Placement = "at_end",
+ ) -> None: ...
+
+
+class AssumeTimezoneOptions(FunctionOptions):
+ def __init__(
+ self,
+ timezone: str,
+ *,
+ ambiguous: Literal["raise", "earliest", "latest"] = "raise",
+ nonexistent: Literal["raise", "earliest", "latest"] = "raise",
+ ) -> None: ...
+
+
+class CastOptions(FunctionOptions):
+ allow_int_overflow: bool
+ allow_time_truncate: bool
+ allow_time_overflow: bool
+ allow_decimal_truncate: bool
+ allow_float_truncate: bool
+ allow_invalid_utf8: bool
+
+ def __init__(
+ self,
+ target_type: lib.DataType | None = None,
+ *,
+ allow_int_overflow: bool | None = None,
+ allow_time_truncate: bool | None = None,
+ allow_time_overflow: bool | None = None,
+ allow_decimal_truncate: bool | None = None,
+ allow_float_truncate: bool | None = None,
+ allow_invalid_utf8: bool | None = None,
+ ) -> None: ...
+ @staticmethod
+ def safe(target_type: lib.DataType | None = None) -> CastOptions: ...
+ @staticmethod
+ def unsafe(target_type: lib.DataType | None = None) -> CastOptions: ...
+ def is_safe(self) -> bool: ...
+
+
+class CountOptions(FunctionOptions):
+ def __init__(self, mode: Literal["only_valid",
+ "only_null", "all"] = "only_valid") -> None: ...
+
+
+class CumulativeOptions(FunctionOptions):
+ def __init__(self, start: lib.Scalar | None = None,
+ *, skip_nulls: bool = False) -> None: ...
+
+
+class CumulativeSumOptions(FunctionOptions):
+ def __init__(self, start: lib.Scalar | None = None,
+ *, skip_nulls: bool = False) -> None: ...
+
+
+class DayOfWeekOptions(FunctionOptions):
+ def __init__(self, *, count_from_zero: bool = True,
+ week_start: int = 1) -> None: ...
+
+
+class DictionaryEncodeOptions(FunctionOptions):
+ def __init__(self, null_encoding: Literal["mask", "encode"] = "mask") -> None: ...
+
+
+class RunEndEncodeOptions(FunctionOptions):
+ # TODO: default is DataType(int32)
+ def __init__(self, run_end_type: lib.DataType | str = ...) -> None: ...
+
+
+class ElementWiseAggregateOptions(FunctionOptions):
+ def __init__(self, *, skip_nulls: bool = True) -> None: ...
+
+
+class ExtractRegexOptions(FunctionOptions):
+ def __init__(self, pattern: str) -> None: ...
+
+
+class ExtractRegexSpanOptions(FunctionOptions):
+ def __init__(self, pattern: str) -> None: ...
+
+
+class FilterOptions(FunctionOptions):
+ def __init__(self,
+ null_selection_behavior: Literal["drop",
+ "emit_null"] = "drop") -> None: ...
+
+
+class IndexOptions(FunctionOptions):
+ def __init__(self, value: lib.Scalar) -> None: ...
+
+
+class JoinOptions(FunctionOptions):
+ def __init__(
+ self,
+ null_handling:
+ Literal["emit_null", "skip", "replace"]
+ = "emit_null", *, null_replacement: str = "") -> None: ...
+
+
+class ListSliceOptions(FunctionOptions):
+ def __init__(
+ self,
+ start: int,
+ stop: int | None = None,
+ step: int = 1,
+ return_fixed_size_list: bool | None = None,
+ ) -> None: ...
+
+
+class ListFlattenOptions(FunctionOptions):
+ def __init__(self, recursive: bool = False) -> None: ...
+
+
+class MakeStructOptions(FunctionOptions):
+ def __init__(
+ self,
+ field_names: Sequence[str] = (),
+ *,
+ field_nullability: Sequence[bool] | None = None,
+ field_metadata: Sequence[lib.KeyValueMetadata] | None = None,
+ ) -> None: ...
+
+
+class MapLookupOptions(FunctionOptions):
+ # TODO: query_key: Scalar or Object can be converted to Scalar
+ def __init__(
+ self, query_key: lib.Scalar, occurrence: Literal["first", "last", "all"]
+ ) -> None: ...
+
+
+class MatchSubstringOptions(FunctionOptions):
+ def __init__(self, pattern: str, *, ignore_case: bool = False) -> None: ...
+
+
+class ModeOptions(FunctionOptions):
+ def __init__(self, n: int = 1, *, skip_nulls: bool = True,
+ min_count: int = 0) -> None: ...
+
+
+class NullOptions(FunctionOptions):
+ def __init__(self, *, nan_is_null: bool = False) -> None: ...
+
+
+class PadOptions(FunctionOptions):
+ def __init__(
+ self, width: int, padding: str = " ", lean_left_on_odd_padding: bool = True
+ ) -> None: ...
+
+
+class PairwiseOptions(FunctionOptions):
+ def __init__(self, period: int = 1) -> None: ...
+
+
+class PartitionNthOptions(FunctionOptions):
+ def __init__(self, pivot: int, *,
+ null_placement: _Placement = "at_end") -> None: ...
+
+
+class WinsorizeOptions(FunctionOptions):
+ def __init__(self, lower_limit: float, upper_limit: float) -> None: ...
+
+
+class QuantileOptions(FunctionOptions):
+ def __init__(
+ self,
+ q: float | Sequence[float] = 0.5,
+ *,
+ interpolation: Literal["linear", "lower",
+ "higher", "nearest", "midpoint"] = "linear",
+ skip_nulls: bool = True,
+ min_count: int = 0,
+ ) -> None: ...
+
+
+class RandomOptions(FunctionOptions):
+ def __init__(self, *, initializer: int | Literal["system"] = "system") -> None: ...
+
+
+class RankOptions(FunctionOptions):
+ def __init__(
+ self,
+ sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending",
+ *,
+ null_placement: _Placement = "at_end",
+ tiebreaker: Literal["min", "max", "first", "dense"] = "first",
+ ) -> None: ...
+
+
+class RankQuantileOptions(FunctionOptions):
+ def __init__(
+ self,
+ sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending",
+ *,
+ null_placement: _Placement = "at_end",
+ ) -> None: ...
+
+
+class PivotWiderOptions(FunctionOptions):
+ def __init__(
+ self,
+ key_names: Sequence[str],
+ *,
+ unexpected_key_behavior: Literal["ignore", "raise"] = "ignore",
+ ) -> None: ...
+
+
+class ReplaceSliceOptions(FunctionOptions):
+ def __init__(self, start: int, stop: int, replacement: str) -> None: ...
+
+
+class ReplaceSubstringOptions(FunctionOptions):
+ def __init__(
+ self, pattern: str, replacement: str, *, max_replacements: int | None = None
+ ) -> None: ...
+
+
+_RoundMode: TypeAlias = Literal[
+ "down",
+ "up",
+ "towards_zero",
+ "towards_infinity",
+ "half_down",
+ "half_up",
+ "half_towards_zero",
+ "half_towards_infinity",
+ "half_to_even",
+ "half_to_odd",
+]
+
+
+class RoundBinaryOptions(FunctionOptions):
+ def __init__(
+ self,
+ round_mode: _RoundMode = "half_to_even",
+ ) -> None: ...
+
+
+class RoundOptions(FunctionOptions):
+ def __init__(
+ self,
+ ndigits: int = 0,
+ round_mode: _RoundMode = "half_to_even",
+ ) -> None: ...
+
+
+_DateTimeUint: TypeAlias = Literal[
+ "year",
+ "quarter",
+ "month",
+ "week",
+ "day",
+ "hour",
+ "minute",
+ "second",
+ "millisecond",
+ "microsecond",
+ "nanosecond",
+]
+
+
+class RoundTemporalOptions(FunctionOptions):
+ def __init__(
+ self,
+ multiple: int = 1,
+ unit: _DateTimeUint = "day",
+ *,
+ week_starts_monday: bool = True,
+ ceil_is_strictly_greater: bool = False,
+ calendar_based_origin: bool = False,
+ ) -> None: ...
+
+
+class RoundToMultipleOptions(FunctionOptions):
+ def __init__(self, multiple: int | float | lib.Scalar = 1.0,
+ round_mode: _RoundMode = "half_to_even") -> None: ...
+
+
+class ScalarAggregateOptions(FunctionOptions):
+ def __init__(self, *, skip_nulls: bool = True, min_count: int = 1) -> None: ...
+
+
+class SelectKOptions(FunctionOptions):
+ def __init__(self, k: int, sort_keys: Sequence[tuple[str, _Order]]) -> None: ...
+
+
+class SetLookupOptions(FunctionOptions):
+ def __init__(self, value_set: lib.Array, *, skip_nulls: bool = True) -> None: ...
+
+
+class SliceOptions(FunctionOptions):
+ def __init__(
+ self, start: int, stop: int | None = None, step: int = 1) -> None: ...
+
+
+class SortOptions(FunctionOptions):
+ def __init__(
+ self,
+ sort_keys: Sequence[tuple[str, _Order]],
+ *,
+ null_placement: _Placement = "at_end"
+ ) -> None: ...
+
+
+class SplitOptions(FunctionOptions):
+ def __init__(self, *, max_splits: int | None = None,
+ reverse: bool = False) -> None: ...
+
+
+class SplitPatternOptions(FunctionOptions):
+ def __init__(
+ self, pattern: str, *, max_splits: int | None = None, reverse: bool = False
+ ) -> None: ...
+
+
+class StrftimeOptions(FunctionOptions):
+ def __init__(self, format: str = "%Y-%m-%dT%H:%M:%S",
+ locale: str = "C") -> None: ...
+
+
+class StrptimeOptions(FunctionOptions):
+ def __init__(self,
+ format: str,
+ unit: Literal["s",
+ "ms",
+ "us",
+ "ns"],
+ error_is_null: bool = False) -> None: ...
+
+
+class StructFieldOptions(FunctionOptions):
+ def __init__(self, indices: list[str] | list[bytes] |
+ list[int] | Expression | bytes | str | int) -> None: ...
+
+
+class TakeOptions(FunctionOptions):
+ def __init__(self, boundscheck: bool = True) -> None: ...
+
+
+class TDigestOptions(FunctionOptions):
+ def __init__(
+ self,
+ q: float | Sequence[float] = 0.5,
+ *,
+ delta: int = 100,
+ buffer_size: int = 500,
+ skip_nulls: bool = True,
+ min_count: int = 0,
+ ) -> None: ...
+
+
+class TrimOptions(FunctionOptions):
+ def __init__(self, characters: str) -> None: ...
+
+
+class Utf8NormalizeOptions(FunctionOptions):
+ def __init__(self, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> None: ...
+
+
+class VarianceOptions(FunctionOptions):
+ def __init__(self, *, ddof: int = 0, skip_nulls: bool = True,
+ min_count: int = 0) -> None: ...
+
+
+class SkewOptions(FunctionOptions):
+ def __init__(
+ self, *, skip_nulls: bool = True, biased: bool = True, min_count: int = 0
+ ) -> None: ...
+
+
+class WeekOptions(FunctionOptions):
+ def __init__(
+ self,
+ *,
+ week_starts_monday: bool = True,
+ count_from_zero: bool = False,
+ first_week_is_fully_in_year: bool = False,
+ ) -> None: ...
+
+
+class ZeroFillOptions(FunctionOptions):
+ def __init__(self, width: int, padding: str = "0") -> None: ...
+
+# ==================== _compute.pyx Functions ====================
+
+
+def call_function(
+ name: str,
+ args: list,
+ options: FunctionOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+ length: int | None = None,
+) -> Any: ...
+def function_registry() -> FunctionRegistry: ...
+def get_function(name: str) -> Function: ...
+def list_functions() -> list[str]: ...
+
+# ==================== _compute.pyx Udf ====================
+
+
+def call_tabular_function(
+ function_name: str,
+ args: Iterable | None = None,
+ func_registry: FunctionRegistry | None = None) -> lib.RecordBatchReader: ...
+
+
+class _FunctionDoc(TypedDict):
+ summary: str
+ description: str
+
+
+def register_scalar_function(
+ func: Callable | None,
+ function_name: str | None,
+ function_doc: _FunctionDoc | dict[str, str],
+ in_types: Mapping[str, lib.DataType] | None,
+ out_type: lib.DataType | None,
+ func_registry: FunctionRegistry | None = None,
+) -> None: ...
+
+
+def register_tabular_function(
+ func: Callable,
+ function_name: str,
+ function_doc: _FunctionDoc | dict[str, str],
+ in_types: Mapping[str, lib.DataType],
+ out_type: lib.DataType,
+ func_registry: FunctionRegistry | None = None,
+) -> None: ...
+
+
+def register_aggregate_function(
+ func: Callable,
+ function_name: str,
+ function_doc: _FunctionDoc | dict[str, str],
+ in_types: Mapping[str, lib.DataType],
+ out_type: lib.DataType,
+ func_registry: FunctionRegistry | None = None,
+) -> None: ...
+
+
+def register_vector_function(
+ func: Callable,
+ function_name: str,
+ function_doc: _FunctionDoc | dict[str, str],
+ in_types: Mapping[str, lib.DataType],
+ out_type: lib.DataType,
+ func_registry: FunctionRegistry | None = None,
+) -> None: ...
+
+
+class UdfContext:
+ @property
+ def batch_length(self) -> int: ...
+ @property
+ def memory_pool(self) -> lib.MemoryPool: ...
+
+
+def _get_udf_context(memory_pool: lib.MemoryPool, batch_length: int) -> UdfContext: ...
+
+# ==================== _compute.pyx Expression ====================
+
+
+class Expression(lib._Weakrefable):
+ @staticmethod
+ def from_substrait(buffer: bytes | lib.Buffer) -> Expression: ...
+
+ def to_substrait(self, schema: lib.Schema,
+ allow_arrow_extensions: bool = False) -> lib.Buffer: ...
+
+ @staticmethod
+ def _call(
+ func_name: str, args: list, options: FunctionOptions | None = None
+ ) -> Expression: ...
+
+ @staticmethod
+ def _field(name_or_index: str | int) -> Expression: ...
+
+ @staticmethod
+ def _nested_field(name: str) -> Expression: ...
+
+ @staticmethod
+ def _scalar(value: Any) -> Expression: ...
+
+ def __invert__(self) -> Expression: ...
+
+ def __and__(
+ self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ...
+
+ def __rand__(
+ self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ...
+
+ def __or__(
+ self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ...
+
+ def __ror__(
+ self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ...
+
+ def __add__(
+ self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ...
+
+ def __radd__(
+ self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ...
+
+ def __mul__(
+ self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ...
+
+ def __rmul__(
+ self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ...
+
+ def __sub__(
+ self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ...
+
+ def __rsub__(
+ self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ...
+
+ def __eq__(self, value: object) -> Expression: ... # type: ignore[override]
+ def __ne__(self, value: object) -> Expression: ... # type: ignore[override]
+ def __gt__(self, value: object) -> Expression: ...
+ def __lt__(self, value: object) -> Expression: ...
+ def __ge__(self, value: object) -> Expression: ...
+ def __le__(self, value: object) -> Expression: ...
+
+ def __truediv__(
+ self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ...
+
+ def __rtruediv__(
+ self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ...
+
+ def is_valid(self) -> Expression: ...
+ def is_null(self, nan_is_null: bool = False) -> Expression: ...
+ def is_nan(self) -> Expression: ...
+
+ def cast(
+ self,
+ type: lib.DataType | str, safe: bool = True, options: CastOptions | None = None
+ ) -> Expression: ...
+
+ def isin(self, values: lib.Array | Iterable | Any) -> Expression: ...
+ def equals(self, other: object) -> bool: ...
+
+ # Attributes and methods for materialized expressions (used in tests)
+ @property
+ def type(self) -> lib.DataType: ...
+ def to_pylist(self) -> list: ...
+ def to_numpy(self, zero_copy_only: bool = True, writable: bool = False) -> Any: ...
+ def to_pandas(self, **kwargs) -> Any: ...
+ def as_py(self) -> Any: ...
+ def tolist(self) -> list: ...
+ def slice(self, offset: int = 0, length: int | None = None) -> Expression: ...
+
+# ==================== _compute.py ====================
diff --git a/python/pyarrow-stubs/pyarrow/_compute_docstring.pyi b/python/pyarrow-stubs/pyarrow/_compute_docstring.pyi
new file mode 100644
index 00000000000..514a4e4269c
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_compute_docstring.pyi
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+function_doc_additions: dict[str, str]
diff --git a/python/pyarrow-stubs/pyarrow/_csv.pyi b/python/pyarrow-stubs/pyarrow/_csv.pyi
new file mode 100644
index 00000000000..6c911a8b0c1
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_csv.pyi
@@ -0,0 +1,132 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from collections.abc import Callable, Sequence
+from dataclasses import dataclass, field
+from typing import IO, Any, Literal
+
+from _typeshed import StrPath
+
+from . import lib
+
+
+@dataclass(kw_only=True)
+class ReadOptions(lib._Weakrefable):
+ use_threads: bool = field(default=True, kw_only=False) # noqa: Y015
+ block_size: int | float | None = None
+ skip_rows: int = 0
+ skip_rows_after_names: int = 0
+ column_names: Sequence[str] | None = None
+ autogenerate_column_names: bool = False
+ encoding: str = "utf8"
+ def validate(self) -> None: ...
+
+
+@dataclass(kw_only=True)
+class ParseOptions(lib._Weakrefable):
+ delimiter: str = field(default=",", kw_only=False) # noqa: Y015
+ quote_char: str | Literal[False] = '"'
+ double_quote: bool = True
+ escape_char: str | Literal[False] = False
+ newlines_in_values: bool = False
+ ignore_empty_lines: bool = True
+ invalid_row_handler: Callable[[InvalidRow], str] | None = None
+
+ def validate(self) -> None: ...
+
+
+@dataclass(kw_only=True)
+class ConvertOptions(lib._Weakrefable):
+ check_utf8: bool = field(default=True, kw_only=False) # noqa: Y015
+ column_types: lib.Schema | dict | Sequence[tuple[str, lib.DataType]] | None = None
+ null_values: list[str] | None = None
+ true_values: list[str] | None = None
+ false_values: list[str] | None = None
+ decimal_point: str = "."
+ strings_can_be_null: bool = False
+ quoted_strings_can_be_null: bool = True
+ include_columns: list[str] | None = None
+ include_missing_columns: bool = False
+ auto_dict_encode: bool = False
+ auto_dict_max_cardinality: int | None = None
+ timestamp_parsers: Sequence[str | lib._Weakrefable] | None = None
+
+ def validate(self) -> None: ...
+
+
+@dataclass(kw_only=True)
+class WriteOptions(lib._Weakrefable):
+ include_header: bool = field(default=True, kw_only=False) # noqa: Y015
+ batch_size: int = 1024
+ delimiter: str = ","
+ quoting_style: Literal["needed", "all_valid", "none"] = "needed"
+ quoting_header: Literal["needed", "all_valid", "none"] = "needed"
+
+ def validate(self) -> None: ...
+
+
+@dataclass
+class InvalidRow(lib._Weakrefable):
+ expected_columns: int
+ actual_columns: int
+ number: int | None
+ text: str
+
+
+class CSVWriter(lib._CRecordBatchWriter):
+ def __init__(
+ self,
+ # TODO: OutputStream
+ sink: StrPath | IO[Any],
+ schema: lib.Schema,
+ write_options: WriteOptions | None = None,
+ *,
+ memory_pool: lib.MemoryPool | None = None,
+ ) -> None: ...
+
+
+class CSVStreamingReader(lib.RecordBatchReader):
+ ...
+
+
+ISO8601: lib._Weakrefable
+
+
+def open_csv(
+ input_file: StrPath | IO[Any],
+ read_options: ReadOptions | None = None,
+ parse_options: ParseOptions | None = None,
+ convert_options: ConvertOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> CSVStreamingReader: ...
+
+
+def read_csv(
+ input_file: StrPath | IO[Any],
+ read_options: ReadOptions | None = None,
+ parse_options: ParseOptions | None = None,
+ convert_options: ConvertOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.Table: ...
+
+
+def write_csv(
+ data: lib.RecordBatch | lib.Table,
+ output_file: StrPath | lib.NativeFile | IO[Any],
+ write_options: WriteOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> None: ...
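
A short sketch of how these option classes are typically passed to the readers and writer declared above (in-memory buffers used so the snippet stays self-contained):

    import io

    import pyarrow as pa
    from pyarrow import csv

    source = io.BytesIO(b"a;b\n1;x\n2;y\n")
    table = csv.read_csv(
        source,
        parse_options=csv.ParseOptions(delimiter=";"),
        convert_options=csv.ConvertOptions(column_types={"a": pa.int32()}),
    )

    sink = io.BytesIO()
    csv.write_csv(table, sink, write_options=csv.WriteOptions(include_header=True))
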
diff --git a/python/pyarrow-stubs/pyarrow/_cuda.pyi b/python/pyarrow-stubs/pyarrow/_cuda.pyi
new file mode 100644
index 00000000000..d484fc5cf5f
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_cuda.pyi
@@ -0,0 +1,158 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from typing import Any
+
+import cuda # type: ignore[import-not-found]
+
+from numba.cuda.cudadrv import driver as _numba_driver # type: ignore[import-untyped, import-not-found] # noqa: E501
+
+from . import lib
+from ._stubs_typing import ArrayLike
+
+
+class Context(lib._Weakrefable):
+ def __init__(self, device_number: int = 0, handle: int | None = None) -> None: ...
+
+ @staticmethod
+ def from_numba(context: _numba_driver.Context | None = None) -> Context: ...
+
+ def to_numba(self) -> _numba_driver.Context: ...
+
+ @staticmethod
+ def get_num_devices() -> int: ...
+
+ @property
+ def device_number(self) -> int: ...
+
+ @property
+ def handle(self) -> int: ...
+
+ def synchronize(self) -> None: ...
+
+ @property
+ def bytes_allocated(self) -> int: ...
+
+ def get_device_address(self, address: int) -> int: ...
+
+ def new_buffer(self, nbytes: int) -> CudaBuffer: ...
+
+ @property
+ def memory_manager(self) -> lib.MemoryManager: ...
+
+ @property
+ def device(self) -> lib.Device: ...
+
+ def foreign_buffer(self, address: int, size: int, base: Any |
+ None = None) -> CudaBuffer: ...
+
+ def open_ipc_buffer(self, ipc_handle: IpcMemHandle) -> CudaBuffer: ...
+
+ def buffer_from_data(
+ self,
+ data: CudaBuffer | HostBuffer | lib.Buffer | ArrayLike,
+ offset: int = 0,
+ size: int = -1,
+ ) -> CudaBuffer: ...
+
+ def buffer_from_object(self, obj: Any) -> CudaBuffer: ...
+
+
+class IpcMemHandle(lib._Weakrefable):
+ @staticmethod
+ def from_buffer(opaque_handle: lib.Buffer) -> IpcMemHandle: ...
+
+ def serialize(self, pool: lib.MemoryPool | None = None) -> lib.Buffer: ...
+
+
+class CudaBuffer(lib.Buffer):
+ @staticmethod
+ def from_buffer(buf: lib.Buffer) -> CudaBuffer: ...
+
+ @staticmethod
+ def from_numba(mem: _numba_driver.MemoryPointer) -> CudaBuffer: ...
+
+ def to_numba(self) -> _numba_driver.MemoryPointer: ...
+
+ def copy_to_host(
+ self,
+ position: int = 0,
+ nbytes: int = -1,
+ buf: lib.Buffer | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+ resizable: bool = False,
+ ) -> lib.Buffer: ...
+
+ def copy_from_host(
+ self, data: lib.Buffer | ArrayLike, position: int = 0, nbytes: int = -1
+ ) -> int: ...
+
+ def copy_from_device(self, buf: CudaBuffer, position: int = 0,
+ nbytes: int = -1) -> int: ...
+
+ def export_for_ipc(self) -> IpcMemHandle: ...
+
+ @property
+ def context(self) -> Context: ...
+
+ def slice(self, offset: int = 0, length: int | None = None) -> CudaBuffer: ...
+
+ def to_pybytes(self) -> bytes: ...
+
+
+class HostBuffer(lib.Buffer):
+ @property
+ def size(self) -> int: ...
+
+
+class BufferReader(lib.NativeFile):
+ def __init__(self, obj: CudaBuffer) -> None: ...
+ def read_buffer(self, nbytes: int | None = None) -> CudaBuffer: ...
+
+
+class BufferWriter(lib.NativeFile):
+ def __init__(self, obj: CudaBuffer) -> None: ...
+ def writeat(self, position: int, data: ArrayLike) -> None: ...
+
+ @property
+ def buffer_size(self) -> int: ...
+
+ @buffer_size.setter
+ def buffer_size(self, buffer_size: int) -> None: ...
+
+ @property
+ def num_bytes_buffered(self) -> int: ...
+
+
+def new_host_buffer(size: int, device: int = 0) -> HostBuffer: ...
+
+
+def serialize_record_batch(batch: lib.RecordBatch, ctx: Context) -> CudaBuffer: ...
+
+
+def read_message(
+ source: CudaBuffer | cuda.BufferReader, pool: lib.MemoryManager | None = None
+) -> lib.Message: ...
+
+
+def read_record_batch(
+ buffer: lib.Buffer,
+ schema: lib.Schema,
+ *,
+ dictionary_memo: lib.DictionaryMemo | None = None,
+ pool: lib.MemoryPool | None = None,
+) -> lib.RecordBatch: ...
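
These declarations only matter on CUDA-enabled builds; a hedged sketch of the round trip they describe (requires pyarrow.cuda and an available GPU):

    import numpy as np
    from pyarrow import cuda

    ctx = cuda.Context(0)                      # device 0
    host = np.arange(10, dtype=np.int64)

    dbuf = ctx.new_buffer(host.nbytes)         # device-side allocation
    dbuf.copy_from_host(host, position=0, nbytes=host.nbytes)

    # copy_to_host returns a regular pyarrow Buffer usable on the CPU side.
    roundtrip = np.frombuffer(dbuf.copy_to_host(), dtype=np.int64)
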
diff --git a/python/pyarrow-stubs/pyarrow/_dataset.pyi b/python/pyarrow-stubs/pyarrow/_dataset.pyi
new file mode 100644
index 00000000000..c8cd3d97089
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_dataset.pyi
@@ -0,0 +1,682 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+
+if sys.version_info >= (3, 11):
+ from typing import Self
+else:
+ from typing_extensions import Self
+from collections.abc import Collection, Callable, Iterator, Iterable
+from typing import (
+ IO,
+ Any,
+ Generic,
+ Literal,
+ NamedTuple,
+ TypeVar,
+)
+
+from _typeshed import StrPath
+
+from . import csv, _json, _parquet, lib
+from ._fs import FileSelector, FileSystem, SupportedFileSystem
+from ._stubs_typing import Indices, JoinType, Order
+from .acero import ExecNodeOptions
+from .compute import Expression
+from .ipc import IpcWriteOptions, RecordBatchReader
+
+
+class Dataset(lib._Weakrefable):
+ @property
+ def partition_expression(self) -> Expression: ...
+
+ def replace_schema(self, schema: lib.Schema) -> Self: ...
+
+ def get_fragments(
+ self, filter: Expression | None = None) -> Iterator[Fragment]: ...
+
+ def scanner(
+ self,
+ columns: list[str] | dict[str, Expression] | None = None,
+ filter: Expression | None = None,
+ batch_size: int = ...,
+ batch_readahead: int = 16,
+ fragment_readahead: int = 4,
+ fragment_scan_options: FragmentScanOptions | None = None,
+ use_threads: bool = True,
+ cache_metadata: bool = True,
+ memory_pool: lib.MemoryPool | None = None,
+ ) -> Scanner: ...
+
+ def to_batches(
+ self,
+ columns: list[str] | None = None,
+ filter: Expression | None = None,
+ batch_size: int = ...,
+ batch_readahead: int = 16,
+ fragment_readahead: int = 4,
+ fragment_scan_options: FragmentScanOptions | None = None,
+ use_threads: bool = True,
+ cache_metadata: bool = True,
+ memory_pool: lib.MemoryPool | None = None,
+ ) -> Iterator[lib.RecordBatch]: ...
+
+ def to_table(
+ self,
+ columns: list[str] | dict[str, Expression] | None = None,
+ filter: Expression | None = None,
+ batch_size: int = ...,
+ batch_readahead: int = 16,
+ fragment_readahead: int = 4,
+ fragment_scan_options: FragmentScanOptions | None = None,
+ use_threads: bool = True,
+ cache_metadata: bool = True,
+ memory_pool: lib.MemoryPool | None = None,
+ ) -> lib.Table: ...
+
+ def take(
+ self,
+ indices: Indices,
+ columns: list[str] | None = None,
+ filter: Expression | None = None,
+ batch_size: int = ...,
+ batch_readahead: int = 16,
+ fragment_readahead: int = 4,
+ fragment_scan_options: FragmentScanOptions | None = None,
+ use_threads: bool = True,
+ cache_metadata: bool = True,
+ memory_pool: lib.MemoryPool | None = None,
+ ) -> lib.Table: ...
+
+ def head(
+ self,
+ num_rows: int,
+ columns: list[str] | None = None,
+ filter: Expression | None = None,
+ batch_size: int = ...,
+ batch_readahead: int = 16,
+ fragment_readahead: int = 4,
+ fragment_scan_options: FragmentScanOptions | None = None,
+ use_threads: bool = True,
+ cache_metadata: bool = True,
+ memory_pool: lib.MemoryPool | None = None,
+ ) -> lib.Table: ...
+
+ def count_rows(
+ self,
+ filter: Expression | None = None,
+ batch_size: int = ...,
+ batch_readahead: int = 16,
+ fragment_readahead: int = 4,
+ fragment_scan_options: FragmentScanOptions | None = None,
+ use_threads: bool = True,
+ cache_metadata: bool = True,
+ memory_pool: lib.MemoryPool | None = None,
+ ) -> int: ...
+
+ @property
+ def schema(self) -> lib.Schema: ...
+
+ def filter(self, expression: Expression | None) -> Self: ...
+
+ def sort_by(self, sorting: str |
+ list[tuple[str, Order]], **kwargs) -> InMemoryDataset: ...
+
+ def join(
+ self,
+ right_dataset: Dataset,
+ keys: str | list[str],
+ right_keys: str | list[str] | None = None,
+ join_type: JoinType = "left outer",
+ left_suffix: str | None = None,
+ right_suffix: str | None = None,
+ coalesce_keys: bool = True,
+ use_threads: bool = True,
+ ) -> InMemoryDataset: ...
+
+ def join_asof(
+ self,
+ right_dataset: Dataset,
+ on: str,
+ by: str | list[str],
+ tolerance: int,
+ right_on: str | list[str] | None = None,
+ right_by: str | list[str] | None = None,
+ ) -> InMemoryDataset: ...
+
+ @property
+ def format(self) -> FileFormat: ...
+
+
+class InMemoryDataset(Dataset):
+ def __init__(
+ self,
+ source: lib.Table
+ | lib.RecordBatch
+ | lib.RecordBatchReader
+ | Iterable[lib.RecordBatch]
+ | list[Any],
+ schema: lib.Schema | None = None,
+ ) -> None: ...
+
+
+class UnionDataset(Dataset):
+ def __init__(
+ self,
+ schema: lib.Schema | None = None,
+ children: list[Dataset] | None = None,
+ ) -> None: ...
+
+ @property
+ def children(self) -> list[Dataset]: ...
+
+
+class FileSystemDataset(Dataset):
+ def __init__(
+ self,
+ fragments: list[Fragment],
+ schema: lib.Schema,
+ format: FileFormat,
+ filesystem: SupportedFileSystem | None = None,
+ root_partition: Expression | None = None,
+ ) -> None: ...
+
+ @classmethod
+ def from_paths(
+ cls,
+ paths: list[str],
+ schema: lib.Schema | None = None,
+ format: FileFormat | None = None,
+ filesystem: SupportedFileSystem | None = None,
+ partitions: list[Expression] | None = None,
+ root_partition: Expression | None = None,
+ ) -> FileSystemDataset: ...
+
+ @property
+ def filesystem(self) -> FileSystem: ...
+ @property
+ def partitioning(self) -> Partitioning | None: ...
+
+ @property
+ def files(self) -> list[str]: ...
+
+
+class FileWriteOptions(lib._Weakrefable):
+ @property
+ def format(self) -> FileFormat: ...
+
+
+class FileFormat(lib._Weakrefable):
+ def inspect(
+ self, file: StrPath | IO, filesystem: SupportedFileSystem | None = None
+ ) -> lib.Schema: ...
+
+ def make_fragment(
+ self,
+ file: StrPath | IO | lib.Buffer | lib.BufferReader,
+ filesystem: SupportedFileSystem | None = None,
+ partition_expression: Expression | None = None,
+ *,
+ file_size: int | None = None,
+ ) -> Fragment: ...
+
+ def make_write_options(self) -> FileWriteOptions: ...
+ @property
+ def default_extname(self) -> str: ...
+ @property
+ def default_fragment_scan_options(self) -> FragmentScanOptions: ...
+ @default_fragment_scan_options.setter
+ def default_fragment_scan_options(self, options: FragmentScanOptions) -> None: ...
+
+
+class Fragment(lib._Weakrefable):
+ def open(self) -> lib.NativeFile | lib.BufferReader: ...
+ @property
+ def path(self) -> str: ...
+ @property
+ def row_groups(self) -> list[int]: ...
+
+ @property
+ def filesystem(self) -> SupportedFileSystem: ...
+
+ @property
+ def physical_schema(self) -> lib.Schema: ...
+
+ @property
+ def partition_expression(self) -> Expression: ...
+
+ def scanner(
+ self,
+ schema: lib.Schema | None = None,
+ columns: list[str] | None = None,
+ filter: Expression | None = None,
+ batch_size: int = ...,
+ batch_readahead: int = 16,
+ fragment_readahead: int = 4,
+ fragment_scan_options: FragmentScanOptions | None = None,
+ use_threads: bool = True,
+ cache_metadata: bool = True,
+ memory_pool: lib.MemoryPool | None = None,
+ ) -> Scanner: ...
+
+ def to_batches(
+ self,
+ schema: lib.Schema | None = None,
+ columns: list[str] | None = None,
+ filter: Expression | None = None,
+ batch_size: int = ...,
+ batch_readahead: int = 16,
+ fragment_readahead: int = 4,
+ fragment_scan_options: FragmentScanOptions | None = None,
+ use_threads: bool = True,
+ cache_metadata: bool = True,
+ memory_pool: lib.MemoryPool | None = None,
+ ) -> Iterator[lib.RecordBatch]: ...
+
+ def to_table(
+ self,
+ schema: lib.Schema | None = None,
+ columns: list[str] | None = None,
+ filter: Expression | None = None,
+ batch_size: int = ...,
+ batch_readahead: int = 16,
+ fragment_readahead: int = 4,
+ fragment_scan_options: FragmentScanOptions | None = None,
+ use_threads: bool = True,
+ cache_metadata: bool = True,
+ memory_pool: lib.MemoryPool | None = None,
+ ) -> lib.Table: ...
+
+ def take(
+ self,
+ indices: Indices,
+ columns: list[str] | None = None,
+ filter: Expression | None = None,
+ batch_size: int = ...,
+ batch_readahead: int = 16,
+ fragment_readahead: int = 4,
+ fragment_scan_options: FragmentScanOptions | None = None,
+ use_threads: bool = True,
+ cache_metadata: bool = True,
+ memory_pool: lib.MemoryPool | None = None,
+ ) -> lib.Table: ...
+
+ def head(
+ self,
+ num_rows: int,
+ columns: list[str] | None = None,
+ filter: Expression | None = None,
+ batch_size: int = ...,
+ batch_readahead: int = 16,
+ fragment_readahead: int = 4,
+ fragment_scan_options: FragmentScanOptions | None = None,
+ use_threads: bool = True,
+ cache_metadata: bool = True,
+ memory_pool: lib.MemoryPool | None = None,
+ ) -> lib.Table: ...
+
+ def count_rows(
+ self,
+ columns: list[str] | None = None,
+ filter: Expression | None = None,
+ batch_size: int = ...,
+ batch_readahead: int = 16,
+ fragment_readahead: int = 4,
+ fragment_scan_options: FragmentScanOptions | None = None,
+ use_threads: bool = True,
+ cache_metadata: bool = True,
+ memory_pool: lib.MemoryPool | None = None,
+ ) -> int: ...
+
+
+class FileFragment(Fragment):
+ def open(self) -> lib.NativeFile: ...
+
+ @property
+ def path(self) -> str: ...
+
+ @property
+ def filesystem(self) -> FileSystem: ...
+
+ @property
+ def buffer(self) -> lib.Buffer: ...
+
+ @property
+ def format(self) -> FileFormat: ...
+
+
+class FragmentScanOptions(lib._Weakrefable):
+ @property
+ def type_name(self) -> str: ...
+
+
+class IpcFileWriteOptions(FileWriteOptions):
+ @property
+ def write_options(self) -> IpcWriteOptions: ...
+ @write_options.setter
+ def write_options(self, write_options: IpcWriteOptions) -> None: ...
+
+
+class IpcFileFormat(FileFormat):
+ def equals(self, other: IpcFileFormat) -> bool: ...
+ def make_write_options(self, **kwargs) -> IpcFileWriteOptions: ...
+ @property
+ def default_extname(self) -> str: ...
+
+
+class FeatherFileFormat(IpcFileFormat):
+ ...
+
+
+class CsvFileFormat(FileFormat):
+ def __init__(
+ self,
+ parse_options: csv.ParseOptions | None = None,
+ default_fragment_scan_options: CsvFragmentScanOptions | None = None,
+ convert_options: csv.ConvertOptions | None = None,
+ read_options: csv.ReadOptions | None = None,
+ ) -> None: ...
+ def make_write_options(
+ self, **kwargs) -> CsvFileWriteOptions: ... # type: ignore[override]
+
+ @property
+ def parse_options(self) -> csv.ParseOptions: ...
+ @parse_options.setter
+ def parse_options(self, parse_options: csv.ParseOptions) -> None: ...
+ def equals(self, other: CsvFileFormat) -> bool: ...
+
+
+class CsvFragmentScanOptions(FragmentScanOptions):
+ convert_options: csv.ConvertOptions
+ read_options: csv.ReadOptions
+
+ def __init__(
+ self,
+ convert_options: csv.ConvertOptions | None = None,
+ read_options: csv.ReadOptions | None = None,
+ ) -> None: ...
+ def equals(self, other: CsvFragmentScanOptions) -> bool: ...
+
+
+class CsvFileWriteOptions(FileWriteOptions):
+ write_options: csv.WriteOptions
+
+
+class JsonFileFormat(FileFormat):
+ def __init__(
+ self,
+ default_fragment_scan_options: JsonFragmentScanOptions | None = None,
+ parse_options: _json.ParseOptions | None = None,
+ read_options: _json.ReadOptions | None = None,
+ ) -> None: ...
+ def equals(self, other: JsonFileFormat) -> bool: ...
+
+
+class JsonFragmentScanOptions(FragmentScanOptions):
+ parse_options: _json.ParseOptions
+ read_options: _json.ReadOptions
+
+ def __init__(
+ self,
+ parse_options: _json.ParseOptions | None = None,
+ read_options: _json.ReadOptions | None = None,
+ ) -> None: ...
+ def equals(self, other: JsonFragmentScanOptions) -> bool: ...
+
+
+class Partitioning(lib._Weakrefable):
+ def parse(self, path: str) -> Expression: ...
+
+ def format(self, expr: Expression) -> tuple[str, str]: ...
+
+ @property
+ def schema(self) -> lib.Schema: ...
+
+ @property
+ def dictionaries(self) -> list[Any]: ...
+
+
+class PartitioningFactory(lib._Weakrefable):
+ @property
+ def type_name(self) -> str: ...
+
+
+class KeyValuePartitioning(Partitioning):
+ @property
+ def dictionaries(self) -> list[Any]: ...
+
+
+class DirectoryPartitioning(KeyValuePartitioning):
+ @staticmethod
+ def discover(
+ field_names: list[str] | None = None,
+ infer_dictionary: bool = False,
+ max_partition_dictionary_size: int = 0,
+ schema: lib.Schema | None = None,
+ segment_encoding: Literal["uri", "none"] = "uri",
+ ) -> PartitioningFactory: ...
+
+ def __init__(
+ self,
+ schema: lib.Schema,
+ dictionaries: dict[str, lib.Array] | None = None,
+ segment_encoding: Literal["uri", "none"] = "uri",
+ ) -> None: ...
+
+
+class HivePartitioning(KeyValuePartitioning):
+ def __init__(
+ self,
+ schema: lib.Schema,
+ dictionaries: dict[str, lib.Array] | None = None,
+ null_fallback: str = "__HIVE_DEFAULT_PARTITION__",
+ segment_encoding: Literal["uri", "none"] = "uri",
+ ) -> None: ...
+
+ @staticmethod
+ def discover(
+ infer_dictionary: bool = False,
+ max_partition_dictionary_size: int = 0,
+ null_fallback="__HIVE_DEFAULT_PARTITION__",
+ schema: lib.Schema | None = None,
+ segment_encoding: Literal["uri", "none"] = "uri",
+ ) -> PartitioningFactory: ...
+
+
+class FilenamePartitioning(KeyValuePartitioning):
+ def __init__(
+ self,
+ schema: lib.Schema,
+ dictionaries: dict[str, lib.Array] | None = None,
+ segment_encoding: Literal["uri", "none"] = "uri",
+ ) -> None: ...
+
+ @staticmethod
+ def discover(
+ field_names: list[str] | None = None,
+ infer_dictionary: bool = False,
+ schema: lib.Schema | None = None,
+ segment_encoding: Literal["uri", "none"] = "uri",
+ ) -> PartitioningFactory: ...
+
+
+class DatasetFactory(lib._Weakrefable):
+ root_partition: Expression
+ def finish(self, schema: lib.Schema | None = None) -> Dataset: ...
+
+ def inspect(
+ self,
+ *,
+ promote_options: str = "default",
+ fragments: list[Fragment] | int | str | None = None,
+ ) -> lib.Schema: ...
+
+ def inspect_schemas(self) -> list[lib.Schema]: ...
+
+
+class FileSystemFactoryOptions(lib._Weakrefable):
+ partitioning: Partitioning
+ partitioning_factory: PartitioningFactory
+ partition_base_dir: str
+ exclude_invalid_files: bool
+ selector_ignore_prefixes: list[str]
+
+ def __init__(
+ self,
+ partition_base_dir: str | None = None,
+ partitioning: Partitioning | PartitioningFactory | None = None,
+ exclude_invalid_files: bool | None = True,
+ selector_ignore_prefixes: list[str] | None = None,
+ ) -> None: ...
+
+
+class FileSystemDatasetFactory(DatasetFactory):
+ def __init__(
+ self,
+ filesystem: SupportedFileSystem,
+ paths_or_selector: Collection[str] | FileSelector,
+ format: FileFormat,
+ options: FileSystemFactoryOptions | None = None,
+ ) -> None: ...
+
+
+class UnionDatasetFactory(DatasetFactory):
+ def __init__(self, factories: list[DatasetFactory]) -> None: ...
+
+
+_RecordBatchT = TypeVar("_RecordBatchT", bound=lib.RecordBatch)
+
+
+class RecordBatchIterator(lib._Weakrefable, Generic[_RecordBatchT]):
+ def __iter__(self) -> Self: ...
+ def __next__(self) -> _RecordBatchT: ...
+
+
+class TaggedRecordBatch(NamedTuple):
+ record_batch: lib.RecordBatch
+ fragment: Fragment
+
+
+class TaggedRecordBatchIterator(lib._Weakrefable):
+ def __iter__(self) -> Self: ...
+ def __next__(self) -> TaggedRecordBatch: ...
+
+
+class Scanner(lib._Weakrefable):
+ @staticmethod
+ def from_dataset(
+ dataset: Dataset,
+ *,
+ columns: list[str] | dict[str, Expression] | None = None,
+ filter: Expression | None = None,
+ batch_size: int = ...,
+ batch_readahead: int = 16,
+ fragment_readahead: int = 4,
+ fragment_scan_options: FragmentScanOptions | None = None,
+ use_threads: bool = True,
+ cache_metadata: bool = True,
+ memory_pool: lib.MemoryPool | None = None,
+ ) -> Scanner: ...
+
+ @staticmethod
+ def from_fragment(
+ fragment: Fragment,
+ *,
+ schema: lib.Schema | None = None,
+ columns: list[str] | dict[str, Expression] | None = None,
+ filter: Expression | None = None,
+ batch_size: int = ...,
+ batch_readahead: int = 16,
+ fragment_readahead: int = 4,
+ fragment_scan_options: FragmentScanOptions | None = None,
+ use_threads: bool = True,
+ cache_metadata: bool = True,
+ memory_pool: lib.MemoryPool | None = None,
+ ) -> Scanner: ...
+
+ @staticmethod
+ def from_batches(
+ source: Iterator[lib.RecordBatch] | RecordBatchReader | Any,
+ *,
+ schema: lib.Schema | None = None,
+ columns: list[str] | dict[str, Expression] | None = None,
+ filter: Expression | None = None,
+ batch_size: int = ...,
+ batch_readahead: int = 16,
+ fragment_readahead: int = 4,
+ fragment_scan_options: FragmentScanOptions | None = None,
+ use_threads: bool = True,
+ cache_metadata: bool = True,
+ memory_pool: lib.MemoryPool | None = None,
+ ) -> Scanner: ...
+
+ @property
+ def dataset_schema(self) -> lib.Schema: ...
+
+ @property
+ def projected_schema(self) -> lib.Schema: ...
+
+ def to_batches(self) -> Iterator[lib.RecordBatch]: ...
+
+ def scan_batches(self) -> TaggedRecordBatchIterator: ...
+
+ def to_table(self) -> lib.Table: ...
+
+ def take(self, indices: Indices) -> lib.Table: ...
+
+ def head(self, num_rows: int) -> lib.Table: ...
+
+ def count_rows(self) -> int: ...
+
+ def to_reader(self) -> RecordBatchReader: ...
+
+
+def get_partition_keys(partition_expression: Expression) -> dict[str, Any]: ...
+
+
+class WrittenFile(lib._Weakrefable):
+ def __init__(self, path: str, metadata: _parquet.FileMetaData |
+ None, size: int) -> None: ...
+
+
+def _filesystemdataset_write(
+ data: Scanner,
+ base_dir: StrPath,
+ basename_template: str,
+ filesystem: SupportedFileSystem,
+ partitioning: Partitioning,
+ preserve_order: bool,
+ file_options: FileWriteOptions,
+ max_partitions: int,
+ file_visitor: Callable[[str], None] | None,
+ existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"],
+ max_open_files: int,
+ max_rows_per_file: int,
+ min_rows_per_group: int,
+ max_rows_per_group: int,
+ create_dir: bool,
+) -> None: ...
+
+
+class _ScanNodeOptions(ExecNodeOptions):
+ def _set_options(self, dataset: Dataset, scan_options: dict) -> None: ...
+
+
+class ScanNodeOptions(_ScanNodeOptions):
+ def __init__(
+ self, dataset: Dataset, require_sequenced_output: bool = False, **kwargs
+ ) -> None: ...
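
A sketch of how the Dataset/Scanner surface above is reached through the high-level pyarrow.dataset helpers (a temporary directory keeps the snippet self-contained):

    import tempfile

    import pyarrow as pa
    import pyarrow.dataset as ds

    root = tempfile.mkdtemp()
    table = pa.table({"year": [2022, 2023, 2023], "value": [1.0, 2.0, 3.0]})
    ds.write_dataset(
        table, root, format="parquet",
        partitioning=ds.partitioning(pa.schema([("year", pa.int64())]), flavor="hive"),
    )

    dataset = ds.dataset(root, format="parquet", partitioning="hive")
    scanner = dataset.scanner(columns=["value"], filter=ds.field("year") == 2023)
    filtered = scanner.to_table()
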
diff --git a/python/pyarrow-stubs/pyarrow/_dataset_orc.pyi b/python/pyarrow-stubs/pyarrow/_dataset_orc.pyi
new file mode 100644
index 00000000000..62f49bf5d30
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_dataset_orc.pyi
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from ._dataset import FileFormat
+
+
+class OrcFileFormat(FileFormat):
+ def equals(self, other: OrcFileFormat) -> bool: ...
+ @property
+ def default_extname(self) -> str: ...
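
OrcFileFormat is only available on ORC-enabled builds; it is normally reached through the high-level helper (the path below is purely illustrative):

    import pyarrow.dataset as ds

    fmt = ds.OrcFileFormat()
    # dataset = ds.dataset("path/to/orc_dir", format=fmt)
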
diff --git a/python/pyarrow-stubs/pyarrow/_dataset_parquet.pyi b/python/pyarrow-stubs/pyarrow/_dataset_parquet.pyi
new file mode 100644
index 00000000000..6c27e3c8a93
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_dataset_parquet.pyi
@@ -0,0 +1,200 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from collections.abc import Iterable
+from dataclasses import dataclass
+from typing import IO, Any, TypedDict
+
+from _typeshed import StrPath
+
+from ._compute import Expression
+from ._dataset import (
+ DatasetFactory,
+ FileFormat,
+ FileFragment,
+ FileWriteOptions,
+ Fragment,
+ FragmentScanOptions,
+ Partitioning,
+ PartitioningFactory,
+)
+from ._dataset_parquet_encryption import ParquetDecryptionConfig
+from ._fs import SupportedFileSystem
+from ._parquet import FileDecryptionProperties, FileMetaData
+from ._types import DataType, LargeListType, ListType
+from .lib import CacheOptions, Schema, _Weakrefable, NativeFile, Buffer, BufferReader
+
+parquet_encryption_enabled: bool
+
+
+class ParquetFileFormat(FileFormat):
+ def __init__(
+ self,
+ read_options: ParquetReadOptions | None = None,
+ default_fragment_scan_options: ParquetFragmentScanOptions | None = None,
+ *,
+ pre_buffer: bool = True,
+ coerce_int96_timestamp_unit: str | None = None,
+ thrift_string_size_limit: int | None = None,
+ thrift_container_size_limit: int | None = None,
+ page_checksum_verification: bool = False,
+ arrow_extensions_enabled: bool = True,
+ binary_type: DataType | None = None,
+ list_type: type[ListType | LargeListType] | None = None,
+ use_buffered_stream: bool = False,
+ buffer_size: int = 8192,
+ dictionary_columns: list[str] | set[str] | None = None,
+ decryption_properties: FileDecryptionProperties | None = None,
+ ) -> None: ...
+ @property
+ def read_options(self) -> ParquetReadOptions: ...
+ def make_write_options(
+ self, **kwargs) -> ParquetFileWriteOptions: ... # type: ignore[override]
+
+ def equals(self, other: ParquetFileFormat) -> bool: ...
+ @property
+ def default_extname(self) -> str: ...
+
+ def make_fragment(
+ self,
+ file: StrPath | IO | Buffer | BufferReader,
+ filesystem: SupportedFileSystem | None = None,
+ partition_expression: Expression | None = None,
+ row_groups: Iterable[int] | None = None,
+ *,
+ file_size: int | None = None,
+ ) -> Fragment: ...
+
+
+class _NameStats(TypedDict):
+ min: Any
+ max: Any
+
+
+class RowGroupInfo:
+ id: int
+ metadata: FileMetaData
+ schema: Schema
+
+ def __init__(self, id: int, metadata: FileMetaData, schema: Schema) -> None: ...
+ @property
+ def num_rows(self) -> int: ...
+ @property
+ def total_byte_size(self) -> int: ...
+ @property
+ def statistics(self) -> dict[str, _NameStats]: ...
+
+
+class ParquetFileFragment(FileFragment):
+ def ensure_complete_metadata(self) -> None: ...
+ @property
+ def path(self) -> str: ...
+ @property
+ def filesystem(self) -> SupportedFileSystem: ...
+ def open(self) -> NativeFile: ...
+
+ @property
+ def row_groups(self) -> list[int]: ...
+ @property
+ def metadata(self) -> FileMetaData: ...
+ @property
+ def num_row_groups(self) -> int: ...
+
+ def split_by_row_group(
+ self, filter: Expression | None = None, schema: Schema | None = None
+ ) -> list[Fragment]: ...
+
+ def subset(
+ self,
+ filter: Expression | None = None,
+ schema: Schema | None = None,
+ row_group_ids: list[int] | None = None,
+ ) -> ParquetFileFormat: ...
+
+
+class ParquetReadOptions(_Weakrefable):
+ def __init__(
+ self,
+ dictionary_columns: list[str] | set[str] | None = None,
+ coerce_int96_timestamp_unit: str | None = None,
+ binary_type: DataType | None = None,
+ list_type: type[ListType | LargeListType] | None = None,
+ ) -> None: ...
+
+ @property
+ def dictionary_columns(self) -> set[str]: ...
+ @dictionary_columns.setter
+ def dictionary_columns(self, columns: list[str] | set[str]) -> None: ...
+
+ @property
+ def coerce_int96_timestamp_unit(self) -> str: ...
+ @coerce_int96_timestamp_unit.setter
+ def coerce_int96_timestamp_unit(self, unit: str) -> None: ...
+
+ @property
+ def binary_type(self) -> DataType: ...
+ @binary_type.setter
+ def binary_type(self, type: DataType | None) -> None: ...
+
+ @property
+ def list_type(self) -> type[ListType | LargeListType]: ...
+ @list_type.setter
+ def list_type(self, type: type[ListType | LargeListType] | None) -> None: ...
+
+ def equals(self, other: ParquetReadOptions) -> bool: ...
+
+
+class ParquetFileWriteOptions(FileWriteOptions):
+ def update(self, **kwargs) -> None: ...
+ def _set_properties(self) -> None: ...
+ def _set_arrow_properties(self) -> None: ...
+ def _set_encryption_config(self) -> None: ...
+ # accept passthrough options used in tests
+ def __init__(self, **kwargs) -> None: ...
+
+
+@dataclass(kw_only=True)
+class ParquetFragmentScanOptions(FragmentScanOptions):
+ use_buffered_stream: bool = False
+ buffer_size: int = 8192
+ pre_buffer: bool = True
+ cache_options: CacheOptions | None = None
+ thrift_string_size_limit: int | None = None
+ thrift_container_size_limit: int | None = None
+ decryption_config: ParquetDecryptionConfig | None = None
+ decryption_properties: FileDecryptionProperties | None = None
+ page_checksum_verification: bool = False
+
+ def equals(self, other: ParquetFragmentScanOptions) -> bool: ...
+
+
+@dataclass
+class ParquetFactoryOptions(_Weakrefable):
+
+ partition_base_dir: str | None = None
+ partitioning: Partitioning | PartitioningFactory | None = None
+ validate_column_chunk_paths: bool = False
+
+
+class ParquetDatasetFactory(DatasetFactory):
+ def __init__(
+ self,
+ metadata_path: str,
+ filesystem: SupportedFileSystem,
+ format: FileFormat,
+ options: ParquetFactoryOptions | None = None,
+ ) -> None: ...
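
A sketch of how the Parquet-specific format and scan options above are combined (construction only; the commented path is illustrative):

    import pyarrow.dataset as ds

    scan_opts = ds.ParquetFragmentScanOptions(
        use_buffered_stream=True,   # stream column chunks instead of whole row groups
        buffer_size=1 << 20,
        pre_buffer=False,
    )
    fmt = ds.ParquetFileFormat(default_fragment_scan_options=scan_opts)
    # dataset = ds.dataset("path/to/parquet_dir", format=fmt)
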
diff --git a/python/pyarrow-stubs/pyarrow/_dataset_parquet_encryption.pyi b/python/pyarrow-stubs/pyarrow/_dataset_parquet_encryption.pyi
new file mode 100644
index 00000000000..b36f18522e5
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_dataset_parquet_encryption.pyi
@@ -0,0 +1,58 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from ._dataset_parquet import ParquetFileWriteOptions, ParquetFragmentScanOptions
+from ._parquet import FileDecryptionProperties
+from ._parquet_encryption import (CryptoFactory, EncryptionConfiguration,
+ DecryptionConfiguration, KmsConnectionConfig)
+from .lib import _Weakrefable
+
+
+class ParquetEncryptionConfig(_Weakrefable):
+ def __init__(
+ self,
+ crypto_factory: CryptoFactory,
+ kms_connection_config: KmsConnectionConfig,
+ encryption_config: EncryptionConfiguration,
+ ) -> None: ...
+
+
+class ParquetDecryptionConfig(_Weakrefable):
+ def __init__(
+ self,
+ crypto_factory: CryptoFactory,
+ kms_connection_config: KmsConnectionConfig,
+ decryption_config: DecryptionConfiguration,
+ ) -> None: ...
+
+
+def set_encryption_config(
+ opts: ParquetFileWriteOptions,
+ config: ParquetEncryptionConfig,
+) -> None: ...
+
+
+def set_decryption_properties(
+ opts: ParquetFragmentScanOptions,
+ config: FileDecryptionProperties,
+) -> None: ...
+
+
+def set_decryption_config(
+ opts: ParquetFragmentScanOptions,
+ config: ParquetDecryptionConfig,
+) -> None: ...
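
A hedged sketch of how these config objects are wired together for dataset-level Parquet encryption (requires a build with encryption support; the KMS client factory and key names are placeholders an application must supply):

    import pyarrow.dataset as ds
    import pyarrow.parquet.encryption as pe

    def kms_client_factory(kms_connection_config):
        # A real application returns a pe.KmsClient subclass implementing
        # wrap_key()/unwrap_key() against its key-management service.
        raise NotImplementedError

    crypto_factory = pe.CryptoFactory(kms_client_factory)
    kms_config = pe.KmsConnectionConfig()
    encryption_config = pe.EncryptionConfiguration(
        footer_key="footer_key_name",                 # illustrative key names
        column_keys={"column_key_name": ["secret_col"]},
    )

    dataset_encryption = ds.ParquetEncryptionConfig(
        crypto_factory, kms_config, encryption_config)
    # Passed at write time via:
    # ds.ParquetFileFormat().make_write_options(encryption_config=dataset_encryption)
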
diff --git a/python/pyarrow-stubs/pyarrow/_feather.pyi b/python/pyarrow-stubs/pyarrow/_feather.pyi
new file mode 100644
index 00000000000..2f4757cd5f1
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_feather.pyi
@@ -0,0 +1,51 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from typing import IO, Literal
+from collections.abc import Sequence
+
+from _typeshed import StrPath
+
+from .lib import Buffer, NativeFile, Table, _Weakrefable
+
+
+class FeatherError(Exception):
+ ...
+
+
+def write_feather(
+ table: Table,
+ dest: StrPath | IO | NativeFile,
+ compression: str | None = None,
+ compression_level: int | None = None,
+ chunksize: int | None = None,
+ version: Literal[1, 2] = 2,
+) -> None: ...
+
+
+class FeatherReader(_Weakrefable):
+ def __init__(
+ self,
+ source: StrPath | IO | NativeFile | Buffer,
+ use_memory_map: bool,
+ use_threads: bool,
+ ) -> None: ...
+ @property
+ def version(self) -> str: ...
+ def read(self) -> Table: ...
+ def read_indices(self, indices: Sequence[int]) -> Table: ...
+ def read_names(self, names: Sequence[str]) -> Table: ...
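
The public pyarrow.feather helpers wrap these declarations; a minimal round trip:

    import os
    import tempfile

    import pyarrow as pa
    import pyarrow.feather as feather

    path = os.path.join(tempfile.mkdtemp(), "example.feather")
    table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})

    feather.write_feather(table, path, compression="uncompressed")
    roundtrip = feather.read_table(path, columns=["a"])
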
diff --git a/python/pyarrow-stubs/pyarrow/_flight.pyi b/python/pyarrow-stubs/pyarrow/_flight.pyi
new file mode 100644
index 00000000000..03d6c6580ab
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_flight.pyi
@@ -0,0 +1,660 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import asyncio
+import enum
+import sys
+
+if sys.version_info >= (3, 11):
+ from typing import Self
+else:
+ from typing_extensions import Self
+from collections.abc import Generator, Iterable, Iterator, Sequence
+from typing import Any, Generic, NamedTuple, TypeVar
+from datetime import datetime
+from typing_extensions import deprecated
+
+from .ipc import _ReadPandasMixin, ReadStats
+from .lib import (
+ ArrowCancelled,
+ ArrowException,
+ ArrowInvalid,
+ Buffer,
+ IpcReadOptions,
+ IpcWriteOptions,
+ RecordBatch,
+ RecordBatchReader,
+ Scalar,
+ Schema,
+ Table,
+ _CRecordBatchWriter,
+ _Weakrefable,
+)
+
+_T = TypeVar("_T")
+
+
+class FlightCallOptions(_Weakrefable):
+ def __init__(
+ self,
+ timeout: float | None = None,
+ write_options: IpcWriteOptions | None = None,
+ headers: list[tuple[str | bytes, str | bytes]] | None = None,
+ read_options: IpcReadOptions | None = None,
+ ) -> None: ...
+
+
+class CertKeyPair(NamedTuple):
+ cert: str | bytes | None
+ key: str | bytes | None
+
+
+class FlightError(Exception):
+ extra_info: bytes
+
+
+class FlightInternalError(FlightError, ArrowException):
+ ...
+
+
+class FlightTimedOutError(FlightError, ArrowException):
+ ...
+
+
+class FlightCancelledError(FlightError, ArrowCancelled):
+ def __init__(self, message: str, *, extra_info: bytes | None = None) -> None: ...
+
+
+class FlightServerError(FlightError, ArrowException):
+ ...
+
+
+class FlightUnauthenticatedError(FlightError, ArrowException):
+ ...
+
+
+class FlightUnauthorizedError(FlightError, ArrowException):
+ ...
+
+
+class FlightUnavailableError(FlightError, ArrowException):
+ ...
+
+
+class FlightWriteSizeExceededError(ArrowInvalid):
+ limit: int
+ actual: int
+
+
+class Action(_Weakrefable):
+ def __init__(
+ self, action_type: bytes | str, buf: Buffer | bytes | None) -> None: ...
+
+ @property
+ def type(self) -> str: ...
+
+ @property
+ def body(self) -> Buffer: ...
+
+ def serialize(self) -> bytes: ...
+
+ @classmethod
+ def deserialize(cls, serialized: bytes) -> Self: ...
+
+
+class ActionType(NamedTuple):
+ type: str
+ description: str
+
+ def make_action(self, buf: Buffer | bytes) -> Action: ...
+
+
+class Result(_Weakrefable):
+ def __init__(self, buf: Buffer | bytes) -> None: ...
+
+ @property
+ def body(self) -> Buffer: ...
+
+ def serialize(self) -> bytes: ...
+
+ @classmethod
+ def deserialize(cls, serialized: bytes) -> Self: ...
+
+
+class BasicAuth(_Weakrefable):
+ def __init__(
+ self, username: str | bytes | None = None, password: str | bytes | None = None
+ ) -> None: ...
+
+ @property
+ def username(self) -> bytes: ...
+ @property
+ def password(self) -> bytes: ...
+ def serialize(self) -> str: ...
+ @staticmethod
+ def deserialize(serialized: str | bytes) -> BasicAuth: ...
+
+
+class DescriptorType(enum.Enum):
+ UNKNOWN = 0
+ PATH = 1
+ CMD = 2
+
+
+class FlightMethod(enum.Enum):
+ INVALID = 0
+ HANDSHAKE = 1
+ LIST_FLIGHTS = 2
+ GET_FLIGHT_INFO = 3
+ GET_SCHEMA = 4
+ DO_GET = 5
+ DO_PUT = 6
+ DO_ACTION = 7
+ LIST_ACTIONS = 8
+ DO_EXCHANGE = 9
+
+
+class FlightDescriptor(_Weakrefable):
+ @staticmethod
+ def for_path(*path: str | bytes) -> FlightDescriptor: ...
+
+ @staticmethod
+ def for_command(command: str | bytes) -> FlightDescriptor: ...
+
+ @property
+ def descriptor_type(self) -> DescriptorType: ...
+
+ @property
+ def path(self) -> list[bytes] | None: ...
+
+ @property
+ def command(self) -> bytes | None: ...
+
+ def serialize(self) -> bytes: ...
+ @classmethod
+ def deserialize(cls, serialized: bytes) -> Self: ...
+
+
+class Ticket(_Weakrefable):
+ def __init__(self, ticket: str | bytes) -> None: ...
+ @property
+ def ticket(self) -> bytes: ...
+ def serialize(self) -> bytes: ...
+ @classmethod
+ def deserialize(cls, serialized: bytes) -> Self: ...
+
+
+class Location(_Weakrefable):
+ def __init__(self, uri: str | bytes) -> None: ...
+ @property
+ def uri(self) -> bytes: ...
+ def equals(self, other: Location) -> bool: ...
+ @staticmethod
+ def for_grpc_tcp(host: str | bytes, port: int) -> Location: ...
+
+ @staticmethod
+ def for_grpc_tls(host: str | bytes, port: int) -> Location: ...
+
+ @staticmethod
+ def for_grpc_unix(path: str | bytes) -> Location: ...
+
+
+class FlightEndpoint(_Weakrefable):
+ def __init__(
+ self,
+ ticket: Ticket | str | bytes | object,
+ locations: list[str | bytes | Location | object],
+ expiration_time: Scalar[Any] | str | datetime | None = ...,
+ app_metadata: bytes | str | object = ...,
+ ): ...
+
+ @property
+ def ticket(self) -> Ticket: ...
+
+ @property
+ def locations(self) -> list[Location]: ...
+
+ def serialize(self) -> bytes: ...
+ @property
+ def expiration_time(self) -> Scalar[Any] | None: ...
+
+ @property
+ def app_metadata(self) -> bytes | str: ...
+
+ @classmethod
+ def deserialize(cls, serialized: bytes) -> Self: ...
+
+
+class SchemaResult(_Weakrefable):
+ def __init__(self, schema: Schema) -> None: ...
+
+ @property
+ def schema(self) -> Schema: ...
+
+ def serialize(self) -> bytes: ...
+ @classmethod
+ def deserialize(cls, serialized: bytes) -> Self: ...
+
+
+class FlightInfo(_Weakrefable):
+ def __init__(
+ self,
+ schema: Schema | None,
+ descriptor: FlightDescriptor,
+ endpoints: list[FlightEndpoint],
+ total_records: int | None = ...,
+ total_bytes: int | None = ...,
+ ordered: bool = ...,
+ app_metadata: bytes | str = ...,
+ ) -> None: ...
+
+ @property
+ def schema(self) -> Schema | None: ...
+
+ @property
+ def descriptor(self) -> FlightDescriptor: ...
+
+ @property
+ def endpoints(self) -> list[FlightEndpoint]: ...
+
+ @property
+ def total_records(self) -> int: ...
+
+ @property
+ def total_bytes(self) -> int: ...
+
+ @property
+ def ordered(self) -> bool: ...
+
+ @property
+ def app_metadata(self) -> bytes | str: ...
+
+ def serialize(self) -> bytes: ...
+ @classmethod
+ def deserialize(cls, serialized: bytes) -> Self: ...
+
+
+class FlightStreamChunk(_Weakrefable):
+ @property
+ def data(self) -> RecordBatch | None: ...
+ @property
+ def app_metadata(self) -> Buffer | None: ...
+ def __iter__(self): ...
+
+
+class _MetadataRecordBatchReader(_Weakrefable, _ReadPandasMixin):
+ # Needs to be separate class so the "real" class can subclass the
+ # pure-Python mixin class
+
+ def __iter__(self) -> Self: ...
+ def __next__(self) -> FlightStreamChunk: ...
+ @property
+ def schema(self) -> Schema: ...
+
+ def read_all(self) -> Table: ...
+
+ def read_chunk(self) -> FlightStreamChunk: ...
+
+ def to_reader(self) -> RecordBatchReader: ...
+
+
+class MetadataRecordBatchReader(_MetadataRecordBatchReader):
+ @property
+ def stats(self) -> ReadStats: ...
+
+
+class FlightStreamReader(MetadataRecordBatchReader):
+ @property
+ def stats(self) -> ReadStats: ...
+
+ def cancel(self) -> None: ...
+
+ def read_all(self) -> Table: ...
+
+ def read(self) -> RecordBatch | None: ...
+
+
+class MetadataRecordBatchWriter(_CRecordBatchWriter):
+ def begin(self, schema: Schema, options: IpcWriteOptions | None = None) -> None: ...
+
+ def write_metadata(self, buf: Buffer | bytes) -> None: ...
+
+ def write_batch(self, batch: RecordBatch) -> None: ... # type: ignore[override]
+
+ def write_table(self, table: Table, max_chunksize: int |
+ None = None, **kwargs) -> None: ...
+
+ def close(self) -> None: ...
+
+ def write_with_metadata(self, batch: RecordBatch, buf: Buffer | bytes) -> None: ...
+
+
+class FlightStreamWriter(MetadataRecordBatchWriter):
+ def done_writing(self) -> None: ...
+
+
+class FlightMetadataReader(_Weakrefable):
+ def read(self) -> Buffer | None: ...
+
+
+class FlightMetadataWriter(_Weakrefable):
+ def write(self, message: Buffer) -> None: ...
+
+
+class AsyncioCall(Generic[_T]):
+ _future: asyncio.Future[_T]
+
+ def as_awaitable(self) -> asyncio.Future[_T]: ...
+ def wakeup(self, result_or_exception: BaseException | _T) -> None: ...
+
+
+class AsyncioFlightClient:
+ def __init__(self, client: FlightClient) -> None: ...
+
+ async def get_flight_info(
+ self,
+ descriptor: FlightDescriptor,
+ *,
+ options: FlightCallOptions | None = None,
+ ): ...
+
+
+class FlightClient(_Weakrefable):
+ def __init__(
+ self,
+ location: str | tuple[str, int] | Location,
+ *,
+ tls_root_certs: str | None = None,
+ cert_chain: str | None = None,
+ private_key: str | None = None,
+ override_hostname: str | None = None,
+ middleware: list[ClientMiddlewareFactory] | None = None,
+ write_size_limit_bytes: int | None = None,
+ disable_server_verification: bool = False,
+ generic_options: list[tuple[str, int | str]] | None = None,
+ ): ...
+
+ @property
+ def supports_async(self) -> bool: ...
+ def as_async(self) -> AsyncioFlightClient: ...
+ def wait_for_available(self, timeout: int = 5) -> None: ...
+
+ @classmethod
+ @deprecated(
+ "Use the ``FlightClient`` constructor or "
+ "``pyarrow.flight.connect`` function instead."
+ )
+ def connect(
+ cls,
+ location: str | tuple[str, int] | Location,
+ tls_root_certs: str | None = None,
+ cert_chain: str | None = None,
+ private_key: str | None = None,
+ override_hostname: str | None = None,
+ disable_server_verification: bool = False,
+ ) -> FlightClient: ...
+
+ def authenticate(
+ self, auth_handler: ClientAuthHandler, options: FlightCallOptions | None = None
+ ) -> None: ...
+
+ def authenticate_basic_token(
+ self, username: str | bytes, password: str | bytes,
+ options: FlightCallOptions | None = None
+ ) -> tuple[str, str]: ...
+
+ def list_actions(self, options: FlightCallOptions |
+ None = None) -> list[Action]: ...
+
+ def do_action(
+ self, action: Action | tuple[bytes | str, bytes | str] | str,
+ options: FlightCallOptions | None = None
+ ) -> Iterator[Result]: ...
+
+ def list_flights(
+ self, criteria: str | bytes | None = None,
+ options: FlightCallOptions | None = None
+ ) -> Generator[FlightInfo, None, None]: ...
+
+ def get_flight_info(
+ self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None
+ ) -> FlightInfo: ...
+
+ def get_schema(
+ self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None
+ ) -> SchemaResult: ...
+
+ def do_get(
+ self, ticket: Ticket, options: FlightCallOptions | None = None
+ ) -> FlightStreamReader: ...
+
+ def do_put(
+ self,
+ descriptor: FlightDescriptor,
+ schema: Schema | None,
+ options: FlightCallOptions | None = None,
+ ) -> tuple[FlightStreamWriter, FlightStreamReader]: ...
+
+ def do_exchange(
+ self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None
+ ) -> tuple[FlightStreamWriter, FlightStreamReader]: ...
+
+ def close(self) -> None: ...
+
+ def __enter__(self) -> Self: ...
+ def __exit__(self, exc_type, exc_value, traceback) -> None: ...
+
+
+class FlightDataStream(_Weakrefable):
+ ...
+
+
+class RecordBatchStream(FlightDataStream):
+ def __init__(self, data_source: RecordBatchReader | Table | None = None,
+ options: IpcWriteOptions | None = None) -> None: ...
+
+
+class GeneratorStream(FlightDataStream):
+ def __init__(
+ self,
+ schema: Schema,
+ generator: Iterable[
+ FlightDataStream
+ | Table
+ | RecordBatch
+ | RecordBatchReader
+ | tuple[RecordBatch, bytes]
+ ],
+ options: IpcWriteOptions | None = None,
+ ) -> None: ...
+
+
+class ServerCallContext(_Weakrefable):
+ def peer_identity(self) -> bytes: ...
+
+ def peer(self) -> str: ...
+
+ # Set safe=True as gRPC on Windows sometimes gives garbage bytes
+ def is_cancelled(self) -> bool: ...
+
+ def add_header(self, key: str, value: str) -> None: ...
+
+ def add_trailer(self, key: str, value: str) -> None: ...
+
+ def get_middleware(self, key: str) -> ServerMiddleware | None: ...
+
+
+class ServerAuthReader(_Weakrefable):
+ def read(self) -> str: ...
+
+
+class ServerAuthSender(_Weakrefable):
+ def write(self, message: str) -> None: ...
+
+
+class ClientAuthReader(_Weakrefable):
+ def read(self) -> str: ...
+
+
+class ClientAuthSender(_Weakrefable):
+ def write(self, message: str) -> None: ...
+
+
+class ServerAuthHandler(_Weakrefable):
+ def authenticate(self, outgoing: ServerAuthSender, incoming: ServerAuthReader): ...
+
+ def is_valid(self, token: str) -> bool: ...
+
+
+class ClientAuthHandler(_Weakrefable):
+ def authenticate(self, outgoing: ClientAuthSender, incoming: ClientAuthReader): ...
+
+ def get_token(self) -> str: ...
+
+
+class CallInfo(NamedTuple):
+ method: FlightMethod
+
+
+class ClientMiddlewareFactory(_Weakrefable):
+ def start_call(self, info: CallInfo) -> ClientMiddleware | None: ...
+
+
+class ClientMiddleware(_Weakrefable):
+ def sending_headers(self) -> dict[str, list[str] | list[bytes]]: ...
+
+ def received_headers(self, headers: dict[str, list[str] | list[bytes]]): ...
+
+ def call_completed(self, exception: ArrowException): ...
+
+
+class ServerMiddlewareFactory(_Weakrefable):
+ def start_call(
+ self, info: CallInfo, headers: dict[str, list[str] | list[bytes]]
+ ) -> ServerMiddleware | None: ...
+
+
+class TracingServerMiddlewareFactory(ServerMiddlewareFactory):
+ ...
+
+
+class ServerMiddleware(_Weakrefable):
+ def sending_headers(self) -> dict[str, list[str] | list[bytes]]: ...
+
+ def call_completed(self, exception: ArrowException): ...
+
+ @property
+ def trace_context(self) -> dict: ...
+
+
+class TracingServerMiddleware(ServerMiddleware):
+ trace_context: dict
+ def __init__(self, trace_context: dict) -> None: ...
+
+
+class _ServerMiddlewareFactoryWrapper(ServerMiddlewareFactory):
+ def __init__(self, factories: dict[str, ServerMiddlewareFactory]) -> None: ...
+
+ def start_call( # type: ignore[override]
+ self, info: CallInfo, headers: dict[str, list[str] | list[bytes]]
+ ) -> _ServerMiddlewareFactoryWrapper | None: ...
+
+
+class _ServerMiddlewareWrapper(ServerMiddleware):
+ def __init__(self, middleware: dict[str, ServerMiddleware]) -> None: ...
+ def send_headers(self) -> dict[str, dict[str, list[str] | list[bytes]]]: ...
+ def call_completed(self, exception: ArrowException) -> None: ...
+
+
+class _FlightServerFinalizer(_Weakrefable):
+
+ def finalize(self) -> None: ...
+
+
+class FlightServerBase(_Weakrefable):
+ def __init__(
+ self,
+ location: str | tuple[str, int] | Location | None = None,
+ auth_handler: ServerAuthHandler | None = None,
+ tls_certificates: list[tuple[str, str]] | None = None,
+ verify_client: bool = False,
+ root_certificates: str | None = None,
+ middleware: dict[str, ServerMiddlewareFactory] | None = None,
+ ): ...
+
+ @property
+ def port(self) -> int: ...
+
+ def list_flights(self, context: ServerCallContext,
+ criteria: str) -> Iterator[FlightInfo]: ...
+
+ def get_flight_info(
+ self, context: ServerCallContext, descriptor: FlightDescriptor
+ ) -> FlightInfo: ...
+
+ def get_schema(self, context: ServerCallContext,
+ descriptor: FlightDescriptor) -> Schema: ...
+
+ def do_put(
+ self,
+ context: ServerCallContext,
+ descriptor: FlightDescriptor,
+ reader: MetadataRecordBatchReader,
+ writer: FlightMetadataWriter,
+ ) -> None: ...
+
+ def do_get(self, context: ServerCallContext,
+ ticket: Ticket) -> FlightDataStream: ...
+
+ def do_exchange(
+ self,
+ context: ServerCallContext,
+ descriptor: FlightDescriptor,
+ reader: MetadataRecordBatchReader,
+ writer: MetadataRecordBatchWriter,
+ ) -> None: ...
+
+ def list_actions(self, context: ServerCallContext) -> Iterable[Action]: ...
+
+ def do_action(self, context: ServerCallContext,
+ action: Action) -> Iterable[bytes]: ...
+
+ def serve(self) -> None: ...
+
+ def run(self) -> None: ...
+
+ def shutdown(self) -> None: ...
+
+ def wait(self) -> None: ...
+
+ def __enter__(self) -> Self: ...
+ def __exit__(
+ self, exc_type: object, exc_value: object, traceback: object) -> None: ...
+
+
+def connect(
+ location: str | tuple[str, int] | Location,
+ *,
+ tls_root_certs: str | None = None,
+ cert_chain: str | None = None,
+ private_key: str | None = None,
+ override_hostname: str | None = None,
+ middleware: list[ClientMiddlewareFactory] | None = None,
+ write_size_limit_bytes: int | None = None,
+ disable_server_verification: bool = False,
+ generic_options: Sequence[tuple[str, int | str]] | None = None,
+) -> FlightClient: ...
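
A minimal server/client round trip exercising the Flight classes above (illustration only; the server class and table are placeholders, not part of this diff):

    import pyarrow as pa
    import pyarrow.flight as flight

    class StaticTableServer(flight.FlightServerBase):
        """Serves the same small table for every ticket."""

        def do_get(self, context, ticket):
            return flight.RecordBatchStream(pa.table({"x": [1, 2, 3]}))

    # Port 0 lets gRPC pick a free port; the server listens once constructed.
    server = StaticTableServer("grpc://127.0.0.1:0")
    client = flight.connect(f"grpc://127.0.0.1:{server.port}")
    table = client.do_get(flight.Ticket(b"ignored")).read_all()
    client.close()
    server.shutdown()
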
diff --git a/python/pyarrow-stubs/pyarrow/_fs.pyi b/python/pyarrow-stubs/pyarrow/_fs.pyi
new file mode 100644
index 00000000000..caf23a75d99
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_fs.pyi
@@ -0,0 +1,234 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import datetime as dt
+import enum
+import sys
+
+from abc import ABC, abstractmethod
+from _typeshed import StrPath
+
+if sys.version_info >= (3, 11):
+ from typing import Self
+else:
+ from typing_extensions import Self
+if sys.version_info >= (3, 10):
+ from typing import TypeAlias
+else:
+ from typing_extensions import TypeAlias
+
+from fsspec import AbstractFileSystem # type: ignore
+
+from .lib import NativeFile, _Weakrefable
+
+
+class FileType(enum.IntFlag):
+ NotFound = enum.auto()
+ Unknown = enum.auto()
+ File = enum.auto()
+ Directory = enum.auto()
+
+
+class FileInfo(_Weakrefable):
+ def __init__(
+ self,
+ path: str,
+ type: FileType = FileType.Unknown,
+ *,
+ mtime: dt.datetime | float | None = None,
+ mtime_ns: int | None = None,
+ size: int | None = None,
+ ): ...
+
+ def __getitem__(self, index: int) -> FileInfo: ...

+
+ @property
+ def type(self) -> FileType: ...
+
+ @property
+ def is_file(self) -> bool: ...
+ @property
+ def path(self) -> str: ...
+
+ @property
+ def base_name(self) -> str: ...
+
+ @property
+ def size(self) -> int: ...
+
+ @property
+ def extension(self) -> str: ...
+
+ @property
+ def mtime(self) -> dt.datetime | None: ...
+
+ @property
+ def mtime_ns(self) -> int | None: ...
+
+
+class FileSelector(_Weakrefable):
+ base_dir: str
+ allow_not_found: bool
+ recursive: bool
+ def __init__(self, base_dir: str, allow_not_found: bool = False,
+ recursive: bool = False): ...
+
+
+class FileSystem(_Weakrefable):
+ @classmethod
+ def from_uri(cls, uri: str | StrPath) -> tuple[Self, str]: ...
+
+ def equals(self, other: FileSystem | object) -> bool: ...
+
+ @property
+ def type_name(self) -> str: ...
+
+ def get_file_info(
+ self, paths_or_selector: str | list[str] | FileSelector
+ ) -> list[FileInfo] | FileInfo: ...
+
+ def create_dir(self, path: str, *, recursive: bool = True) -> None: ...
+
+ def delete_dir(self, path: str) -> None: ...
+
+ def delete_dir_contents(
+ self, path: str, *, accept_root_dir: bool = False, missing_dir_ok: bool = False
+ ) -> None: ...
+
+ def move(self, src: str, dest: str) -> None: ...
+
+ def copy_file(self, src: str, dest: str) -> None: ...
+
+ def delete_file(self, path: str) -> None: ...
+
+ def open_input_file(self, path: str) -> NativeFile: ...
+
+ def open_input_stream(
+ self,
+ path: str,
+ compression: str | None = "detect",
+ buffer_size: int | None = None) -> NativeFile: ...
+
+ def open_output_stream(
+ self,
+ path: str,
+ compression: str | None = "detect",
+ buffer_size: int | None = None,
+ metadata: dict[str, str] | None = None,
+ ) -> NativeFile: ...
+
+ def open_append_stream(
+ self,
+ path: str,
+ compression: str | None = "detect",
+ buffer_size: int | None = None,
+ metadata: dict[str, str] | None = None,
+ ) -> NativeFile: ...
+
+ def normalize_path(self, path: str) -> str: ...
+
+
+class LocalFileSystem(FileSystem):
+ def __init__(self, *, use_mmap: bool = False) -> None: ...
+
+
+class SubTreeFileSystem(FileSystem):
+ def __init__(self, base_path: str, base_fs: FileSystem): ...
+ @property
+ def base_path(self) -> str: ...
+ @property
+ def base_fs(self) -> FileSystem: ...
+
+
+class _MockFileSystem(FileSystem):
+ def __init__(self, current_time: dt.datetime | None = None) -> None: ...
+
+
+class PyFileSystem(FileSystem):
+ def __init__(self, handler: FileSystemHandler | None) -> None: ...
+ @property
+ def handler(self) -> FileSystemHandler: ...
+
+
+class FileSystemHandler(ABC):
+ @abstractmethod
+ def get_type_name(self) -> str: ...
+
+ @abstractmethod
+ def get_file_info(self, paths: str | list[str]) -> FileInfo | list[FileInfo]: ...
+
+ @abstractmethod
+ def get_file_info_selector(self, selector: FileSelector) -> list[FileInfo]: ...
+
+ @abstractmethod
+ def create_dir(self, path: str, recursive: bool) -> None: ...
+
+ @abstractmethod
+ def delete_dir(self, path: str) -> None: ...
+
+ @abstractmethod
+ def delete_dir_contents(self, path: str, missing_dir_ok: bool = False) -> None: ...
+
+ @abstractmethod
+ def delete_root_dir_contents(self) -> None: ...
+
+ @abstractmethod
+ def delete_file(self, path: str) -> None: ...
+
+ @abstractmethod
+ def move(self, src: str, dest: str) -> None: ...
+
+ @abstractmethod
+ def copy_file(self, src: str, dest: str) -> None: ...
+
+ @abstractmethod
+ def open_input_stream(self, path: str) -> NativeFile: ...
+
+ @abstractmethod
+ def open_input_file(self, path: str) -> NativeFile: ...
+
+ @abstractmethod
+ def open_output_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: ...
+
+ @abstractmethod
+ def open_append_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: ...
+
+ @abstractmethod
+ def normalize_path(self, path: str) -> str: ...
+
+
+SupportedFileSystem: TypeAlias = AbstractFileSystem | FileSystem
+
+
+def _copy_files(
+ source_fs: FileSystem,
+ source_path: str,
+ destination_fs: SupportedFileSystem | None,
+ destination_path: str,
+ chunk_size: int = 1048576,
+ use_threads: bool = True,
+) -> None: ...
+
+
+def _copy_files_selector(
+ source_fs: FileSystem,
+ source_sel: FileSelector,
+ destination_fs: SupportedFileSystem | None,
+ destination_base_dir: str,
+ chunk_size: int = 1048576,
+ use_threads: bool = True,
+) -> None: ...
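Editor's note: a quick usage sketch for the filesystem annotations above; the local path is a placeholder.

    from pyarrow import fs

    filesystem, path = fs.FileSystem.from_uri("file:///tmp/example.bin")
    infos = filesystem.get_file_info([path])          # list[FileInfo] | FileInfo
    if isinstance(infos, list) and infos[0].type == fs.FileType.File:
        with filesystem.open_input_stream(path) as stream:   # NativeFile
            payload = stream.read()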
diff --git a/python/pyarrow-stubs/pyarrow/_gcsfs.pyi b/python/pyarrow-stubs/pyarrow/_gcsfs.pyi
new file mode 100644
index 00000000000..a0af3fa3871
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_gcsfs.pyi
@@ -0,0 +1,43 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import datetime as dt
+
+from ._fs import FileSystem
+from .lib import KeyValueMetadata
+
+
+class GcsFileSystem(FileSystem):
+ def __init__(
+ self,
+ *,
+ anonymous: bool = False,
+ access_token: str | None = None,
+ target_service_account: str | None = None,
+ credential_token_expiration: dt.datetime | None = None,
+ default_bucket_location: str = "US",
+ scheme: str = "https",
+ endpoint_override: str | None = None,
+ default_metadata: dict | KeyValueMetadata | None = None,
+ retry_time_limit: dt.timedelta | None = None,
+ project_id: str | None = None,
+ ): ...
+ @property
+ def default_bucket_location(self) -> str: ...
+
+ @property
+ def project_id(self) -> str: ...
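Editor's note: for reference, an anonymous client built from the constructor annotated above; the bucket location shown is the documented default.

    from pyarrow import fs

    gcs = fs.GcsFileSystem(anonymous=True, default_bucket_location="US")
    print(gcs.default_bucket_location)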
diff --git a/python/pyarrow-stubs/pyarrow/_hdfs.pyi b/python/pyarrow-stubs/pyarrow/_hdfs.pyi
new file mode 100644
index 00000000000..370eaf70927
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_hdfs.pyi
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from _typeshed import StrPath
+
+from ._fs import FileSystem
+
+
+class HadoopFileSystem(FileSystem):
+ def __init__(
+ self,
+ host: str | None = None,
+ port: int = 8020,
+ *,
+ user: str | None = None,
+ replication: int = 3,
+ buffer_size: int = 0,
+ default_block_size: int | None = None,
+ kerb_ticket: StrPath | None = None,
+ extra_conf: dict | None = None,
+ ): ...
+ @staticmethod
+ def from_uri(uri: str) -> HadoopFileSystem: ... # type: ignore[override]
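Editor's note: typical construction matching the parameters annotated above, assuming a reachable HDFS cluster and libhdfs at runtime; the host name and user are placeholders.

    from pyarrow import fs

    hdfs = fs.HadoopFileSystem(host="namenode.example", port=8020, user="analytics")
    # Equivalent URI form handled by the from_uri override above:
    hdfs2 = fs.HadoopFileSystem.from_uri("hdfs://namenode.example:8020/?user=analytics")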
diff --git a/python/pyarrow-stubs/pyarrow/_ipc.pyi b/python/pyarrow-stubs/pyarrow/_ipc.pyi
new file mode 100644
index 00000000000..5a87f243904
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_ipc.pyi
@@ -0,0 +1,317 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import enum
+import sys
+
+from io import IOBase
+
+if sys.version_info >= (3, 11):
+ from typing import Self
+else:
+ from typing_extensions import Self
+
+from collections.abc import Iterable, Iterator, Mapping
+from typing import Any, Literal, NamedTuple
+
+import pandas as pd
+
+from pyarrow._stubs_typing import SupportPyBuffer
+from pyarrow.lib import MemoryPool, RecordBatch, Schema, Table, Tensor, _Weakrefable
+
+from .io import Buffer, Codec, NativeFile, BufferReader
+from ._types import DictionaryMemo, KeyValueMetadata
+
+
+class MetadataVersion(enum.IntEnum):
+ V1 = enum.auto()
+ V2 = enum.auto()
+ V3 = enum.auto()
+ V4 = enum.auto()
+ V5 = enum.auto()
+
+
+class Alignment(enum.IntEnum):
+ Any = enum.auto()
+ At64Byte = enum.auto()
+ DataTypeSpecific = enum.auto()
+
+
+class WriteStats(NamedTuple):
+ num_messages: int
+ num_record_batches: int
+ num_dictionary_batches: int
+ num_dictionary_deltas: int
+ num_replaced_dictionaries: int
+
+
+class ReadStats(NamedTuple):
+ num_messages: int
+ num_record_batches: int
+ num_dictionary_batches: int
+ num_dictionary_deltas: int
+ num_replaced_dictionaries: int
+
+
+class IpcReadOptions(_Weakrefable):
+ ensure_native_endian: bool
+ use_threads: bool
+ ensure_alignment: Alignment
+ included_fields: list[int] | None
+
+ def __init__(
+ self,
+ *,
+ ensure_native_endian: bool = True,
+ use_threads: bool = True,
+ ensure_alignment: Alignment = ...,
+ included_fields: list[int] | None = None,
+ ) -> None: ...
+
+
+class IpcWriteOptions(_Weakrefable):
+ metadata_version: Any
+ allow_64bit: bool
+ use_legacy_format: bool
+ compression: Any
+ use_threads: bool
+ emit_dictionary_deltas: bool
+ unify_dictionaries: bool
+
+ def __init__(
+ self,
+ *,
+ metadata_version: MetadataVersion = MetadataVersion.V5,
+ allow_64bit: bool = False,
+ use_legacy_format: bool = False,
+ compression: Codec | Literal["lz4", "zstd"] | None = None,
+ use_threads: bool = True,
+ emit_dictionary_deltas: bool = False,
+ unify_dictionaries: bool = False,
+ ) -> None: ...
+
+
+class Message(_Weakrefable):
+ @property
+ def type(self) -> str: ...
+ @property
+ def metadata(self) -> Buffer: ...
+ @property
+ def metadata_version(self) -> MetadataVersion: ...
+ @property
+ def body(self) -> Buffer | None: ...
+ def equals(self, other: Message) -> bool: ...
+
+ def serialize_to(self, sink: NativeFile, alignment: int = 8,
+ memory_pool: MemoryPool | None = None): ...
+
+ def serialize(self, alignment: int = 8, memory_pool: MemoryPool |
+ None = None) -> Buffer: ...
+
+
+class MessageReader(_Weakrefable):
+ @classmethod
+ def open_stream(cls, source: bytes | NativeFile |
+ IOBase | SupportPyBuffer) -> Self: ...
+
+ def __iter__(self) -> Self: ...
+ def read_next_message(self) -> Message: ...
+
+ __next__ = read_next_message
+
+# ----------------------------------------------------------------------
+# File and stream readers and writers
+
+
+class _CRecordBatchWriter(_Weakrefable):
+ def write(self, table_or_batch: Table | RecordBatch): ...
+
+ def write_batch(
+ self,
+ batch: RecordBatch,
+ custom_metadata: Mapping[bytes, bytes] | KeyValueMetadata | None = None,
+ ): ...
+
+ def write_table(self, table: Table, max_chunksize: int | None = None) -> None: ...
+
+ def close(self) -> None: ...
+
+ def __enter__(self) -> Self: ...
+ def __exit__(self, exc_type, exc_val, exc_tb): ...
+ @property
+ def stats(self) -> WriteStats: ...
+
+
+class _RecordBatchStreamWriter(_CRecordBatchWriter):
+ @property
+ def _use_legacy_format(self) -> bool: ...
+ @property
+ def _metadata_version(self) -> MetadataVersion: ...
+
+ def _open(
+ self,
+ sink,
+ schema: Schema,
+ options: IpcWriteOptions = IpcWriteOptions(), # noqa: Y011
+ metadata: dict[bytes, bytes] | None = None,
+ ): ...
+
+
+class _ReadPandasMixin:
+ def read_pandas(self, **options) -> pd.DataFrame: ...
+
+
+class RecordBatchReader(_ReadPandasMixin, _Weakrefable):
+ def __iter__(self) -> Self: ...
+ def read_next_batch(self) -> RecordBatch: ...
+
+ __next__ = read_next_batch
+ @property
+ def schema(self) -> Schema: ...
+
+ def read_next_batch_with_custom_metadata(self) -> RecordBatchWithMetadata: ...
+
+ def iter_batches_with_custom_metadata(
+ self,
+ ) -> Iterator[RecordBatchWithMetadata]: ...
+
+ def read_all(self) -> Table: ...
+
+ def close(self) -> None: ...
+
+ def __enter__(self) -> Self: ...
+ def __exit__(self, exc_type, exc_val, exc_tb): ...
+ def cast(self, target_schema: Schema) -> Self: ...
+
+ def _export_to_c(self, out_ptr: int) -> None: ...
+
+ @classmethod
+ def _import_from_c(cls, in_ptr: int) -> Self: ...
+
+ def __arrow_c_stream__(self, requested_schema=None): ...
+
+ @classmethod
+ def _import_from_c_capsule(cls, stream) -> Self: ...
+
+ @classmethod
+ def from_stream(cls, data: Any,
+ schema: Any = None) -> Self: ...
+
+ @classmethod
+ def from_batches(cls, schema: Any, batches: Iterable[RecordBatch]) -> Self: ...
+
+
+class _RecordBatchStreamReader(RecordBatchReader):
+ @property
+ def stats(self) -> ReadStats: ...
+
+ def _open(
+ self,
+ source,
+ options: IpcReadOptions | None = None,
+ memory_pool: MemoryPool | None = None,
+ ) -> Self: ...
+
+
+class _RecordBatchFileWriter(_RecordBatchStreamWriter):
+ ...
+
+
+class RecordBatchWithMetadata(NamedTuple):
+ batch: RecordBatch
+ custom_metadata: KeyValueMetadata
+
+
+class _RecordBatchFileReader(_ReadPandasMixin, _Weakrefable):
+ @property
+ def num_record_batches(self) -> int: ...
+
+ def get_batch(self, i: int) -> RecordBatch: ...
+
+ get_record_batch = get_batch
+ def get_batch_with_custom_metadata(self, i: int) -> RecordBatchWithMetadata: ...
+
+ def read_all(self) -> Table: ...
+
+ def __enter__(self) -> Self: ...
+ def __exit__(self, exc_type, exc_val, exc_tb): ...
+ @property
+ def schema(self) -> Schema: ...
+ @property
+ def stats(self) -> ReadStats: ...
+ @property
+ def metadata(self) -> KeyValueMetadata | None: ...
+
+ def _open(
+ self,
+ source,
+ footer_offset: int | None = None,
+ options: IpcReadOptions | None = None,
+ memory_pool: MemoryPool | None = None,
+ ) -> Self: ...
+
+
+def get_tensor_size(tensor: Tensor) -> int: ...
+
+
+def get_record_batch_size(batch: RecordBatch) -> int: ...
+
+
+def write_tensor(tensor: Tensor, dest: NativeFile) -> int: ...
+
+
+def read_tensor(source: NativeFile) -> Tensor: ...
+
+
+def read_message(source: NativeFile | IOBase | SupportPyBuffer) -> Message: ...
+
+
+def read_schema(obj: Buffer | Message | BufferReader, dictionary_memo: DictionaryMemo |
+ None = None) -> Schema: ...
+
+
+def read_record_batch(
+ obj: Message | SupportPyBuffer,
+ schema: Schema,
+ dictionary_memo: DictionaryMemo | None = None) -> RecordBatch: ...
+
+
+__all__ = [
+ "MetadataVersion",
+ "Alignment",
+ "WriteStats",
+ "ReadStats",
+ "IpcReadOptions",
+ "IpcWriteOptions",
+ "Message",
+ "MessageReader",
+ "_CRecordBatchWriter",
+ "_RecordBatchStreamWriter",
+ "_ReadPandasMixin",
+ "RecordBatchReader",
+ "_RecordBatchStreamReader",
+ "_RecordBatchFileWriter",
+ "RecordBatchWithMetadata",
+ "_RecordBatchFileReader",
+ "get_tensor_size",
+ "get_record_batch_size",
+ "write_tensor",
+ "read_tensor",
+ "read_message",
+ "read_schema",
+ "read_record_batch",
+]
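Editor's note: the classes above are surfaced through the public pyarrow.ipc helpers (new_stream/open_stream); a minimal round trip that should satisfy these annotations. The compression choice is illustrative.

    import pyarrow as pa
    import pyarrow.ipc as ipc

    table = pa.table({"x": [1, 2, 3]})
    sink = pa.BufferOutputStream()
    with ipc.new_stream(sink, table.schema,
                        options=ipc.IpcWriteOptions(compression="zstd")) as writer:
        writer.write_table(table)

    reader = ipc.open_stream(sink.getvalue())
    result = reader.read_all()
    print(result.num_rows, reader.stats.num_record_batches)  # ReadStats after reading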
diff --git a/python/pyarrow-stubs/pyarrow/_json.pyi b/python/pyarrow-stubs/pyarrow/_json.pyi
new file mode 100644
index 00000000000..bae2ff404f0
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_json.pyi
@@ -0,0 +1,66 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from typing import IO, Any, Literal
+
+from _typeshed import StrPath
+
+from .lib import MemoryPool, RecordBatchReader, Schema, Table, _Weakrefable
+
+
+class ReadOptions(_Weakrefable):
+ use_threads: bool
+ block_size: int
+
+ def __init__(self, use_threads: bool | None = None,
+ block_size: int | None = None): ...
+
+ def equals(self, other: ReadOptions) -> bool: ...
+
+
+class ParseOptions(_Weakrefable):
+ explicit_schema: Schema
+ newlines_in_values: bool
+ unexpected_field_behavior: Literal["ignore", "error", "infer"]
+
+ def __init__(
+ self,
+ explicit_schema: Schema | None = None,
+ newlines_in_values: bool | None = None,
+ unexpected_field_behavior: Literal["ignore", "error", "infer"] = "infer",
+ ): ...
+ def equals(self, other: ParseOptions) -> bool: ...
+
+
+class JSONStreamingReader(RecordBatchReader):
+ ...
+
+
+def read_json(
+ input_file: StrPath | IO[Any],
+ read_options: ReadOptions | None = None,
+ parse_options: ParseOptions | None = None,
+ memory_pool: MemoryPool | None = None,
+) -> Table: ...
+
+
+def open_json(
+ input_file: StrPath | IO[Any],
+ read_options: ReadOptions | None = None,
+ parse_options: ParseOptions | None = None,
+ memory_pool: MemoryPool | None = None,
+) -> JSONStreamingReader: ...
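Editor's note: a short sketch against the reader annotated above; the file name is a placeholder.

    from pyarrow import json

    opts = json.ParseOptions(unexpected_field_behavior="ignore")
    table = json.read_json("records.jsonl", parse_options=opts)
    print(table.schema)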
diff --git a/python/pyarrow-stubs/pyarrow/_orc.pyi b/python/pyarrow-stubs/pyarrow/_orc.pyi
new file mode 100644
index 00000000000..faa0f57c1fd
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_orc.pyi
@@ -0,0 +1,77 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from typing import IO, Literal, Any
+
+from .lib import (
+ Buffer,
+ KeyValueMetadata,
+ MemoryPool,
+ NativeFile,
+ RecordBatch,
+ Schema,
+ Table,
+ _Weakrefable,
+)
+
+
+class ORCReader(_Weakrefable):
+ def __init__(self, memory_pool: MemoryPool | None = None) -> None: ...
+ def open(self, source: str | NativeFile | Buffer, use_memory_map: bool = True): ...
+ def metadata(self) -> KeyValueMetadata: ...
+ def schema(self) -> Schema: ...
+ def nrows(self) -> int: ...
+ def nstripes(self) -> int: ...
+ def file_version(self) -> str: ...
+ def software_version(self) -> str: ...
+ def compression(self) -> Literal["UNCOMPRESSED",
+ "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: ...
+
+ def compression_size(self) -> int: ...
+ def row_index_stride(self) -> int: ...
+ def writer(self) -> str: ...
+ def writer_version(self) -> str: ...
+ def nstripe_statistics(self) -> int: ...
+ def content_length(self) -> int: ...
+ def stripe_statistics_length(self) -> int: ...
+ def file_footer_length(self) -> int: ...
+ def file_postscript_length(self) -> int: ...
+ def file_length(self) -> int: ...
+ def serialized_file_tail(self) -> int: ...
+ def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: ...
+ def read(self, columns: list[str] | None = None) -> Table: ...
+
+
+class ORCWriter(_Weakrefable):
+ def open(
+ self,
+ where: str | NativeFile | IO,
+ *,
+ file_version: str | None = None,
+ batch_size: int | None = None,
+ stripe_size: int | None = None,
+ compression: Any = 'UNCOMPRESSED',
+ compression_block_size: int | None = None,
+ compression_strategy: Any = 'SPEED',
+ row_index_stride: int | None = None,
+ padding_tolerance: float | None = None,
+ dictionary_key_size_threshold: float | None = None,
+ bloom_filter_columns: list[int] | None = None,
+ bloom_filter_fpp: float | None = None,
+ ) -> None: ...
+ def write(self, table: Table) -> None: ...
+ def close(self) -> None: ...
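Editor's note: ORCReader/ORCWriter back the public pyarrow.orc module; a user-level sketch, assuming pyarrow was built with ORC support. The file name and compression choice are illustrative.

    import pyarrow as pa
    import pyarrow.orc as orc

    table = pa.table({"x": [1, 2, 3]})
    orc.write_table(table, "data.orc", compression="ZSTD")
    print(orc.ORCFile("data.orc").schema)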
diff --git a/python/pyarrow-stubs/pyarrow/_parquet.pyi b/python/pyarrow-stubs/pyarrow/_parquet.pyi
new file mode 100644
index 00000000000..2521936ad5c
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_parquet.pyi
@@ -0,0 +1,524 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from collections.abc import Iterable, Iterator, Sequence
+from typing import IO, Any, Literal, TypeAlias, TypedDict
+
+from _typeshed import StrPath
+
+from ._stubs_typing import Order
+from .lib import (
+ Buffer,
+ ChunkedArray,
+ KeyValueMetadata,
+ MemoryPool,
+ NativeFile,
+ RecordBatch,
+ Schema,
+ Table,
+ _Weakrefable,
+ DataType,
+ ListType,
+ LargeListType
+)
+
+_PhysicalType: TypeAlias = Literal[
+ "BOOLEAN",
+ "INT32",
+ "INT64",
+ "INT96",
+ "FLOAT",
+ "DOUBLE",
+ "BYTE_ARRAY",
+ "FIXED_LEN_BYTE_ARRAY",
+ "UNKNOWN",
+]
+_LogicTypeName: TypeAlias = Literal[
+ "UNDEFINED",
+ "STRING",
+ "MAP",
+ "LIST",
+ "ENUM",
+ "DECIMAL",
+ "DATE",
+ "TIME",
+ "TIMESTAMP",
+ "INT",
+ "FLOAT16",
+ "JSON",
+ "BSON",
+ "UUID",
+ "NONE",
+ "UNKNOWN",
+]
+_ConvertedType: TypeAlias = Literal[
+ "NONE",
+ "UTF8",
+ "MAP",
+ "MAP_KEY_VALUE",
+ "LIST",
+ "ENUM",
+ "DECIMAL",
+ "DATE",
+ "TIME_MILLIS",
+ "TIME_MICROS",
+ "TIMESTAMP_MILLIS",
+ "TIMESTAMP_MICROS",
+ "UINT_8",
+ "UINT_16",
+ "UINT_32",
+ "UINT_64",
+ "INT_8",
+ "INT_16",
+ "INT_32",
+ "INT_64",
+ "JSON",
+ "BSON",
+ "INTERVAL",
+ "UNKNOWN",
+]
+_Encoding: TypeAlias = Literal[
+ "PLAIN",
+ "PLAIN_DICTIONARY",
+ "RLE",
+ "BIT_PACKED",
+ "DELTA_BINARY_PACKED",
+ "DELTA_LENGTH_BYTE_ARRAY",
+ "DELTA_BYTE_ARRAY",
+ "RLE_DICTIONARY",
+ "BYTE_STREAM_SPLIT",
+ "UNKNOWN",
+]
+_Compression: TypeAlias = Literal[
+ "UNCOMPRESSED",
+ "SNAPPY",
+ "GZIP",
+ "LZO",
+ "BROTLI",
+ "LZ4",
+ "ZSTD",
+ "UNKNOWN",
+]
+
+
+class _Statistics(TypedDict):
+ has_min_max: bool
+ min: Any | None
+ max: Any | None
+ null_count: int | None
+ distinct_count: int | None
+ num_values: int
+ physical_type: _PhysicalType
+
+
+class Statistics(_Weakrefable):
+ def to_dict(self) -> _Statistics: ...
+ def equals(self, other: Statistics) -> bool: ...
+ @property
+ def has_min_max(self) -> bool: ...
+ @property
+ def has_null_count(self) -> bool: ...
+ @property
+ def has_distinct_count(self) -> bool: ...
+ @property
+ def min_raw(self) -> Any | None: ...
+ @property
+ def max_raw(self) -> Any | None: ...
+ @property
+ def min(self) -> Any | None: ...
+ @property
+ def max(self) -> Any | None: ...
+ @property
+ def null_count(self) -> int | None: ...
+ @property
+ def distinct_count(self) -> int | None: ...
+ @property
+ def num_values(self) -> int: ...
+ @property
+ def physical_type(self) -> _PhysicalType: ...
+ @property
+ def logical_type(self) -> ParquetLogicalType: ...
+ @property
+ def converted_type(self) -> _ConvertedType | None: ...
+ @property
+ def is_min_exact(self) -> bool: ...
+ @property
+ def is_max_exact(self) -> bool: ...
+
+
+class ParquetLogicalType(_Weakrefable):
+ def to_json(self) -> str: ...
+ @property
+ def type(self) -> _LogicTypeName: ...
+
+
+class _ColumnChunkMetaData(TypedDict):
+ file_offset: int
+ file_path: str | None
+ physical_type: _PhysicalType
+ num_values: int
+ path_in_schema: str
+ is_stats_set: bool
+ statistics: Statistics | None
+ compression: _Compression
+ encodings: tuple[_Encoding, ...]
+ has_dictionary_page: bool
+ dictionary_page_offset: int | None
+ data_page_offset: int
+ total_compressed_size: int
+ total_uncompressed_size: int
+
+
+class ColumnChunkMetaData(_Weakrefable):
+ def to_dict(self) -> _ColumnChunkMetaData: ...
+ def equals(self, other: ColumnChunkMetaData) -> bool: ...
+ @property
+ def file_offset(self) -> int: ...
+ @property
+ def file_path(self) -> str | None: ...
+ @property
+ def physical_type(self) -> _PhysicalType: ...
+ @property
+ def num_values(self) -> int: ...
+ @property
+ def path_in_schema(self) -> str: ...
+ @property
+ def is_stats_set(self) -> bool: ...
+ @property
+ def statistics(self) -> Statistics | None: ...
+ @property
+ def compression(self) -> _Compression: ...
+ @property
+ def encodings(self) -> tuple[_Encoding, ...]: ...
+ @property
+ def has_dictionary_page(self) -> bool: ...
+ @property
+ def dictionary_page_offset(self) -> int | None: ...
+ @property
+ def data_page_offset(self) -> int: ...
+ @property
+ def has_index_page(self) -> bool: ...
+ @property
+ def index_page_offset(self) -> int: ...
+ @property
+ def total_compressed_size(self) -> int: ...
+ @property
+ def total_uncompressed_size(self) -> int: ...
+ @property
+ def has_offset_index(self) -> bool: ...
+ @property
+ def has_column_index(self) -> bool: ...
+ @property
+ def metadata(self) -> dict[bytes, bytes] | None: ...
+ @property
+ def name(self) -> str: ...
+ @property
+ def max_definition_level(self) -> int: ...
+ @property
+ def max_repetition_level(self) -> int: ...
+ @property
+ def converted_type(self) -> _ConvertedType: ...
+ @property
+ def logical_type(self) -> ParquetLogicalType: ...
+
+
+class _SortingColumn(TypedDict):
+ column_index: int
+ descending: bool
+ nulls_first: bool
+
+
+class SortingColumn:
+ def __init__(
+ self, column_index: int, descending: bool = False, nulls_first: bool = False
+ ) -> None: ...
+
+ @classmethod
+ def from_ordering(
+ cls,
+ schema: Schema,
+ sort_keys: Sequence[str]
+ | Sequence[tuple[str, Order]]
+ | Sequence[str | tuple[str, Order]],
+ null_placement: Literal["at_start", "at_end"] = "at_end",
+ ) -> tuple[SortingColumn, ...]: ...
+
+ @staticmethod
+ def to_ordering(
+ schema: Schema, sorting_columns: tuple[SortingColumn, ...] | list[SortingColumn]
+ ) -> tuple[Sequence[tuple[str, Order]], Literal["at_start", "at_end"]]: ...
+ def __hash__(self) -> int: ...
+ @property
+ def column_index(self) -> int: ...
+ @property
+ def descending(self) -> bool: ...
+ @property
+ def nulls_first(self) -> bool: ...
+ def to_dict(self) -> _SortingColumn: ...
+
+
+class _RowGroupMetaData(TypedDict):
+ num_columns: int
+ num_rows: int
+ total_byte_size: int
+ columns: list[ColumnChunkMetaData]
+ sorting_columns: list[SortingColumn]
+
+
+class RowGroupMetaData(_Weakrefable):
+ def __init__(self, parent: FileMetaData, index: int) -> None: ...
+ def equals(self, other: RowGroupMetaData) -> bool: ...
+ def column(self, i: int) -> ColumnChunkMetaData: ...
+ def to_dict(self) -> _RowGroupMetaData: ...
+ @property
+ def num_columns(self) -> int: ...
+ @property
+ def num_rows(self) -> int: ...
+ @property
+ def total_byte_size(self) -> int: ...
+ @property
+ def sorting_columns(self) -> list[SortingColumn]: ...
+
+
+class _FileMetaData(TypedDict):
+ created_by: str
+ num_columns: int
+ num_rows: int
+ num_row_groups: int
+ format_version: str
+ serialized_size: int
+ row_groups: list[Any] # List of row group metadata dictionaries
+
+
+class FileMetaData(_Weakrefable):
+ def __hash__(self) -> int: ...
+ def to_dict(self) -> _FileMetaData: ...
+ def equals(self, other: FileMetaData) -> bool: ...
+ @property
+ def schema(self) -> ParquetSchema: ...
+ @property
+ def serialized_size(self) -> int: ...
+ @property
+ def num_columns(self) -> int: ...
+ @property
+ def num_rows(self) -> int: ...
+ @property
+ def num_row_groups(self) -> int: ...
+ @property
+ def format_version(self) -> str: ...
+ @property
+ def created_by(self) -> str: ...
+ @property
+ def metadata(self) -> dict[bytes, bytes] | None: ...
+ def row_group(self, i: int) -> RowGroupMetaData: ...
+ def set_file_path(self, path: str) -> None: ...
+ def append_row_groups(self, other: FileMetaData) -> None: ...
+ def write_metadata_file(self, where: StrPath | Buffer |
+ NativeFile | IO) -> None: ...
+
+
+class ParquetSchema(_Weakrefable):
+ def __init__(self, container: FileMetaData) -> None: ...
+ def __getitem__(self, i: int) -> ColumnSchema: ...
+ def __hash__(self) -> int: ...
+ def __len__(self) -> int: ...
+ @property
+ def names(self) -> list[str]: ...
+ def to_arrow_schema(self) -> Schema: ...
+ def equals(self, other: ParquetSchema) -> bool: ...
+ def column(self, i: int) -> ColumnSchema: ...
+
+
+class ColumnSchema(_Weakrefable):
+ def __init__(self, schema: ParquetSchema, index: int) -> None: ...
+ def equals(self, other: ColumnSchema) -> bool: ...
+ @property
+ def name(self) -> str: ...
+ @property
+ def path(self) -> str: ...
+ @property
+ def max_definition_level(self) -> int: ...
+ @property
+ def max_repetition_level(self) -> int: ...
+ @property
+ def physical_type(self) -> _PhysicalType: ...
+ @property
+ def logical_type(self) -> ParquetLogicalType: ...
+ @property
+ def converted_type(self) -> _ConvertedType | None: ...
+ @property
+ def length(self) -> int | None: ...
+ @property
+ def precision(self) -> int | None: ...
+ @property
+ def scale(self) -> int | None: ...
+
+
+class ParquetReader(_Weakrefable):
+ def __init__(self, memory_pool: MemoryPool | None = None) -> None: ...
+
+ def open(
+ self,
+ source: StrPath | Buffer | NativeFile | IO,
+ *,
+ use_memory_map: bool = False,
+ read_dictionary: Iterable[int] | Iterable[str] | None = None,
+ metadata: FileMetaData | None = None,
+ binary_type: DataType | None = None,
+ list_type: ListType | LargeListType | None = None,
+ buffer_size: int = 0,
+ pre_buffer: bool = False,
+ coerce_int96_timestamp_unit: str | None = None,
+ decryption_properties: FileDecryptionProperties | None = None,
+ thrift_string_size_limit: int | None = None,
+ thrift_container_size_limit: int | None = None,
+ page_checksum_verification: bool = False,
+ arrow_extensions_enabled: bool | None = None,
+ ) -> None: ...
+
+ @property
+ def column_paths(self) -> list[str]: ...
+ @property
+ def metadata(self) -> FileMetaData: ...
+ @property
+ def schema_arrow(self) -> Schema: ...
+ @property
+ def num_row_groups(self) -> int: ...
+ def set_use_threads(self, use_threads: bool) -> None: ...
+ def set_batch_size(self, batch_size: int) -> None: ...
+
+ def iter_batches(
+ self,
+ batch_size: int = 65536,
+ row_groups: list[int] | range | None = None,
+ column_indices: list[str] | list[int] | None = None,
+ use_threads: bool = True,
+ use_pandas_metadata: bool = False,
+ ) -> Iterator[RecordBatch]: ...
+
+ def read_row_group(
+ self, i: int, column_indices: list[int] | None = None, use_threads: bool = True
+ ) -> Table: ...
+
+ def read_row_groups(
+ self,
+ row_groups: Sequence[int] | range,
+ column_indices: list[str] | list[int] | None = None,
+ use_threads: bool = True,
+ use_pandas_metadata: bool = False,
+ ) -> Table: ...
+
+ def read_all(
+ self, column_indices: list[int] | None = None, use_threads: bool = True
+ ) -> Table: ...
+
+ def scan_contents(
+ self, columns: Sequence[str] | Sequence[int] | None = None,
+ batch_size: int = 65536
+ ) -> int: ...
+
+ def column_name_idx(self, column_name: str) -> int: ...
+ def read_column(self, column_index: int) -> ChunkedArray: ...
+ def close(self) -> None: ...
+ @property
+ def closed(self) -> bool: ...
+
+
+class ParquetWriter(_Weakrefable):
+ def __init__(
+ self,
+ where: StrPath | NativeFile | IO,
+ schema: Schema,
+ use_dictionary: bool | list[str] | None = None,
+ compression: _Compression | dict[str, _Compression] | str | None = None,
+ version: str | None = None,
+ write_statistics: bool | list[str] | None = None,
+ memory_pool: MemoryPool | None = None,
+ use_deprecated_int96_timestamps: bool = False,
+ coerce_timestamps: Literal["ms", "us"] | None = None,
+ data_page_size: int | None = None,
+ allow_truncated_timestamps: bool = False,
+ compression_level: int | dict[str, int] | None = None,
+ use_byte_stream_split: bool | list[str] = False,
+ column_encoding: _Encoding | dict[str, _Encoding] | None = None,
+ writer_engine_version: str | None = None,
+ data_page_version: str | None = None,
+ use_compliant_nested_type: bool = True,
+ encryption_properties: FileEncryptionProperties | None = None,
+ write_batch_size: int | None = None,
+ dictionary_pagesize_limit: int | None = None,
+ store_schema: bool = True,
+ write_page_index: bool = False,
+ write_page_checksum: bool = False,
+ sorting_columns: tuple[SortingColumn, ...] | None = None,
+ store_decimal_as_integer: bool = False,
+ write_time_adjusted_to_utc: bool = False,
+ max_rows_per_page: int | None = None,
+ ): ...
+ def close(self) -> None: ...
+ def write_table(self, table: Table, row_group_size: int | None = None) -> None: ...
+ def add_key_value_metadata(self, key_value_metadata: KeyValueMetadata) -> None: ...
+ @property
+ def metadata(self) -> FileMetaData: ...
+ @property
+ def use_dictionary(self) -> bool | list[str] | None: ...
+ @property
+ def use_deprecated_int96_timestamps(self) -> bool: ...
+ @property
+ def use_byte_stream_split(self) -> bool | list[str]: ...
+ @property
+ def column_encoding(self) -> _Encoding | dict[str, _Encoding] | None: ...
+ @property
+ def coerce_timestamps(self) -> Literal["ms", "us"] | None: ...
+ @property
+ def allow_truncated_timestamps(self) -> bool: ...
+ @property
+ def compression(self) -> _Compression | dict[str, _Compression] | None: ...
+ @property
+ def compression_level(self) -> int | dict[str, int] | None: ...
+ @property
+ def data_page_version(self) -> str | None: ...
+ @property
+ def use_compliant_nested_type(self) -> bool: ...
+ @property
+ def version(self) -> str | None: ...
+ @property
+ def write_statistics(self) -> bool | list[str] | None: ...
+ @property
+ def writer_engine_version(self) -> str: ...
+ @property
+ def row_group_size(self) -> int: ...
+ @property
+ def data_page_size(self) -> int: ...
+ @property
+ def encryption_properties(self) -> FileEncryptionProperties: ...
+ @property
+ def write_batch_size(self) -> int: ...
+ @property
+ def dictionary_pagesize_limit(self) -> int: ...
+ @property
+ def store_schema(self) -> bool: ...
+ @property
+ def store_decimal_as_integer(self) -> bool: ...
+
+
+class FileEncryptionProperties:
+ ...
+
+
+class FileDecryptionProperties:
+ ...
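Editor's note: a minimal sketch touching the metadata and statistics classes annotated above; the file name is a placeholder.

    import pyarrow as pa
    import pyarrow.parquet as pq

    pq.write_table(pa.table({"x": [1, 2, 3]}), "data.parquet", compression="ZSTD")
    meta = pq.read_metadata("data.parquet")            # FileMetaData
    stats = meta.row_group(0).column(0).statistics     # Statistics | None
    if stats is not None and stats.has_min_max:
        print(stats.min, stats.max, stats.physical_type)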
diff --git a/python/pyarrow-stubs/pyarrow/_parquet_encryption.pyi b/python/pyarrow-stubs/pyarrow/_parquet_encryption.pyi
new file mode 100644
index 00000000000..74b50ce665d
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_parquet_encryption.pyi
@@ -0,0 +1,141 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import datetime as dt
+import pathlib
+
+from collections.abc import Callable
+
+from pyarrow._fs import FileSystem
+from ._parquet import FileDecryptionProperties, FileEncryptionProperties
+from .lib import _Weakrefable
+
+
+class EncryptionConfiguration(_Weakrefable):
+ footer_key: str
+ column_keys: dict[str, list[str]]
+ encryption_algorithm: str
+ plaintext_footer: bool
+ double_wrapping: bool
+ cache_lifetime: dt.timedelta
+ internal_key_material: bool
+ data_key_length_bits: int
+ uniform_encryption: bool
+
+ def __init__(
+ self,
+ footer_key: str,
+ *,
+ column_keys: dict[str, str | list[str]] | None = None,
+ encryption_algorithm: str | None = None,
+ plaintext_footer: bool | None = None,
+ double_wrapping: bool | None = None,
+ cache_lifetime: dt.timedelta | None = None,
+ internal_key_material: bool | None = None,
+ data_key_length_bits: int | None = None,
+ uniform_encryption: bool | None = None,
+ ) -> None: ...
+
+
+class DecryptionConfiguration(_Weakrefable):
+ cache_lifetime: dt.timedelta
+ def __init__(self, *, cache_lifetime: dt.timedelta | None = None): ...
+
+
+class KmsConnectionConfig(_Weakrefable):
+ kms_instance_id: str
+ kms_instance_url: str
+ key_access_token: str
+ custom_kms_conf: dict[str, str]
+
+ def __init__(
+ self,
+ *,
+ kms_instance_id: str | None = None,
+ kms_instance_url: str | None = None,
+ key_access_token: str | None = None,
+ custom_kms_conf: dict[str, str] | None = None,
+ ) -> None: ...
+ def refresh_key_access_token(self, value: str) -> None: ...
+
+
+class KmsClient(_Weakrefable):
+ def wrap_key(self, key_bytes: bytes, master_key_identifier: str) -> str: ...
+ def unwrap_key(self, wrapped_key: str, master_key_identifier: str) -> bytes: ...
+
+
+class CryptoFactory(_Weakrefable):
+ def __init__(self, kms_client_factory: Callable[[
+ KmsConnectionConfig], KmsClient]): ...
+
+ def file_encryption_properties(
+ self,
+ kms_connection_config: KmsConnectionConfig,
+ encryption_config: EncryptionConfiguration,
+ ) -> FileEncryptionProperties: ...
+
+ def file_decryption_properties(
+ self,
+ kms_connection_config: KmsConnectionConfig,
+ decryption_config: DecryptionConfiguration | None = None,
+ ) -> FileDecryptionProperties: ...
+ def remove_cache_entries_for_token(self, access_token: str) -> None: ...
+ def remove_cache_entries_for_all_tokens(self) -> None: ...
+ def rotate_master_keys(
+ self,
+ kms_connection_config: KmsConnectionConfig,
+ parquet_file_path: str | pathlib.Path,
+ filesystem: FileSystem | None = None,
+ double_wrapping: bool = True,
+ cache_lifetime_seconds: int | float = 600,
+ ) -> None: ...
+
+
+class KeyMaterial(_Weakrefable):
+ @property
+ def is_footer_key(self) -> bool: ...
+ @property
+ def is_double_wrapped(self) -> bool: ...
+ @property
+ def master_key_id(self) -> str: ...
+ @property
+ def wrapped_dek(self) -> str: ...
+ @property
+ def kek_id(self) -> str: ...
+ @property
+ def wrapped_kek(self) -> str: ...
+ @property
+ def kms_instance_id(self) -> str: ...
+ @property
+ def kms_instance_url(self) -> str: ...
+ @staticmethod
+ def wrap(key_material: KeyMaterial) -> KeyMaterial: ...
+ @staticmethod
+ def parse(key_material_string: str) -> KeyMaterial: ...
+
+
+class FileSystemKeyMaterialStore(_Weakrefable):
+ def get_key_material(self, key_id: str) -> KeyMaterial: ...
+ def get_key_id_set(self) -> list[str]: ...
+ @classmethod
+ def for_file(
+ cls,
+ parquet_file_path: str | pathlib.Path, /,
+ filesystem: FileSystem | None = None
+ ) -> FileSystemKeyMaterialStore:
+ ...
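Editor's note: a sketch of the KMS wiring these classes describe, via the public pyarrow.parquet.encryption module. The client class and key identifiers are placeholders and the wrap/unwrap bodies are elided, so the commented call would only work with a real KMS client.

    import pyarrow.parquet.encryption as pe

    class PlaceholderKmsClient(pe.KmsClient):
        def wrap_key(self, key_bytes, master_key_identifier): ...      # elided
        def unwrap_key(self, wrapped_key, master_key_identifier): ...  # elided

    factory = pe.CryptoFactory(lambda config: PlaceholderKmsClient())
    encryption_config = pe.EncryptionConfiguration(
        footer_key="footer_key_id",
        column_keys={"column_key_id": ["sensitive_column"]},
    )
    # With a real KMS client, encryption properties would be derived as:
    # props = factory.file_encryption_properties(pe.KmsConnectionConfig(), encryption_config)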
diff --git a/python/pyarrow-stubs/pyarrow/_s3fs.pyi b/python/pyarrow-stubs/pyarrow/_s3fs.pyi
new file mode 100644
index 00000000000..f82f34d2cae
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_s3fs.pyi
@@ -0,0 +1,106 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import enum
+
+from typing import Literal, TypedDict
+from typing_extensions import Required, NotRequired
+
+from ._fs import FileSystem
+from .lib import KeyValueMetadata
+
+
+class _ProxyOptions(TypedDict):
+ scheme: Required[Literal["http", "https"]]
+ host: Required[str]
+ port: Required[int]
+ username: NotRequired[str]
+ password: NotRequired[str]
+
+
+class S3LogLevel(enum.IntEnum):
+ Off = enum.auto()
+ Fatal = enum.auto()
+ Error = enum.auto()
+ Warn = enum.auto()
+ Info = enum.auto()
+ Debug = enum.auto()
+ Trace = enum.auto()
+
+
+Off = S3LogLevel.Off
+Fatal = S3LogLevel.Fatal
+Error = S3LogLevel.Error
+Warn = S3LogLevel.Warn
+Info = S3LogLevel.Info
+Debug = S3LogLevel.Debug
+Trace = S3LogLevel.Trace
+
+
+def initialize_s3(
+ log_level: S3LogLevel = S3LogLevel.Fatal, num_event_loop_threads: int = 1
+) -> None: ...
+def ensure_s3_initialized() -> None: ...
+def finalize_s3() -> None: ...
+def ensure_s3_finalized() -> None: ...
+def resolve_s3_region(bucket: str) -> str: ...
+
+
+class S3RetryStrategy:
+ max_attempts: int
+ def __init__(self, max_attempts: int = 3) -> None: ...
+
+
+class AwsStandardS3RetryStrategy(S3RetryStrategy):
+ ...
+
+
+class AwsDefaultS3RetryStrategy(S3RetryStrategy):
+ ...
+
+
+class S3FileSystem(FileSystem):
+ def __init__(
+ self,
+ *,
+ access_key: str | None = None,
+ secret_key: str | None = None,
+ session_token: str | None = None,
+ anonymous: bool = False,
+ region: str | None = None,
+ request_timeout: float | None = None,
+ connect_timeout: float | None = None,
+ scheme: Literal["http", "https"] = "https",
+ endpoint_override: str | None = None,
+ background_writes: bool = True,
+ default_metadata: dict | list | KeyValueMetadata | None = None,
+ role_arn: str | None = None,
+ session_name: str | None = None,
+ external_id: str | None = None,
+ load_frequency: int = 900,
+ proxy_options: _ProxyOptions | dict | tuple | str | None = None,
+ allow_bucket_creation: bool = False,
+ allow_bucket_deletion: bool = False,
+ allow_delayed_open: bool = False,
+ check_directory_existence_before_creation: bool = False,
+ tls_ca_file_path: str | None = None,
+ retry_strategy: S3RetryStrategy =
+ AwsStandardS3RetryStrategy(max_attempts=3), # noqa: Y011
+ force_virtual_addressing: bool = False,
+ ): ...
+ @property
+ def region(self) -> str: ...
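Editor's note: typical construction matching the constructor annotated above; the region value is a placeholder.

    from pyarrow import fs

    s3 = fs.S3FileSystem(region="us-east-1", anonymous=True, request_timeout=10)
    print(s3.region)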
diff --git a/python/pyarrow-stubs/pyarrow/_stubs_typing.pyi b/python/pyarrow-stubs/pyarrow/_stubs_typing.pyi
new file mode 100644
index 00000000000..0715012fddc
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_stubs_typing.pyi
@@ -0,0 +1,133 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import datetime as dt
+
+from collections.abc import Collection, Iterator, Sequence
+from decimal import Decimal
+from typing import Any, Literal, Protocol, TypeAlias, TypeVar
+
+import numpy as np
+
+from numpy.typing import NDArray
+
+from pyarrow.lib import BooleanArray, IntegerArray, ChunkedArray
+
+ArrayLike: TypeAlias = Any
+ScalarLike: TypeAlias = Any
+Order: TypeAlias = Literal["ascending", "descending"]
+JoinType: TypeAlias = Literal[
+ "left semi",
+ "right semi",
+ "left anti",
+ "right anti",
+ "inner",
+ "left outer",
+ "right outer",
+ "full outer",
+]
+Compression: TypeAlias = Literal[
+ "gzip", "bz2", "brotli", "lz4", "lz4_frame", "lz4_raw", "zstd", "snappy"
+]
+NullEncoding: TypeAlias = Literal["mask", "encode"]
+NullSelectionBehavior: TypeAlias = Literal["drop", "emit_null"]
+TimeUnit: TypeAlias = Literal["s", "ms", "us", "ns"]
+Mask: TypeAlias = (
+ Sequence[bool | None]
+ | NDArray[np.bool_]
+ | BooleanArray
+ | ChunkedArray[Any]
+)
+Indices: TypeAlias = (
+ Sequence[int | None]
+ | NDArray[np.integer[Any]]
+ | IntegerArray
+ | ChunkedArray[Any]
+)
+
+PyScalar: TypeAlias = (bool | int | float | Decimal | str | bytes |
+ dt.date | dt.datetime | dt.time | dt.timedelta)
+
+_T = TypeVar("_T")
+_V = TypeVar("_V", covariant=True)
+
+SingleOrList: TypeAlias = list[_T] | _T
+
+
+class SupportEq(Protocol):
+ def __eq__(self, other) -> bool: ...
+
+
+class SupportLt(Protocol):
+ def __lt__(self, other) -> bool: ...
+
+
+class SupportGt(Protocol):
+ def __gt__(self, other) -> bool: ...
+
+
+class SupportLe(Protocol):
+ def __le__(self, other) -> bool: ...
+
+
+class SupportGe(Protocol):
+ def __ge__(self, other) -> bool: ...
+
+
+FilterTuple: TypeAlias = (
+ tuple[str, Literal["=", "==", "!="], SupportEq]
+ | tuple[str, Literal["<"], SupportLt]
+ | tuple[str, Literal[">"], SupportGt]
+ | tuple[str, Literal["<="], SupportLe]
+ | tuple[str, Literal[">="], SupportGe]
+ | tuple[str, Literal["in", "not in"], Collection]
+ | tuple[str, str, Any] # Allow general str for operator to avoid type errors
+)
+
+
+class Buffer(Protocol):
+ ...
+
+
+class SupportPyBuffer(Protocol):
+ ...
+
+
+class SupportArrowStream(Protocol):
+ def __arrow_c_stream__(self, requested_schema=None) -> Any: ...
+
+
+class SupportPyArrowArray(Protocol):
+ def __arrow_array__(self, type=None) -> Any: ...
+
+
+class SupportArrowArray(Protocol):
+ def __arrow_c_array__(self, requested_schema=None) -> Any: ...
+
+
+class SupportArrowDeviceArray(Protocol):
+ def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: ...
+
+
+class SupportArrowSchema(Protocol):
+ def __arrow_c_schema__(self) -> Any: ...
+
+
+class NullableCollection(Protocol[_V]): # type: ignore[reportInvalidTypeVarUse]
+ def __iter__(self) -> Iterator[_V] | Iterator[_V | None]: ...
+ def __len__(self) -> int: ...
+ def __contains__(self, item: Any, /) -> bool: ...
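Editor's note: these aliases are intended for annotating user code. An illustrative, non-pyarrow helper is sketched below; since _stubs_typing is assumed to be stub-only, the import is guarded with TYPE_CHECKING and the annotations are written as strings.

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Stub-only module: import the aliases for type checkers only.
        from pyarrow._stubs_typing import FilterTuple, Order

    def sort_key(column: str, order: "Order" = "ascending") -> "tuple[str, Order]":
        # Pair a column name with its sort order, in the shape sorting APIs expect.
        return (column, order)

    predicate: "FilterTuple" = ("year", ">=", 2020)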
diff --git a/python/pyarrow-stubs/pyarrow/_substrait.pyi b/python/pyarrow-stubs/pyarrow/_substrait.pyi
new file mode 100644
index 00000000000..6818d9822ab
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_substrait.pyi
@@ -0,0 +1,64 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from collections.abc import Callable
+from typing import Any
+
+from ._compute import Expression
+from .lib import Buffer, RecordBatchReader, Schema, Table, _Weakrefable
+
+
+def run_query(
+ plan: Buffer | bytes,
+ *,
+ table_provider: Callable[[list[str], Schema], Table] | None = None,
+ use_threads: bool = True,
+) -> RecordBatchReader: ...
+def _parse_json_plan(plan: bytes) -> Buffer: ...
+
+
+class SubstraitSchema:
+ schema: bytes
+ expression: bytes
+ def __init__(self, schema: bytes, expression: bytes) -> None: ...
+ def to_pysubstrait(self) -> Any: ...
+
+
+def serialize_schema(schema: Schema) -> SubstraitSchema: ...
+def deserialize_schema(buf: Buffer | bytes | SubstraitSchema) -> Schema: ...
+
+
+def serialize_expressions(
+ exprs: list[Expression],
+ names: list[str],
+ schema: Schema,
+ *,
+ allow_arrow_extensions: bool = False,
+) -> Buffer: ...
+
+
+class BoundExpressions(_Weakrefable):
+ @property
+ def schema(self) -> Schema: ...
+ @property
+ def expressions(self) -> dict[str, Expression]: ...
+ @classmethod
+ def from_substrait(cls, message: Buffer | bytes | Any) -> BoundExpressions: ...
+
+
+def deserialize_expressions(buf: Buffer | bytes) -> BoundExpressions: ...
+def get_supported_functions() -> list[str]: ...
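Editor's note: a sketch of the expression serialization round trip annotated above; the column name and expression are illustrative.

    import pyarrow as pa
    import pyarrow.compute as pc
    import pyarrow.substrait as substrait

    schema = pa.schema([("x", pa.int64())])
    buf = substrait.serialize_expressions([pc.field("x") > 1], ["gt_one"], schema)
    bound = substrait.deserialize_expressions(buf)      # BoundExpressions
    print(bound.expressions["gt_one"])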
diff --git a/python/pyarrow-stubs/pyarrow/_types.pyi b/python/pyarrow-stubs/pyarrow/_types.pyi
new file mode 100644
index 00000000000..6b7a58ccfe6
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_types.pyi
@@ -0,0 +1,966 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import datetime as dt # noqa: F401
+import sys
+
+from collections.abc import Mapping, Sequence, Iterable, Iterator
+from decimal import Decimal # noqa: F401
+
+if sys.version_info >= (3, 11):
+ from typing import Self
+else:
+ from typing_extensions import Self
+
+from typing import Any, Generic, Literal
+
+import numpy as np
+import pandas as pd
+
+from pyarrow._stubs_typing import SupportArrowSchema
+from pyarrow.lib import ( # noqa: F401
+ Array,
+ ChunkedArray,
+ ExtensionArray,
+ MemoryPool,
+ MonthDayNano,
+ Table,
+)
+from typing_extensions import TypeVar, deprecated
+
+from .io import Buffer
+from .scalar import ExtensionScalar
+from ._stubs_typing import TimeUnit
+
+class _Weakrefable:
+ ...
+
+
+class _Metadata(_Weakrefable):
+ ...
+
+
+class DataType(_Weakrefable):
+ def field(self, i: int) -> Field: ...
+
+ @property
+ def id(self) -> int: ...
+ @property
+ def bit_width(self) -> int: ...
+
+ @property
+ def byte_width(self) -> int: ...
+
+ @property
+ def num_fields(self) -> int: ...
+
+ @property
+ def num_buffers(self) -> int: ...
+
+ @property
+ def has_variadic_buffers(self) -> bool: ...
+
+ # Properties that exist on specific subtypes but accessed generically
+ @property
+ def list_size(self) -> int: ...
+
+ def __hash__(self) -> int: ...
+
+ def equals(self, other: DataType | str, *,
+ check_metadata: bool = False) -> bool: ...
+
+ def to_pandas_dtype(self) -> np.generic: ...
+
+ def _export_to_c(self, out_ptr: int) -> None: ...
+
+ @classmethod
+ def _import_from_c(cls, in_ptr: int) -> Self: ...
+
+ def __arrow_c_schema__(self) -> Any: ...
+
+ @classmethod
+ def _import_from_c_capsule(cls, schema) -> Self: ...
+
+
+_AsPyType = TypeVar("_AsPyType")
+_DataTypeT = TypeVar("_DataTypeT", bound=DataType)
+
+
+class _BasicDataType(DataType, Generic[_AsPyType]):
+ ...
+
+
+class NullType(_BasicDataType[None]):
+ ...
+
+
+class BoolType(_BasicDataType[bool]):
+ ...
+
+
+class UInt8Type(_BasicDataType[int]):
+ ...
+
+
+class Int8Type(_BasicDataType[int]):
+ ...
+
+
+class UInt16Type(_BasicDataType[int]):
+ ...
+
+
+class Int16Type(_BasicDataType[int]):
+ ...
+
+
+class UInt32Type(_BasicDataType[int]):
+ ...
+
+
+class Int32Type(_BasicDataType[int]):
+ ...
+
+
+class UInt64Type(_BasicDataType[int]):
+ ...
+
+
+class Int64Type(_BasicDataType[int]):
+ ...
+
+
+class Float16Type(_BasicDataType[float]):
+ ...
+
+
+class Float32Type(_BasicDataType[float]):
+ ...
+
+
+class Float64Type(_BasicDataType[float]):
+ ...
+
+
+class Date32Type(_BasicDataType[dt.date]):
+ ...
+
+
+class Date64Type(_BasicDataType[dt.date]):
+ ...
+
+
+class MonthDayNanoIntervalType(_BasicDataType[MonthDayNano]):
+ ...
+
+
+class StringType(_BasicDataType[str]):
+ ...
+
+
+class LargeStringType(_BasicDataType[str]):
+ ...
+
+
+class StringViewType(_BasicDataType[str]):
+ ...
+
+
+class BinaryType(_BasicDataType[bytes]):
+ ...
+
+
+class LargeBinaryType(_BasicDataType[bytes]):
+ ...
+
+
+class BinaryViewType(_BasicDataType[bytes]):
+ ...
+
+
+_Unit = TypeVar("_Unit", bound=TimeUnit, default=Literal["us"])
+_Tz = TypeVar("_Tz", str, None, default=None)
+
+
+class TimestampType(_BasicDataType[int], Generic[_Unit, _Tz]):
+
+ @property
+ def unit(self) -> _Unit: ...
+
+ @property
+ def tz(self) -> _Tz: ...
+
+
+_Time32Unit = TypeVar("_Time32Unit", bound=Literal["s", "ms"])
+
+
+class Time32Type(_BasicDataType[dt.time], Generic[_Time32Unit]):
+ @property
+ def unit(self) -> _Time32Unit: ...
+
+
+_Time64Unit = TypeVar("_Time64Unit", bound=Literal["us", "ns"])
+
+
+class Time64Type(_BasicDataType[dt.time], Generic[_Time64Unit]):
+ @property
+ def unit(self) -> _Time64Unit: ...
+
+
+class DurationType(_BasicDataType[dt.timedelta], Generic[_Unit]):
+ @property
+ def unit(self) -> _Unit: ...
+
+
+class FixedSizeBinaryType(_BasicDataType[Decimal]):
+ ...
+
+
+_Precision = TypeVar("_Precision", default=Any)
+_Scale = TypeVar("_Scale", default=Any)
+
+
+class Decimal32Type(FixedSizeBinaryType, Generic[_Precision, _Scale]):
+ @property
+ def precision(self) -> _Precision: ...
+
+ @property
+ def scale(self) -> _Scale: ...
+
+
+class Decimal64Type(FixedSizeBinaryType, Generic[_Precision, _Scale]):
+ @property
+ def precision(self) -> _Precision: ...
+
+ @property
+ def scale(self) -> _Scale: ...
+
+
+class Decimal128Type(FixedSizeBinaryType, Generic[_Precision, _Scale]):
+ @property
+ def precision(self) -> _Precision: ...
+
+ @property
+ def scale(self) -> _Scale: ...
+
+
+class Decimal256Type(FixedSizeBinaryType, Generic[_Precision, _Scale]):
+ @property
+ def precision(self) -> _Precision: ...
+
+ @property
+ def scale(self) -> _Scale: ...
+
+
+class ListType(DataType, Generic[_DataTypeT]):
+ @property
+ def value_field(self) -> Field[_DataTypeT]: ...
+
+ @property
+ def value_type(self) -> _DataTypeT: ...
+
+
+class LargeListType(DataType, Generic[_DataTypeT]):
+ @property
+ def value_field(self) -> Field[_DataTypeT]: ...
+ @property
+ def value_type(self) -> _DataTypeT: ...
+
+
+class ListViewType(DataType, Generic[_DataTypeT]):
+ @property
+ def value_field(self) -> Field[_DataTypeT]: ...
+
+ @property
+ def value_type(self) -> _DataTypeT: ...
+
+
+class LargeListViewType(DataType, Generic[_DataTypeT]):
+ @property
+ def value_field(self) -> Field[_DataTypeT]: ...
+
+ @property
+ def value_type(self) -> _DataTypeT: ...
+
+
+class FixedSizeListType(DataType, Generic[_DataTypeT, _Size]):
+ @property
+ def value_field(self) -> Field[_DataTypeT]: ...
+
+ @property
+ def value_type(self) -> _DataTypeT: ...
+
+ @property
+ def list_size(self) -> int: ...
+
+
+class DictionaryMemo(_Weakrefable):
+ ...
+
+
+_IndexT = TypeVar(
+ "_IndexT",
+ UInt8Type,
+ Int8Type,
+ UInt16Type,
+ Int16Type,
+ UInt32Type,
+ Int32Type,
+ UInt64Type,
+ Int64Type,
+)
+_BasicValueT = TypeVar("_BasicValueT", bound=_BasicDataType)
+_ValueT = TypeVar("_ValueT", bound=DataType)
+_Ordered = TypeVar("_Ordered", Literal[True], Literal[False], default=Literal[False])
+
+
+class DictionaryType(DataType, Generic[_IndexT, _BasicValueT, _Ordered]):
+ @property
+ def ordered(self) -> _Ordered: ...
+
+ @property
+ def index_type(self) -> _IndexT: ...
+
+ @property
+ def value_type(self) -> _BasicValueT: ...
+
+
+_K = TypeVar("_K", bound=DataType)
+
+
+class MapType(DataType, Generic[_K, _ValueT, _Ordered]):
+ @property
+ def key_field(self) -> Field[_K]: ...
+
+ @property
+ def key_type(self) -> _K: ...
+
+ @property
+ def item_field(self) -> Field[_ValueT]: ...
+
+ @property
+ def item_type(self) -> _ValueT: ...
+
+ @property
+ def keys_sorted(self) -> _Ordered: ...
+
+
+_Size = TypeVar("_Size", default=int)
+
+
+class StructType(DataType):
+ def get_field_index(self, name: str) -> int: ...
+
+ def field(self, i: int | str) -> Field: ...
+
+ def get_all_field_indices(self, name: str) -> list[int]: ...
+
+ def __len__(self) -> int: ...
+
+ def __iter__(self) -> Iterator[Field]: ...
+
+ __getitem__ = field
+ @property
+ def names(self) -> list[str]: ...
+
+ @property
+ def fields(self) -> list[Field]: ...
+
+
+class UnionType(DataType):
+ @property
+ def mode(self) -> Literal["sparse", "dense"]: ...
+
+ @property
+ def type_codes(self) -> list[int]: ...
+
+ def __len__(self) -> int: ...
+
+ def __iter__(self) -> Iterator[Field]: ...
+
+ def field(self, i: int) -> Field: ...
+
+ __getitem__ = field
+
+
+class SparseUnionType(UnionType):
+ @property
+ def mode(self) -> Literal["sparse"]: ...
+
+
+class DenseUnionType(UnionType):
+ @property
+ def mode(self) -> Literal["dense"]: ...
+
+
+_RunEndType = TypeVar("_RunEndType", Int16Type, Int32Type, Int64Type)
+
+
+class RunEndEncodedType(DataType, Generic[_RunEndType, _BasicValueT]):
+ @property
+ def run_end_type(self) -> _RunEndType: ...
+ @property
+ def value_type(self) -> _BasicValueT: ...
+
+
+_StorageT = TypeVar("_StorageT", bound=Array | ChunkedArray)
+
+
+class BaseExtensionType(DataType):
+ def __arrow_ext_class__(self) -> type[ExtensionArray]: ...
+
+ def __arrow_ext_scalar_class__(self) -> type[ExtensionScalar]: ...
+
+ @property
+ def extension_name(self) -> str: ...
+
+ @property
+ def storage_type(self) -> DataType: ...
+
+ def wrap_array(self, storage: _StorageT) -> _StorageT: ...
+
+
+class ExtensionType(BaseExtensionType):
+ def __init__(self, storage_type: DataType, extension_name: str) -> None: ...
+
+ def __arrow_ext_serialize__(self) -> bytes: ...
+
+ @classmethod
+ def __arrow_ext_deserialize__(
+ cls, storage_type: DataType, serialized: bytes) -> Self: ...
+
+
+class FixedShapeTensorType(BaseExtensionType, Generic[_ValueT]):
+ @property
+ def value_type(self) -> _ValueT: ...
+
+ @property
+ def shape(self) -> list[int]: ...
+
+ @property
+ def dim_names(self) -> list[str] | None: ...
+
+ @property
+ def permutation(self) -> list[int] | None: ...
+
+
+class Bool8Type(BaseExtensionType):
+ ...
+
+
+class UuidType(BaseExtensionType):
+ ...
+
+
+class JsonType(BaseExtensionType):
+ ...
+
+
+class OpaqueType(BaseExtensionType):
+ @property
+ def type_name(self) -> str: ...
+
+ @property
+ def vendor_name(self) -> str: ...
+
+
+class UnknownExtensionType(ExtensionType):
+ def __init__(self, storage_type: DataType, serialized: bytes) -> None: ...
+
+
+def register_extension_type(ext_type: ExtensionType) -> None: ...
+
+
+def unregister_extension_type(type_name: str) -> None: ...
+
+
+class KeyValueMetadata(_Metadata, Mapping[bytes, bytes]):
+ def __init__(
+ self, __arg0__: Mapping[str | bytes, str | bytes]
+ | Iterable[tuple[str, str]]
+ | KeyValueMetadata
+ | None = None, **kwargs: str
+ ) -> None: ...
+
+ def equals(self, other: KeyValueMetadata) -> bool: ...
+
+ def __len__(self) -> int: ...
+
+ def __contains__(self, /, __key: object) -> bool: ... # type: ignore[override]
+
+ def __getitem__(self, /, __key: Any) -> Any: ... # type: ignore[override]
+
+ def __iter__(self) -> Iterator[bytes]: ...
+
+ def get_all(self, key: str) -> list[bytes]: ...
+
+ def to_dict(self) -> dict[bytes, bytes]: ...
+
+
+class Field(_Weakrefable, Generic[_DataTypeT]):
+ def equals(self, other: Field, check_metadata: bool = False) -> bool: ...
+
+ def __hash__(self) -> int: ...
+
+ @property
+ def nullable(self) -> bool: ...
+
+ @property
+ def name(self) -> str: ...
+
+ @property
+ def metadata(self) -> dict[bytes, bytes] | None: ...
+
+ @property
+ def type(self) -> _DataTypeT: ...
+ def with_metadata(self, metadata: dict[bytes | str, bytes | str] |
+ Mapping[bytes | str, bytes | str] | Any) -> Self: ...
+
+ def remove_metadata(self) -> Self: ...
+
+ def with_type(self, new_type: DataType) -> Field: ...
+
+ def with_name(self, name: str) -> Self: ...
+
+ def with_nullable(self, nullable: bool) -> Field[_DataTypeT]: ...
+
+ def flatten(self) -> list[Field]: ...
+
+ def _export_to_c(self, out_ptr: int) -> None: ...
+
+ @classmethod
+ def _import_from_c(cls, in_ptr: int) -> Self: ...
+
+ def __arrow_c_schema__(self) -> Any: ...
+
+ @classmethod
+ def _import_from_c_capsule(cls, schema) -> Self: ...
+
+
+class Schema(_Weakrefable):
+ def __len__(self) -> int: ...
+
+ def __getitem__(self, key: str | int) -> Field: ...
+
+ _field = __getitem__
+ def __iter__(self) -> Iterator[Field]: ...
+
+ def __hash__(self) -> int: ...
+
+ def __sizeof__(self) -> int: ...
+ @property
+ def pandas_metadata(self) -> dict: ...
+
+ @property
+ def names(self) -> list[str]: ...
+
+ @property
+ def types(self) -> list[DataType]: ...
+
+ @property
+ def metadata(self) -> dict[bytes, bytes]: ...
+
+ def empty_table(self) -> Table: ...
+
+ def equals(self, other: Schema, check_metadata: bool = False) -> bool: ...
+
+ @classmethod
+ def from_pandas(cls, df: pd.DataFrame, preserve_index: bool |
+ None = None) -> Schema: ...
+
+ def field(self, i: int | str | bytes) -> Field: ...
+
+ @deprecated("Use 'field' instead")
+ def field_by_name(self, name: str) -> Field: ...
+
+ def get_field_index(self, name: str) -> int: ...
+
+ def get_all_field_indices(self, name: str) -> list[int]: ...
+
+ def append(self, field: Field) -> Schema: ...
+
+ def insert(self, i: int, field: Field) -> Schema: ...
+
+ def remove(self, i: int) -> Schema: ...
+
+ def set(self, i: int, field: Field) -> Schema: ...
+
+ @deprecated("Use 'with_metadata' instead")
+ def add_metadata(self, metadata: dict) -> Schema: ...
+
+ def with_metadata(self, metadata: dict) -> Schema: ...
+
+ def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: ...
+
+ def remove_metadata(self) -> Schema: ...
+
+ def to_string(
+ self,
+ truncate_metadata: bool = True,
+ show_field_metadata: bool = True,
+ show_schema_metadata: bool = True,
+ element_size_limit: int | None = None,
+ ) -> str: ...
+
+ def _export_to_c(self, out_ptr: int) -> None: ...
+
+ @classmethod
+ def _import_from_c(cls, in_ptr: int) -> Schema: ...
+
+ def __arrow_c_schema__(self) -> Any: ...
+
+ @staticmethod
+ def _import_from_c_capsule(schema: Any) -> Schema: ...
+
+
+def unify_schemas(
+ schemas: Sequence[Schema],
+ *,
+ promote_options: Literal["default", "permissive"] = "default"
+) -> Schema: ...
+
+
+def field(
+ name: SupportArrowSchema | str | Any, type: _DataTypeT | str | None = None,
+ nullable: bool = ...,
+ metadata: dict[Any, Any] | None = None
+) -> Field[_DataTypeT] | Field[Any]: ...
+
+
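+# Editor's sketch (an assumption, not verified with a type checker): `field("x", int64())`
+# is meant to carry the concrete type through, e.g. `Field[Int64Type]`, whereas the
+# string form `field("x", "int64")` can only be typed as `Field[Any]`.
+
+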
+def null() -> NullType: ...
+
+
+def bool_() -> BoolType: ...
+
+
+def uint8() -> UInt8Type: ...
+
+
+def int8() -> Int8Type: ...
+
+
+def uint16() -> UInt16Type: ...
+
+
+def int16() -> Int16Type: ...
+
+
+def uint32() -> UInt32Type: ...
+
+
+def int32() -> Int32Type: ...
+
+
+def int64() -> Int64Type: ...
+
+
+def uint64() -> UInt64Type: ...
+
+
+def timestamp(
+ unit: _Unit | str, tz: _Tz | None = None) -> TimestampType[_Unit, _Tz]: ...
+
+
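+# Editor's illustration (an assumption, not verified with a type checker): with the
+# TypeVar defaults declared above, `timestamp("s", tz="UTC")` is intended to resolve to
+# `TimestampType[Literal["s"], str]`, while a bare `timestamp("us")` falls back to the
+# defaults, i.e. `TimestampType[Literal["us"], None]`.
+
+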
+def time32(unit: _Time32Unit | str) -> Time32Type[_Time32Unit]: ...
+
+
+def time64(unit: _Time64Unit | str) -> Time64Type[_Time64Unit]: ...
+
+
+def duration(unit: _Unit | str) -> DurationType[_Unit]: ...
+
+
+def month_day_nano_interval() -> MonthDayNanoIntervalType: ...
+
+
+def date32() -> Date32Type: ...
+
+
+def date64() -> Date64Type: ...
+
+
+def float16() -> Float16Type: ...
+
+
+def float32() -> Float32Type: ...
+
+
+def float64() -> Float64Type: ...
+
+
+def decimal32(precision: _Precision, scale: _Scale |
+ None = None) -> Decimal32Type[_Precision, _Scale | Literal[0]]: ...
+
+
+def decimal64(precision: _Precision, scale: _Scale |
+ None = None) -> Decimal64Type[_Precision, _Scale | Literal[0]]: ...
+
+
+def decimal128(precision: _Precision, scale: _Scale |
+ None = None) -> Decimal128Type[_Precision, _Scale | Literal[0]]: ...
+
+
+def decimal256(precision: _Precision, scale: _Scale |
+ None = None) -> Decimal256Type[_Precision, _Scale | Literal[0]]: ...
+
+
+def string() -> StringType: ...
+
+
+utf8 = string
+
+
+def binary(length: Literal[-1] | int = ...) -> BinaryType | FixedSizeBinaryType: ...
+
+
+def large_binary() -> LargeBinaryType: ...
+
+
+def large_string() -> LargeStringType: ...
+
+
+large_utf8 = large_string
+
+
+def binary_view() -> BinaryViewType: ...
+
+
+def string_view() -> StringViewType: ...
+
+
+def list_(
+ value_type: _DataTypeT | Field[_DataTypeT] | None = None,
+ list_size: Literal[-1] | _Size | None = None
+) -> ListType[_DataTypeT] | FixedSizeListType[_DataTypeT, _Size]: ...
+
+
+def large_list(value_type: _DataTypeT |
+ Field[_DataTypeT] | None = None) -> LargeListType[_DataTypeT]: ...
+
+
+def list_view(value_type: _DataTypeT |
+ Field[_DataTypeT] | None = None) -> ListViewType[_DataTypeT]: ...
+
+
+def large_list_view(
+ value_type: _DataTypeT | Field[_DataTypeT] | None = None
+) -> LargeListViewType[_DataTypeT]: ...
+
+
+def map_(
+ key_type: _K | Field | str | None = None,
+ item_type: _ValueT | Field | str | None = None,
+ keys_sorted: bool | None = None
+) -> MapType[_K, _ValueT, Literal[False]]: ...
+
+
+def dictionary(
+ index_type: _IndexT | str,
+ value_type: _BasicValueT | str,
+ ordered: _Ordered | None = None
+) -> DictionaryType[_IndexT, _BasicValueT, _Ordered]: ...
+
+
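+# Editor's sketch (assumption): `dictionary(int32(), string())` is intended to be
+# inferred as `DictionaryType[Int32Type, StringType, Literal[False]]`, because
+# `_Ordered` defaults to `Literal[False]` when `ordered` is not passed.
+
+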
+def struct(
+ fields: Iterable[
+ Field[Any]
+ | tuple[str, Field[Any] | None]
+ | tuple[str, DataType | None]
+ ] | Mapping[str, Field[Any] | DataType | None],
+) -> StructType: ...
+
+
+def sparse_union(
+ child_fields: list[Field[Any]], type_codes: list[int] | None = None
+) -> SparseUnionType: ...
+
+
+def dense_union(
+ child_fields: list[Field[Any]], type_codes: list[int] | None = None
+) -> DenseUnionType: ...
+
+
+def union(
+ child_fields: list[Field[Any]], mode: Literal["sparse", "dense"] | int | str,
+ type_codes: list[int] | None = None) -> SparseUnionType | DenseUnionType: ...
+
+
+def run_end_encoded(
+ run_end_type: _RunEndType | str | None, value_type: _BasicValueT | str | None
+) -> RunEndEncodedType[_RunEndType, _BasicValueT]: ...
+
+
+def json_(storage_type: DataType = ...) -> JsonType: ...
+
+
+def uuid() -> UuidType: ...
+
+
+def fixed_shape_tensor(
+ value_type: _ValueT,
+ shape: Sequence[int],
+ dim_names: Sequence[str] | None = None,
+ permutation: Sequence[int] | None = None,
+) -> FixedShapeTensorType[_ValueT]: ...
+
+
+def bool8() -> Bool8Type: ...
+
+
+def opaque(storage_type: DataType, type_name: str, vendor_name: str) -> OpaqueType: ...
+
+
+def type_for_alias(name: Any) -> DataType: ...
+
+
+def schema(
+ fields: (
+ Iterable[Field[Any]]
+ | Iterable[tuple[str, DataType | str | None]]
+ | Mapping[Any, DataType | str | None]
+ ),
+ metadata: Mapping[bytes, bytes]
+ | Mapping[str, str]
+ | Mapping[bytes, str]
+ | Mapping[str, bytes] | None = None,
+) -> Schema: ...
+
+
+def from_numpy_dtype(dtype: np.dtype[Any] | type | str) -> DataType: ...
+
+
+__all__ = [
+ "_Weakrefable",
+ "_Metadata",
+ "DataType",
+ "_BasicDataType",
+ "NullType",
+ "BoolType",
+ "UInt8Type",
+ "Int8Type",
+ "UInt16Type",
+ "Int16Type",
+ "UInt32Type",
+ "Int32Type",
+ "UInt64Type",
+ "Int64Type",
+ "Float16Type",
+ "Float32Type",
+ "Float64Type",
+ "Date32Type",
+ "Date64Type",
+ "MonthDayNanoIntervalType",
+ "StringType",
+ "LargeStringType",
+ "StringViewType",
+ "BinaryType",
+ "LargeBinaryType",
+ "BinaryViewType",
+ "TimestampType",
+ "Time32Type",
+ "Time64Type",
+ "DurationType",
+ "FixedSizeBinaryType",
+ "Decimal32Type",
+ "Decimal64Type",
+ "Decimal128Type",
+ "Decimal256Type",
+ "ListType",
+ "LargeListType",
+ "ListViewType",
+ "LargeListViewType",
+ "FixedSizeListType",
+ "DictionaryMemo",
+ "DictionaryType",
+ "MapType",
+ "StructType",
+ "UnionType",
+ "SparseUnionType",
+ "DenseUnionType",
+ "RunEndEncodedType",
+ "BaseExtensionType",
+ "ExtensionType",
+ "FixedShapeTensorType",
+ "Bool8Type",
+ "UuidType",
+ "JsonType",
+ "OpaqueType",
+ "UnknownExtensionType",
+ "register_extension_type",
+ "unregister_extension_type",
+ "KeyValueMetadata",
+ "Field",
+ "Schema",
+ "unify_schemas",
+ "field",
+ "null",
+ "bool_",
+ "uint8",
+ "int8",
+ "uint16",
+ "int16",
+ "uint32",
+ "int32",
+ "int64",
+ "uint64",
+ "timestamp",
+ "time32",
+ "time64",
+ "duration",
+ "month_day_nano_interval",
+ "date32",
+ "date64",
+ "float16",
+ "float32",
+ "float64",
+ "decimal32",
+ "decimal64",
+ "decimal128",
+ "decimal256",
+ "string",
+ "utf8",
+ "binary",
+ "large_binary",
+ "large_string",
+ "large_utf8",
+ "binary_view",
+ "string_view",
+ "list_",
+ "large_list",
+ "list_view",
+ "large_list_view",
+ "map_",
+ "dictionary",
+ "struct",
+ "sparse_union",
+ "dense_union",
+ "union",
+ "run_end_encoded",
+ "json_",
+ "uuid",
+ "fixed_shape_tensor",
+ "bool8",
+ "opaque",
+ "type_for_alias",
+ "schema",
+ "from_numpy_dtype",
+ "_Unit",
+ "_Tz",
+ "_Time32Unit",
+ "_Time64Unit",
+ "_DataTypeT",
+]
diff --git a/python/pyarrow-stubs/pyarrow/array.pyi b/python/pyarrow-stubs/pyarrow/array.pyi
new file mode 100644
index 00000000000..547e9c949d5
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/array.pyi
@@ -0,0 +1,894 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+
+from collections.abc import Iterable, Iterator, Sequence
+
+if sys.version_info >= (3, 11):
+ from typing import Self
+else:
+ from typing_extensions import Self
+
+from typing import (
+ Any,
+ Generic,
+ Literal,
+ TypeVar,
+)
+
+import numpy as np
+import pandas as pd
+
+from pyarrow._compute import CastOptions
+from pyarrow._stubs_typing import (
+ ArrayLike,
+ Indices,
+ Mask,
+ Order,
+ SupportArrowArray,
+ SupportArrowDeviceArray,
+ SupportPyArrowArray,
+)
+from pyarrow.lib import (
+ Buffer,
+ Device,
+ MemoryManager,
+ MemoryPool,
+ Tensor,
+ _Weakrefable,
+)
+from typing_extensions import deprecated
+import builtins
+
+from .scalar import ( # noqa: F401
+ BinaryScalar,
+ BinaryViewScalar,
+ BooleanScalar,
+ Date32Scalar,
+ Date64Scalar,
+ DictionaryScalar,
+ DoubleScalar,
+ DurationScalar,
+ ExtensionScalar,
+ FixedSizeBinaryScalar,
+ FixedSizeListScalar,
+ FloatScalar,
+ HalfFloatScalar,
+ Int16Scalar,
+ Int32Scalar,
+ Int64Scalar,
+ Int8Scalar,
+ LargeBinaryScalar,
+ LargeListScalar,
+ LargeListViewScalar,
+ LargeStringScalar,
+ ListScalar,
+ ListViewScalar,
+ MapScalar,
+ MonthDayNanoIntervalScalar,
+ NullScalar,
+ RunEndEncodedScalar,
+ Scalar,
+ StringScalar,
+ StringViewScalar,
+ StructScalar,
+ Time32Scalar,
+ Time64Scalar,
+ TimestampScalar,
+ UInt16Scalar,
+ UInt32Scalar,
+ UInt64Scalar,
+ UInt8Scalar,
+ UnionScalar,
+)
+from .device import DeviceAllocationType
+from ._types import ( # noqa: F401
+ BaseExtensionType,
+ BinaryType,
+ DataType,
+ Field,
+ Float64Type,
+ Int64Type,
+ MapType,
+ StringType,
+ StructType,
+ _AsPyType,
+ _BasicDataType,
+ _BasicValueT,
+ _DataTypeT,
+ _IndexT,
+ _RunEndType,
+ _Size,
+ _Time32Unit,
+ _Time64Unit,
+ _Tz,
+ _Unit,
+)
+from ._stubs_typing import NullableCollection
+
+
+def array(
+ values: NullableCollection[Any] | Iterable[Any] | SupportArrowArray
+ | SupportArrowDeviceArray | SupportPyArrowArray,
+ type: Any | None = None,
+ mask: Mask | pd.Series[bool] | None = None,
+ size: int | None = None,
+ from_pandas: bool | None = None,
+ safe: bool = True,
+ memory_pool: MemoryPool | None = None,
+) -> ArrayLike: ...
+
+
+def asarray(
+ values: NullableCollection[Any] | Iterable[Any] | SupportArrowArray
+ | SupportArrowDeviceArray,
+ type: _DataTypeT | Any | None = None,
+) -> Array[Scalar[_DataTypeT]] | ArrayLike: ...
+
+
+def nulls(
+ size: int,
+ type: Any | None = None,
+ memory_pool: MemoryPool | None = None,
+) -> ArrayLike: ...
+
+
+def repeat(
+ value: Any,
+ size: int,
+ memory_pool: MemoryPool | None = None,
+) -> ArrayLike: ...
+
+
+def infer_type(values: Iterable[Any], mask: Mask | None = None,
+ from_pandas: bool = False) -> DataType: ...
+
+
+class ArrayStatistics(_Weakrefable):
+ @property
+ def null_count(self) -> int | None: ...
+
+ @property
+ def distinct_count(self) -> int | None: ...
+
+ @property
+ def is_null_count_exact(self) -> bool | None: ...
+
+ @property
+ def is_distinct_count_exact(self) -> bool | None: ...
+
+ @property
+ def min(self) -> Any | None: ...
+
+ @property
+ def is_min_exact(self) -> bool | None: ...
+
+ @property
+ def max(self) -> Any | None: ...
+
+ @property
+ def is_max_exact(self) -> bool | None: ...
+
+
+_ConvertAs = TypeVar("_ConvertAs", pd.DataFrame, pd.Series)
+
+
+class _PandasConvertible(_Weakrefable, Generic[_ConvertAs]):
+ def to_pandas(
+ self,
+ memory_pool: MemoryPool | None = None,
+ categories: list | tuple | None = None,
+ strings_to_categorical: bool = False,
+ zero_copy_only: bool = False,
+ integer_object_nulls: bool = False,
+ date_as_object: bool = True,
+ timestamp_as_object: bool = False,
+ use_threads: bool = True,
+ deduplicate_objects: bool = True,
+ ignore_metadata: bool = False,
+ safe: bool = True,
+ split_blocks: bool = False,
+ self_destruct: bool = False,
+ maps_as_pydicts: Literal["None", "lossy", "strict"] | None = None,
+ types_mapper: Any = None, # Callable[[DataType], ExtensionDtype | None] | None
+ coerce_temporal_nanoseconds: bool = False,
+ ) -> _ConvertAs: ...
+
+
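+# Editor's note: `_ConvertAs` ties the `to_pandas` return type to the container;
+# `Array` below parameterizes `_PandasConvertible[pd.Series]`, so `Array.to_pandas()`
+# is typed as returning a `pd.Series`.
+
+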
+_CastAs = TypeVar("_CastAs", bound=DataType)
+_Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True)
+_ScalarT = TypeVar("_ScalarT", bound=Scalar)
+
+
+class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]):
+ def as_py(self) -> list[Any]: ...
+
+ def diff(self, other: Self) -> str: ...
+
+ # Private attribute used internally (e.g., for column names in batches)
+ _name: str | None
+
+ def cast(
+ self,
+ target_type: _CastAs | str,
+ safe: bool = True,
+ options: CastOptions | None = None,
+ memory_pool: MemoryPool | None = None,
+ ) -> Array[Scalar[_CastAs]]: ...
+
+ def view(self, target_type: _CastAs) -> Array[Scalar[_CastAs]]: ...
+
+ def sum(self, **kwargs) -> _Scalar_co: ...
+
+ @property
+ def type(self: Array[Scalar[_DataTypeT]]) -> _DataTypeT: ...
+ def unique(self) -> Self: ...
+
+ def dictionary_encode(self, null_encoding: str = "mask") -> DictionaryArray: ...
+
+ def value_counts(self) -> StructArray: ...
+
+ @staticmethod
+ def from_pandas(
+ obj: pd.Series | np.ndarray | ArrayLike,
+ *,
+ mask: Mask | None = None,
+ type: _DataTypeT | None = None,
+ safe: bool = True,
+ memory_pool: MemoryPool | None = None,
+ ) -> Array[Scalar[_DataTypeT]] | Array[Scalar]: ...
+
+ @staticmethod
+ def from_buffers(
+ type: _DataTypeT,
+ length: int,
+ buffers: Sequence[Buffer | None],
+ null_count: int = -1,
+ offset: int = 0,
+ children: NullableCollection[Array[Scalar[_DataTypeT]]] | None = None,
+ ) -> Array[Scalar[_DataTypeT]]: ...
+
+ @property
+ def null_count(self) -> int: ...
+ @property
+ def nbytes(self) -> int: ...
+
+ def get_total_buffer_size(self) -> int: ...
+
+ def __sizeof__(self) -> int: ...
+ def __iter__(self) -> Iterator[_Scalar_co]: ...
+
+ def to_string(
+ self,
+ *,
+ indent: int = 2,
+ top_level_indent: int = 0,
+ window: int = 10,
+ container_window: int = 2,
+ skip_new_lines: bool = False,
+ ) -> str: ...
+
+ format = to_string
+ def equals(self, other: Array | Any) -> bool: ...
+
+ def __len__(self) -> int: ...
+
+ def is_null(self, *, nan_is_null: bool = False) -> BooleanArray: ...
+
+ def is_nan(self) -> BooleanArray: ...
+
+ def is_valid(self) -> BooleanArray: ...
+
+ def fill_null(
+ self: Array[Scalar[_BasicDataType[_AsPyType]]], fill_value: _AsPyType
+ ) -> Array[Scalar[_BasicDataType[_AsPyType]]]: ...
+
+ def __getitem__(self, key: int | builtins.slice) -> _Scalar_co | Self: ...
+
+ def slice(self, offset: int = 0, length: int | None = None) -> Self: ...
+
+ def take(self, indices: Indices) -> Self: ...
+
+ def drop_null(self) -> Self: ...
+
+ def filter(
+ self,
+ mask: Mask,
+ *,
+ null_selection_behavior: Literal["drop", "emit_null"] = "drop",
+ ) -> Self: ...
+
+ def index(
+ self: Array[_ScalarT] | Array[Scalar[_BasicDataType[_AsPyType]]],
+ value: _ScalarT | _AsPyType,
+ start: int | None = None,
+ end: int | None = None,
+ *,
+ memory_pool: MemoryPool | None = None,
+ ) -> Int64Scalar: ...
+
+ def sort(self, order: Order = "ascending", **kwargs) -> Self: ...
+
+ def __array__(self, dtype: np.dtype | None = None,
+ copy: bool | None = None) -> np.ndarray: ...
+
+ def to_numpy(self, zero_copy_only: bool = True,
+ writable: bool = False) -> np.ndarray: ...
+
+ def to_pylist(
+ self,
+ *,
+ maps_as_pydicts: Literal["lossy", "strict"] | None = None,
+ ) -> list[Any]: ...
+
+ tolist = to_pylist
+ def validate(self, *, full: bool = False) -> None: ...
+
+ @property
+ def offset(self) -> int: ...
+
+ def buffers(self) -> list[Buffer | None]: ...
+
+ def copy_to(self, destination: MemoryManager | Device) -> Self: ...
+
+ def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ...
+
+ @classmethod
+ def _import_from_c(cls, in_ptr: int, type: int | DataType) -> Self: ...
+
+ def __arrow_c_array__(self, requested_schema=None) -> Any: ...
+
+ @classmethod
+ def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: ...
+ def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ...
+
+ @classmethod
+ def _import_from_c_device(cls, in_ptr: int, type: DataType | int) -> Self: ...
+
+ def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: ...
+
+ @classmethod
+ def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: ...
+ def __dlpack__(self, stream: int | None = None) -> Any: ...
+
+ def __dlpack_device__(self) -> tuple[int, int]: ...
+
+ @property
+ def device_type(self) -> DeviceAllocationType: ...
+
+ @property
+ def is_cpu(self) -> bool: ...
+
+ @property
+ def statistics(self) -> ArrayStatistics | None: ...
+
+
+class NullArray(Array[NullScalar]):
+ ...
+
+
+class BooleanArray(Array[BooleanScalar]):
+ @property
+ def false_count(self) -> int: ...
+ @property
+ def true_count(self) -> int: ...
+
+
+class NumericArray(Array[_ScalarT]):
+ ...
+
+
+class IntegerArray(NumericArray[_ScalarT]):
+ ...
+
+
+class FloatingPointArray(NumericArray[_ScalarT]):
+ ...
+
+
+class Int8Array(IntegerArray[Int8Scalar]):
+ ...
+
+
+class UInt8Array(IntegerArray[UInt8Scalar]):
+ ...
+
+
+class Int16Array(IntegerArray[Int16Scalar]):
+ ...
+
+
+class UInt16Array(IntegerArray[UInt16Scalar]):
+ ...
+
+
+class Int32Array(IntegerArray[Int32Scalar]):
+ ...
+
+
+class UInt32Array(IntegerArray[UInt32Scalar]):
+ ...
+
+
+class Int64Array(IntegerArray[Int64Scalar]):
+ ...
+
+
+class UInt64Array(IntegerArray[UInt64Scalar]):
+ ...
+
+
+class Date32Array(NumericArray[Date32Scalar]):
+ ...
+
+
+class Date64Array(NumericArray[Date64Scalar]):
+ ...
+
+
+class TimestampArray(NumericArray[TimestampScalar[_Unit, _Tz]]):
+ ...
+
+
+class Time32Array(NumericArray[Time32Scalar[_Time32Unit]]):
+ ...
+
+
+class Time64Array(NumericArray[Time64Scalar[_Time64Unit]]):
+ ...
+
+
+class DurationArray(NumericArray[DurationScalar[_Unit]]):
+ ...
+
+
+class MonthDayNanoIntervalArray(Array[MonthDayNanoIntervalScalar]):
+ ...
+
+
+class HalfFloatArray(FloatingPointArray[HalfFloatScalar]):
+ ...
+
+
+class FloatArray(FloatingPointArray[FloatScalar]):
+ ...
+
+
+class DoubleArray(FloatingPointArray[DoubleScalar]):
+ ...
+
+
+class FixedSizeBinaryArray(Array[FixedSizeBinaryScalar]):
+ ...
+
+
+class Decimal32Array(FixedSizeBinaryArray):
+ ...
+
+
+class Decimal64Array(FixedSizeBinaryArray):
+ ...
+
+
+class Decimal128Array(FixedSizeBinaryArray):
+ ...
+
+
+class Decimal256Array(FixedSizeBinaryArray):
+ ...
+
+
+class BaseListArray(Array[_ScalarT]):
+ def flatten(self, recursive: bool = False) -> Array: ...
+
+ def value_parent_indices(self) -> Int64Array: ...
+
+ def value_lengths(self) -> Int32Array: ...
+
+
+class ListArray(BaseListArray[_ScalarT]):
+ @classmethod
+ def from_arrays(
+ cls,
+ offsets: Int32Array | list[int] | list[int | None],
+ values: Array[Scalar[_DataTypeT]] | list[int] | list[float] | list[str]
+ | list[bytes] | list,
+ *,
+ type: _DataTypeT | None = None,
+ pool: MemoryPool | None = None,
+ mask: Mask | None = None,
+ ) -> (ListArray[ListScalar[
+ _DataTypeT | Int64Type | Float64Type | StringType | BinaryType
+ ]] | ListArray): ...
+
+ @property
+ def values(self) -> Array: ...
+
+ @property
+ def offsets(self) -> Int32Array: ...
+
+
+class LargeListArray(BaseListArray[LargeListScalar[_DataTypeT]]):
+ @classmethod
+ def from_arrays(
+ cls,
+ offsets: Int64Array | list[int] | list[int | None],
+ values: Array[Scalar[_DataTypeT]] | Array,
+ *,
+ type: _DataTypeT | None = None,
+ pool: MemoryPool | None = None,
+ mask: Mask | None = None,
+ ) -> LargeListArray[_DataTypeT]: ...
+
+ @property
+ def values(self) -> Array: ...
+
+ @property
+ def offsets(self) -> Int64Array: ...
+
+
+class ListViewArray(BaseListArray[ListViewScalar[_DataTypeT]]):
+ @classmethod
+ def from_arrays(
+ cls,
+ offsets: Int32Array,
+ values: Array[Scalar[_DataTypeT]] | Array,
+ *,
+ type: _DataTypeT | None = None,
+ pool: MemoryPool | None = None,
+ mask: Mask | None = None,
+ ) -> ListViewArray[_DataTypeT]: ...
+
+ @property
+ def values(self) -> Array: ...
+
+ @property
+ def offsets(self) -> Int32Array: ...
+
+ @property
+ def sizes(self) -> Int32Array: ...
+
+
+class LargeListViewArray(BaseListArray[LargeListViewScalar[_DataTypeT]]):
+ @classmethod
+ def from_arrays(
+ cls,
+ offsets: Int64Array,
+ values: Array[Scalar[_DataTypeT]] | Array,
+ *,
+ type: _DataTypeT | None = None,
+ pool: MemoryPool | None = None,
+ mask: Mask | None = None,
+ ) -> LargeListViewArray[_DataTypeT]: ...
+
+ @property
+ def values(self) -> Array: ...
+
+ @property
+ def offsets(self) -> Int64Array: ...
+
+ @property
+ def sizes(self) -> Int64Array: ...
+
+
+class FixedSizeListArray(BaseListArray[FixedSizeListScalar[_DataTypeT, _Size]]):
+ @classmethod
+ def from_arrays(
+ cls,
+ values: Array[Scalar[_DataTypeT]],
+ list_size: _Size | None = None,
+ *,
+ type: DataType | None = None,
+ mask: Mask | None = None,
+ ) -> FixedSizeListArray[_DataTypeT, _Size | None]: ...
+
+ @property
+ def values(self) -> BaseListArray[ListScalar[_DataTypeT]]: ...
+
+
+_MapKeyT = TypeVar("_MapKeyT", bound=_BasicDataType)
+_MapItemT = TypeVar("_MapItemT", bound=_BasicDataType)
+
+
+class MapArray(BaseListArray[MapScalar[_MapKeyT, _MapItemT]]):
+ @classmethod
+ def from_arrays(
+ cls,
+ offsets: Int64Array | list[int] | None,
+ keys: Array[Scalar[_MapKeyT]] | np.ndarray | list | None = None,
+ items: Array[Scalar[_MapItemT]] | np.ndarray | list | None = None,
+ values: Array | DataType | None = None,
+ *,
+ type: DataType | None = None,
+ pool: MemoryPool | None = None,
+ mask: Mask | None = None,
+ ) -> MapArray[_MapKeyT, _MapItemT]: ...
+
+ @property
+ def keys(self) -> Array: ...
+
+ @property
+ def items(self) -> Array: ...
+
+
+class UnionArray(Array[UnionScalar]):
+ @deprecated("Use fields() instead")
+ def child(self, pos: int) -> Field: ...
+
+ def field(self, pos: int) -> Array: ...
+
+ @property
+ def type_codes(self) -> Int8Array: ...
+
+ @property
+ def offsets(self) -> Int32Array: ...
+
+ @staticmethod
+ def from_dense(
+ types: Int8Array,
+ value_offsets: Int32Array,
+ children: NullableCollection[Array],
+ field_names: list[str] | None = None,
+ type_codes: Int8Array | list[int] | None = None,
+ ) -> UnionArray: ...
+
+ @staticmethod
+ def from_sparse(
+ types: Int8Array,
+ children: NullableCollection[Array],
+ field_names: list[str] | None = None,
+ type_codes: Int8Array | list[int] | None = None,
+ ) -> UnionArray: ...
+
+
+class StringArray(Array[StringScalar]):
+ @staticmethod
+ def from_buffers( # type: ignore[override]
+ length: int,
+ value_offsets: Buffer,
+ data: Buffer,
+ null_bitmap: Buffer | None = None,
+ null_count: int | None = -1,
+ offset: int | None = 0,
+ ) -> StringArray: ...
+
+
+class LargeStringArray(Array[LargeStringScalar]):
+ @staticmethod
+ def from_buffers( # type: ignore[override]
+ length: int,
+ value_offsets: Buffer,
+ data: Buffer,
+ null_bitmap: Buffer | None = None,
+ null_count: int | None = -1,
+ offset: int | None = 0,
+ ) -> LargeStringArray: ...
+
+
+class StringViewArray(Array[StringViewScalar]):
+ ...
+
+
+class BinaryArray(Array[BinaryScalar]):
+ @property
+ def total_values_length(self) -> int: ...
+
+
+class LargeBinaryArray(Array[LargeBinaryScalar]):
+ @property
+ def total_values_length(self) -> int: ...
+
+
+class BinaryViewArray(Array[BinaryViewScalar]):
+ ...
+
+
+class DictionaryArray(Array[DictionaryScalar[_IndexT, _BasicValueT]]):
+ def dictionary_encode(self) -> Self: ... # type: ignore[override]
+ def dictionary_decode(self) -> Array[Scalar[_BasicValueT]]: ...
+
+ @property
+ def indices(self) -> Array[Scalar[_IndexT]]: ...
+ @property
+ def dictionary(self) -> Array[Scalar[_BasicValueT]]: ...
+
+ @staticmethod
+ def from_buffers( # type: ignore[override]
+ type: _BasicValueT,
+ length: int,
+ buffers: list[Buffer],
+ dictionary: Array | np.ndarray | pd.Series,
+ null_count: int = -1,
+ offset: int = 0,
+ ) -> DictionaryArray[Any, _BasicValueT]: ...
+
+ @staticmethod
+ def from_arrays(
+ indices: Indices | Sequence[int | None],
+ dictionary: Array | np.ndarray | pd.Series | list[Any],
+ mask: np.ndarray | pd.Series | BooleanArray | None = None,
+ ordered: bool = False,
+ from_pandas: bool = False,
+ safe: bool = True,
+ memory_pool: MemoryPool | None = None,
+ ) -> DictionaryArray: ...
+
+
+class StructArray(Array[StructScalar]):
+ def field(self, index: int | str) -> Array: ...
+
+ def flatten(self, memory_pool: MemoryPool | None = None) -> list[Array]: ...
+
+ @staticmethod
+ def from_arrays(
+ arrays: Iterable[Array | np.ndarray | list],
+ names: Sequence[str] | list[Field] | None = None,
+ fields: list[Field] | None = None,
+ mask=None,
+ memory_pool: MemoryPool | None = None,
+ type: StructType | None = None,
+ ) -> StructArray: ...
+
+ def sort(self, order: Order = "ascending", by: str |
+ None = None, **kwargs) -> StructArray: ...
+
+
+class RunEndEncodedArray(Array[RunEndEncodedScalar[_RunEndType, _BasicValueT]]):
+ @staticmethod
+ def from_arrays(
+ run_ends: Int16Array | Int32Array | Int64Array | list[int],
+ values: Array | list[Any], type: DataType | None = None,
+ ) -> RunEndEncodedArray[Any, _BasicValueT]: ...
+
+ @staticmethod
+ def from_buffers( # type: ignore[override]
+ type: DataType,
+ length: int,
+ buffers: list[Buffer] | list[None],
+ null_count: int = -1,
+ offset: int = 0,
+ children: tuple[Array, Array] | list[list[int]] | None = None,
+ ) -> RunEndEncodedArray[Any, _BasicValueT]: ...
+
+ @property
+ def run_ends(self) -> Array[Scalar[_RunEndType]]: ...
+
+ @property
+ def values(self) -> Array[Scalar[_BasicValueT]]: ...
+
+ def find_physical_offset(self) -> int: ...
+
+ def find_physical_length(self) -> int: ...
+
+
+_ArrayT = TypeVar("_ArrayT", bound=Array)
+
+
+class ExtensionArray(Array[ExtensionScalar], Generic[_ArrayT]):
+ @property
+ def storage(self) -> Any: ...
+
+ @staticmethod
+ def from_storage(typ: BaseExtensionType,
+ storage: _ArrayT) -> ExtensionArray[_ArrayT]: ...
+
+
+class JsonArray(ExtensionArray[_ArrayT]):
+ ...
+
+
+class UuidArray(ExtensionArray[_ArrayT]):
+ ...
+
+
+class FixedShapeTensorArray(ExtensionArray[_ArrayT]):
+ def to_numpy_ndarray(self) -> np.ndarray: ...
+
+ def to_tensor(self) -> Tensor: ...
+
+ @classmethod
+ def from_numpy_ndarray(
+ cls, obj: np.ndarray,
+ dim_names: list[str] | tuple[str, ...] | None = None
+ ) -> Self: ...
+
+
+class OpaqueArray(ExtensionArray[_ArrayT]):
+ ...
+
+
+class Bool8Array(ExtensionArray):
+ def to_numpy(self, zero_copy_only: bool = ...,
+ writable: bool = ...) -> np.ndarray: ...
+
+ @classmethod
+ def from_storage(cls, storage: Int8Array) -> Self: ... # type: ignore[override]
+
+ @classmethod
+ def from_numpy(cls, obj: np.ndarray) -> Self: ...
+
+
+def concat_arrays(arrays: Iterable[_ArrayT],
+ memory_pool: MemoryPool | None = None) -> _ArrayT: ...
+
+
+def _empty_array(type: _DataTypeT) -> Array[Scalar[_DataTypeT]]: ...
+
+
+__all__ = [
+ "array",
+ "asarray",
+ "nulls",
+ "repeat",
+ "infer_type",
+ "_PandasConvertible",
+ "Array",
+ "NullArray",
+ "BooleanArray",
+ "NumericArray",
+ "IntegerArray",
+ "FloatingPointArray",
+ "Int8Array",
+ "UInt8Array",
+ "Int16Array",
+ "UInt16Array",
+ "Int32Array",
+ "UInt32Array",
+ "Int64Array",
+ "UInt64Array",
+ "Date32Array",
+ "Date64Array",
+ "TimestampArray",
+ "Time32Array",
+ "Time64Array",
+ "DurationArray",
+ "MonthDayNanoIntervalArray",
+ "HalfFloatArray",
+ "FloatArray",
+ "DoubleArray",
+ "FixedSizeBinaryArray",
+ "Decimal32Array",
+ "Decimal64Array",
+ "Decimal128Array",
+ "Decimal256Array",
+ "BaseListArray",
+ "ListArray",
+ "LargeListArray",
+ "ListViewArray",
+ "LargeListViewArray",
+ "FixedSizeListArray",
+ "MapArray",
+ "UnionArray",
+ "StringArray",
+ "LargeStringArray",
+ "StringViewArray",
+ "BinaryArray",
+ "LargeBinaryArray",
+ "BinaryViewArray",
+ "DictionaryArray",
+ "StructArray",
+ "RunEndEncodedArray",
+ "ExtensionArray",
+ "Bool8Array",
+ "UuidArray",
+ "JsonArray",
+ "OpaqueArray",
+ "FixedShapeTensorArray",
+ "concat_arrays",
+ "_empty_array",
+ "_CastAs",
+]
diff --git a/python/pyarrow-stubs/pyarrow/builder.pyi b/python/pyarrow-stubs/pyarrow/builder.pyi
new file mode 100644
index 00000000000..9001d9835b6
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/builder.pyi
@@ -0,0 +1,51 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from collections.abc import Iterable
+
+from pyarrow.lib import MemoryPool, _Weakrefable
+
+from .array import StringArray, StringViewArray
+
+
+class StringBuilder(_Weakrefable):
+ def __init__(self, memory_pool: MemoryPool | None = None) -> None: ...
+ def append(self, value: str | bytes | float | None) -> None: ...
+
+ def append_values(self, values: Iterable[str | bytes | float | None]) -> None: ...
+
+ def finish(self) -> StringArray: ...
+
+ @property
+ def null_count(self) -> int: ...
+ def __len__(self) -> int: ...
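+
+
+# Usage sketch (editor's illustration, based only on the annotations above):
+# builder = StringBuilder()
+# builder.append("a")
+# builder.append_values(["b", None])
+# arr = builder.finish() # expected to type-check as StringArray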
+
+
+class StringViewBuilder(_Weakrefable):
+ def __init__(self, memory_pool: MemoryPool | None = None) -> None: ...
+ def append(self, value: str | bytes | float | None) -> None: ...
+
+ def append_values(self, values: Iterable[str | bytes | float | None]) -> None: ...
+
+ def finish(self) -> StringViewArray: ...
+
+ @property
+ def null_count(self) -> int: ...
+ def __len__(self) -> int: ...
+
+
+__all__ = ["StringBuilder", "StringViewBuilder"]
diff --git a/python/pyarrow-stubs/pyarrow/cffi.pyi b/python/pyarrow-stubs/pyarrow/cffi.pyi
new file mode 100644
index 00000000000..e4f077d7155
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/cffi.pyi
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import cffi
+
+c_source: str
+ffi: cffi.FFI
diff --git a/python/pyarrow-stubs/pyarrow/compat.pyi b/python/pyarrow-stubs/pyarrow/compat.pyi
new file mode 100644
index 00000000000..30e3ec13e0d
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/compat.pyi
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+def encode_file_path(path: str | bytes) -> bytes: ...
+def tobytes(o: str | bytes) -> bytes: ...
+def frombytes(o: bytes, *, safe: bool = False) -> str: ...
+
+
+__all__ = ["encode_file_path", "tobytes", "frombytes"]
diff --git a/python/pyarrow-stubs/pyarrow/compute.pyi b/python/pyarrow-stubs/pyarrow/compute.pyi
new file mode 100644
index 00000000000..809bccd1b92
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/compute.pyi
@@ -0,0 +1,1834 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from collections.abc import Callable, Hashable, Iterable, Sequence, Mapping
+from typing import Literal, TypeAlias, TypeVar, Any, ParamSpec
+
+import numpy as np
+
+# Option classes
+from pyarrow._compute import ArraySortOptions as ArraySortOptions
+from pyarrow._compute import AssumeTimezoneOptions as AssumeTimezoneOptions
+from pyarrow._compute import CastOptions as CastOptions
+from pyarrow._compute import CountOptions as CountOptions
+from pyarrow._compute import CumulativeOptions as CumulativeOptions # noqa: F401
+from pyarrow._compute import CumulativeSumOptions as CumulativeSumOptions
+from pyarrow._compute import DayOfWeekOptions as DayOfWeekOptions
+from pyarrow._compute import ( # noqa: F401
+ DictionaryEncodeOptions as DictionaryEncodeOptions)
+from pyarrow._compute import ElementWiseAggregateOptions as ElementWiseAggregateOptions
+
+# Expressions
+from pyarrow._compute import Expression as Expression
+from pyarrow._compute import ExtractRegexOptions as ExtractRegexOptions
+from pyarrow._compute import ( # noqa: F401
+ ExtractRegexSpanOptions as ExtractRegexSpanOptions)
+from pyarrow._compute import FilterOptions as FilterOptions
+from pyarrow._compute import FunctionOptions as FunctionOptions # noqa: F401
+from pyarrow._compute import IndexOptions as IndexOptions # noqa: F401
+from pyarrow._compute import JoinOptions as JoinOptions # noqa: F401
+from pyarrow._compute import ListFlattenOptions as ListFlattenOptions
+from pyarrow._compute import ListSliceOptions as ListSliceOptions
+from pyarrow._compute import MakeStructOptions as MakeStructOptions
+from pyarrow._compute import MapLookupOptions as MapLookupOptions
+from pyarrow._compute import MatchSubstringOptions as MatchSubstringOptions
+from pyarrow._compute import ModeOptions as ModeOptions
+from pyarrow._compute import NullOptions as NullOptions
+from pyarrow._compute import PadOptions as PadOptions
+from pyarrow._compute import PairwiseOptions as PairwiseOptions
+from pyarrow._compute import PartitionNthOptions as PartitionNthOptions
+from pyarrow._compute import PivotWiderOptions as PivotWiderOptions
+from pyarrow._compute import QuantileOptions as QuantileOptions
+from pyarrow._compute import RandomOptions as RandomOptions
+from pyarrow._compute import RankOptions as RankOptions
+from pyarrow._compute import RankQuantileOptions as RankQuantileOptions
+from pyarrow._compute import ReplaceSliceOptions as ReplaceSliceOptions
+from pyarrow._compute import ReplaceSubstringOptions as ReplaceSubstringOptions
+from pyarrow._compute import RoundBinaryOptions as RoundBinaryOptions
+from pyarrow._compute import RoundOptions as RoundOptions
+from pyarrow._compute import RoundTemporalOptions as RoundTemporalOptions
+from pyarrow._compute import RoundToMultipleOptions as RoundToMultipleOptions
+from pyarrow._compute import RunEndEncodeOptions as RunEndEncodeOptions
+from pyarrow._compute import ScalarAggregateOptions as ScalarAggregateOptions
+from pyarrow._compute import SelectKOptions as SelectKOptions
+from pyarrow._compute import SetLookupOptions as SetLookupOptions
+from pyarrow._compute import SkewOptions as SkewOptions
+from pyarrow._compute import SliceOptions as SliceOptions
+from pyarrow._compute import SortOptions as SortOptions
+from pyarrow._compute import SplitOptions as SplitOptions
+from pyarrow._compute import SplitPatternOptions as SplitPatternOptions # noqa: F401
+from pyarrow._compute import StrftimeOptions as StrftimeOptions
+from pyarrow._compute import StrptimeOptions as StrptimeOptions
+from pyarrow._compute import StructFieldOptions as StructFieldOptions
+from pyarrow._compute import TakeOptions as TakeOptions
+from pyarrow._compute import TDigestOptions as TDigestOptions
+from pyarrow._compute import TrimOptions as TrimOptions
+from pyarrow._compute import Utf8NormalizeOptions as Utf8NormalizeOptions
+from pyarrow._compute import VarianceOptions as VarianceOptions
+from pyarrow._compute import WeekOptions as WeekOptions
+from pyarrow._compute import WinsorizeOptions as WinsorizeOptions
+from pyarrow._compute import ZeroFillOptions as ZeroFillOptions
+
+# Functions
+from pyarrow._compute import call_function as call_function # noqa: F401
+from pyarrow._compute import ( # noqa: F401
+ call_tabular_function as call_tabular_function)
+from pyarrow._compute import get_function as get_function # noqa: F401
+from pyarrow._compute import list_functions as list_functions # noqa: F401
+from pyarrow._compute import ( # noqa: F401
+ register_scalar_function as register_scalar_function)
+from pyarrow._compute import ( # noqa: F401
+ register_aggregate_function as register_aggregate_function)
+from pyarrow._compute import ( # noqa: F401
+ register_vector_function as register_vector_function)
+from pyarrow._compute import ( # noqa: F401
+ register_tabular_function as register_tabular_function)
+
+# Function and Kernel classes
+from pyarrow._compute import Function as Function # noqa: F401
+from pyarrow._compute import Kernel as Kernel # noqa: F401
+from pyarrow._compute import ScalarFunction as ScalarFunction # noqa: F401
+from pyarrow._compute import ScalarKernel as ScalarKernel # noqa: F401
+from pyarrow._compute import VectorFunction as VectorFunction # noqa: F401
+from pyarrow._compute import VectorKernel as VectorKernel # noqa: F401
+from pyarrow._compute import ( # noqa: F401
+ ScalarAggregateFunction as ScalarAggregateFunction)
+from pyarrow._compute import ( # noqa: F401
+ ScalarAggregateKernel as ScalarAggregateKernel)
+from pyarrow._compute import ( # noqa: F401
+ HashAggregateFunction as HashAggregateFunction)
+from pyarrow._compute import HashAggregateKernel as HashAggregateKernel # noqa: F401
+
+# Udf
+
+from pyarrow._compute import _Order, _Placement
+from pyarrow._stubs_typing import ArrayLike, ScalarLike, PyScalar, TimeUnit
+from pyarrow._types import _RunEndType
+from . import lib
+
+_P = ParamSpec("_P")
+_R = TypeVar("_R")
+
+
+class _ExprComparable(Expression):
+ def __ge__(self, other: Any) -> Expression: ...
+ def __le__(self, other: Any) -> Expression: ...
+ def __gt__(self, other: Any) -> Expression: ...
+ def __lt__(self, other: Any) -> Expression: ...
+
+
+def field(*name_or_index: str | bytes | tuple[str | int, ...] | int) -> Expression: ...
+
+
+def scalar(value: PyScalar | lib.Scalar[Any] | Mapping) -> Expression: ...
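+
+
+# Editor's illustration (assumed usage, consistent with pyarrow's documented Expression
+# API): `field("cost") > scalar(10.0)` builds an `Expression`, which can then be used,
+# for example, as a dataset filter.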
+
+
+def _clone_signature(f: Callable[_P, _R]) -> Callable[_P, _R]: ...
+
+
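+# Editor's note: `_clone_signature` is a stub-only helper; assignments such as
+# `any = _clone_signature(all)` below reuse the typed signature of an existing
+# function for an alias without restating it.
+
+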
+# ============= compute functions =============
+_DataTypeT = TypeVar("_DataTypeT", bound=lib.DataType)
+_Scalar_CoT = TypeVar("_Scalar_CoT", bound=lib.Scalar, covariant=True)
+_ScalarT = TypeVar("_ScalarT", bound=lib.Scalar)
+_ArrayT = TypeVar("_ArrayT", bound=lib.Array | lib.ChunkedArray)
+_ScalarOrArrayT = TypeVar("_ScalarOrArrayT", bound=lib.Array |
+ lib.Scalar | lib.ChunkedArray)
+ArrayOrChunkedArray: TypeAlias = lib.Array[_Scalar_CoT] | lib.ChunkedArray[_Scalar_CoT]
+ScalarOrArray: TypeAlias = ArrayOrChunkedArray[_Scalar_CoT] | _Scalar_CoT
+
+SignedIntegerScalar: TypeAlias = (
+ lib.Scalar[lib.Int8Type]
+ | lib.Scalar[lib.Int16Type]
+ | lib.Scalar[lib.Int32Type]
+ | lib.Scalar[lib.Int64Type]
+)
+UnsignedIntegerScalar: TypeAlias = (
+ lib.Scalar[lib.UInt8Type]
+ | lib.Scalar[lib.UInt16Type]
+ | lib.Scalar[lib.UInt32Type]
+ | lib.Scalar[lib.UInt64Type]
+)
+IntegerScalar: TypeAlias = SignedIntegerScalar | UnsignedIntegerScalar
+FloatScalar: TypeAlias = (lib.Scalar[lib.Float16Type] | lib.Scalar[lib.Float32Type]
+ | lib.Scalar[lib.Float64Type])
+DecimalScalar: TypeAlias = (
+ lib.Scalar[lib.Decimal32Type]
+ | lib.Scalar[lib.Decimal64Type]
+ | lib.Scalar[lib.Decimal128Type]
+ | lib.Scalar[lib.Decimal256Type]
+)
+NonFloatNumericScalar: TypeAlias = IntegerScalar | DecimalScalar
+NumericScalar: TypeAlias = IntegerScalar | FloatScalar | DecimalScalar
+BinaryScalar: TypeAlias = (
+ lib.Scalar[lib.BinaryType]
+ | lib.Scalar[lib.LargeBinaryType]
+ | lib.Scalar[lib.FixedSizeBinaryType]
+)
+StringScalar: TypeAlias = lib.Scalar[lib.StringType] | lib.Scalar[lib.LargeStringType]
+StringOrBinaryScalar: TypeAlias = StringScalar | BinaryScalar
+_ListScalar: TypeAlias = (
+ lib.ListViewScalar[_DataTypeT] | lib.FixedSizeListScalar[_DataTypeT, Any]
+)
+_LargeListScalar: TypeAlias = (
+ lib.LargeListScalar[_DataTypeT] | lib.LargeListViewScalar[_DataTypeT]
+)
+ListScalar: TypeAlias = (
+ lib.ListScalar[_DataTypeT] | _ListScalar[_DataTypeT] | _LargeListScalar[_DataTypeT]
+)
+TemporalScalar: TypeAlias = (
+ lib.Date32Scalar
+ | lib.Date64Scalar
+ | lib.Time32Scalar[Any]
+ | lib.Time64Scalar[Any]
+ | lib.TimestampScalar[Any]
+ | lib.DurationScalar[Any]
+ | lib.MonthDayNanoIntervalScalar
+)
+NumericOrDurationScalar: TypeAlias = NumericScalar | lib.DurationScalar
+NumericOrTemporalScalar: TypeAlias = NumericScalar | TemporalScalar
+
+_NumericOrTemporalScalarT = TypeVar(
+ "_NumericOrTemporalScalarT", bound=NumericOrTemporalScalar)
+_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar)
+NumericArray: TypeAlias = ArrayOrChunkedArray[_NumericScalarT]
+_NumericArrayT = TypeVar("_NumericArrayT", bound=NumericArray)
+_NumericOrDurationT = TypeVar("_NumericOrDurationT", bound=NumericOrDurationScalar)
+NumericOrDurationArray: TypeAlias = ArrayOrChunkedArray[NumericOrDurationScalar]
+_NumericOrDurationArrayT = TypeVar(
+ "_NumericOrDurationArrayT", bound=NumericOrDurationArray)
+NumericOrTemporalArray: TypeAlias = ArrayOrChunkedArray[_NumericOrTemporalScalarT]
+_NumericOrTemporalArrayT = TypeVar(
+ "_NumericOrTemporalArrayT", bound=NumericOrTemporalArray)
+BooleanArray: TypeAlias = ArrayOrChunkedArray[lib.BooleanScalar]
+_BooleanArrayT = TypeVar("_BooleanArrayT", bound=BooleanArray)
+IntegerArray: TypeAlias = ArrayOrChunkedArray[IntegerScalar]
+_FloatScalarT = TypeVar("_FloatScalarT", bound=FloatScalar)
+FloatArray: TypeAlias = ArrayOrChunkedArray[FloatScalar]
+_FloatArrayT = TypeVar("_FloatArrayT", bound=FloatArray)
+_StringScalarT = TypeVar("_StringScalarT", bound=StringScalar)
+StringArray: TypeAlias = ArrayOrChunkedArray[StringScalar]
+_StringArrayT = TypeVar("_StringArrayT", bound=StringArray)
+_BinaryScalarT = TypeVar("_BinaryScalarT", bound=BinaryScalar)
+BinaryArray: TypeAlias = ArrayOrChunkedArray[BinaryScalar]
+_BinaryArrayT = TypeVar("_BinaryArrayT", bound=BinaryArray)
+_StringOrBinaryScalarT = TypeVar("_StringOrBinaryScalarT", bound=StringOrBinaryScalar)
+StringOrBinaryArray: TypeAlias = StringArray | BinaryArray
+_StringOrBinaryArrayT = TypeVar("_StringOrBinaryArrayT", bound=StringOrBinaryArray)
+_TemporalScalarT = TypeVar("_TemporalScalarT", bound=TemporalScalar)
+TemporalArray: TypeAlias = ArrayOrChunkedArray[TemporalScalar]
+_TemporalArrayT = TypeVar("_TemporalArrayT", bound=TemporalArray)
+_ListArray: TypeAlias = ArrayOrChunkedArray[_ListScalar[_DataTypeT]]
+_LargeListArray: TypeAlias = ArrayOrChunkedArray[_LargeListScalar[_DataTypeT]]
+ListArray: TypeAlias = ArrayOrChunkedArray[ListScalar[_DataTypeT]]
+
+# =============================== 1. Aggregation ===============================
+
+
+def array_take(
+ array: _ArrayT | lib.Scalar | lib.Table | Expression,
+ indices: list[int]
+ | list[int | None]
+ | lib.Int16Array
+ | lib.Int32Array
+ | lib.Int64Array
+ | lib.UInt64Array
+ | lib.ChunkedArray[lib.Int16Scalar]
+ | lib.ChunkedArray[lib.Int32Scalar]
+ | lib.ChunkedArray[lib.Int64Scalar]
+ | np.ndarray
+ | Expression,
+ /,
+ *,
+ boundscheck: bool | None = None,
+ options: TakeOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _ArrayT | Expression: ...
+
+
+# ========================= 1.1 functions =========================
+
+
+def all(
+ array: lib.BooleanScalar | BooleanArray,
+ /,
+ *,
+ skip_nulls: bool = True,
+ min_count: int = 1,
+ options: ScalarAggregateOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.BooleanScalar: ...
+
+
+any = _clone_signature(all)
+
+
+def approximate_median(
+ array: NumericScalar | NumericArray,
+ /,
+ *,
+ skip_nulls: bool = True,
+ min_count: int = 1,
+ options: ScalarAggregateOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.DoubleScalar: ...
+
+
+def count(
+ array: lib.Array | lib.ChunkedArray,
+ /,
+ mode: Literal["only_valid", "only_null", "all"] = "only_valid",
+ *,
+ options: CountOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int64Scalar: ...
+
+
+def count_distinct(
+ array: lib.Array | lib.ChunkedArray,
+ /,
+ mode: Literal["only_valid", "only_null", "all"] = "only_valid",
+ *,
+ options: CountOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int64Scalar: ...
+
+
+def first(
+ array: lib.Array[_ScalarT] | lib.ChunkedArray[_ScalarT],
+ /,
+ *,
+ skip_nulls: bool = True,
+ min_count: int = 1,
+ options: ScalarAggregateOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _ScalarT: ...
+
+
+def first_last(
+ array: lib.Array[Any] | lib.ChunkedArray[Any] | list[Any],
+ /,
+ *,
+ skip_nulls: bool = True,
+ min_count: int = 1,
+ options: ScalarAggregateOptions | Mapping[Any, Any] | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.StructScalar: ...
+
+
+def index(
+ data: lib.Array[Any] | lib.ChunkedArray[Any],
+ value: ScalarLike,
+ start: int | None = None,
+ end: int | None = None,
+ *,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int64Scalar: ...
+
+
+last = _clone_signature(first)
+max = _clone_signature(first)
+min = _clone_signature(first)
+min_max = _clone_signature(first_last)
+
+
+def mean(
+ array: FloatScalar | FloatArray
+ | lib.NumericArray[lib.Scalar[Any]]
+ | lib.ChunkedArray[lib.Scalar[Any]]
+ | lib.Scalar[Any],
+ /,
+ *,
+ skip_nulls: bool = True,
+ min_count: int = 1,
+ options: ScalarAggregateOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.Scalar[Any]: ...
+
+
+def mode(
+ array: NumericScalar | NumericArray,
+ /,
+ n: int = 1,
+ *,
+ skip_nulls: bool = True,
+ min_count: int = 0,
+ options: ModeOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.StructArray: ...
+
+
+def product(
+ array: _ScalarT | lib.NumericArray[_ScalarT],
+ /,
+ *,
+ skip_nulls: bool = True,
+ min_count: int = 1,
+ options: ScalarAggregateOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _ScalarT: ...
+
+
+def quantile(
+ array: NumericScalar | NumericArray,
+ /,
+ q: float | Sequence[float] = 0.5,
+ *,
+ interpolation: Literal["linear", "lower",
+ "higher", "nearest", "midpoint"] = "linear",
+ skip_nulls: bool = True,
+ min_count: int = 0,
+ options: QuantileOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.DoubleArray: ...
+
+
+def stddev(
+ array: NumericScalar | NumericArray,
+ /,
+ *,
+ ddof: int = 0,
+ skip_nulls: bool = True,
+ min_count: int = 0,
+ options: VarianceOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.DoubleScalar: ...
+
+
+def sum(
+ array: _NumericScalarT | NumericArray[_NumericScalarT] | lib.Expression,
+ /,
+ *,
+ skip_nulls: bool = True,
+ min_count: int = 1,
+ options: ScalarAggregateOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _NumericScalarT | lib.Expression: ...
+
+
+def tdigest(
+ array: NumericScalar | NumericArray,
+ /,
+ q: float | Sequence[float] = 0.5,
+ *,
+ delta: int = 100,
+ buffer_size: int = 500,
+ skip_nulls: bool = True,
+ min_count: int = 0,
+ options: TDigestOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.DoubleArray: ...
+
+
+def variance(
+ array: NumericScalar | NumericArray | ArrayLike,
+ /,
+ *,
+ ddof: int = 0,
+ skip_nulls: bool = True,
+ min_count: int = 0,
+ options: VarianceOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.DoubleScalar: ...
+
+
+def winsorize(
+ array: _NumericArrayT,
+ /,
+ lower_limit: float = 0.0,
+ upper_limit: float = 1.0,
+ *,
+ options: WinsorizeOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _NumericArrayT: ...
+
+
+def skew(
+ array: NumericScalar | NumericArray | ArrayLike,
+ /,
+ *,
+ skip_nulls: bool = True,
+ biased: bool = True,
+ min_count: int = 0,
+ options: SkewOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.DoubleScalar: ...
+
+
+def kurtosis(
+ array: NumericScalar | NumericArray | ArrayLike,
+ /,
+ *,
+ skip_nulls: bool = True,
+ biased: bool = True,
+ min_count: int = 0,
+ options: SkewOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.DoubleScalar: ...
+
+
+def top_k_unstable(
+ values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table,
+ k: int,
+ sort_keys: list | None = None,
+ *,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.Array: ...
+
+
+def bottom_k_unstable(
+ values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table,
+ k: int,
+ sort_keys: list | None = None,
+ *,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.Array: ...
+
+
+# ========================= 2. Element-wise (“scalar”) functions =========
+
+# ========================= 2.1 Arithmetic =========================
+def abs(x: _NumericOrDurationT | _NumericOrDurationArrayT | Expression, /, *,
+ memory_pool: lib.MemoryPool | None = None) -> (
+ _NumericOrDurationT | _NumericOrDurationArrayT | Expression): ...
+
+
+abs_checked = _clone_signature(abs)
+
+
+def add(
+ x: (_NumericOrTemporalScalarT | NumericOrTemporalScalar | _NumericOrTemporalArrayT
+ | ArrayLike | int | Expression),
+ y: (_NumericOrTemporalScalarT | NumericOrTemporalScalar | _NumericOrTemporalArrayT
+ | ArrayLike | int | Expression),
+ /,
+ *,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _NumericOrTemporalScalarT | _NumericOrTemporalArrayT | Expression: ...
+
+
+add_checked = _clone_signature(add)
+
+
+def divide(
+ x: (_NumericOrTemporalScalarT | NumericOrTemporalScalar | _NumericOrTemporalArrayT
+ | Expression),
+ y: (_NumericOrTemporalScalarT | NumericOrTemporalScalar | _NumericOrTemporalArrayT
+ | Expression),
+ /,
+ *,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _NumericOrTemporalScalarT | _NumericOrTemporalArrayT | Expression: ...
+
+
+divide_checked = _clone_signature(divide)
+
+
+def exp(
+ exponent: _FloatArrayT | ArrayOrChunkedArray[NonFloatNumericScalar] | _FloatScalarT
+ | NonFloatNumericScalar | lib.DoubleScalar | Expression,
+ /, *, memory_pool: lib.MemoryPool | None = None
+) -> (
+ _FloatArrayT | lib.DoubleArray | _FloatScalarT | lib.DoubleScalar | Expression): ...
+
+
+expm1 = _clone_signature(exp)
+multiply = _clone_signature(add)
+multiply_checked = _clone_signature(add)
+
+
+def negate(
+ x: _NumericOrDurationT | _NumericOrDurationArrayT | Expression, /, *,
+ memory_pool: lib.MemoryPool | None = None) -> (
+ _NumericOrDurationT | _NumericOrDurationArrayT | Expression): ...
+
+
+negate_checked = _clone_signature(negate)
+
+
+def power(
+ base: _NumericScalarT | Expression | _NumericArrayT | NumericScalar,
+ exponent: _NumericScalarT | Expression | _NumericArrayT | NumericScalar,
+ /,
+ *,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _NumericScalarT | _NumericArrayT | Expression: ...
+
+
+power_checked = _clone_signature(power)
+
+
+def sign(
+ x: NumericOrDurationArray | NumericOrDurationScalar | Expression, /, *,
+ memory_pool: lib.MemoryPool | None = None
+) -> (
+ lib.NumericArray[lib.Int8Scalar]
+ | lib.NumericArray[lib.FloatScalar]
+ | lib.NumericArray[lib.DoubleScalar]
+ | lib.Int8Scalar | lib.FloatScalar | lib.DoubleScalar | Expression
+): ...
+
+
+def sqrt(
+ x: NumericArray | NumericScalar | Expression, /, *,
+ memory_pool: lib.MemoryPool | None = None) -> (
+ FloatArray | FloatScalar | Expression): ...
+
+
+sqrt_checked = _clone_signature(sqrt)
+
+subtract = _clone_signature(add)
+subtract_checked = _clone_signature(add)
+
+# ========================= 2.1 Bit-wise functions =========================
+
+
+def bit_wise_and(
+ x: _NumericScalarT | _NumericArrayT | NumericScalar | Expression
+ | ArrayOrChunkedArray[NumericScalar],
+ y: _NumericScalarT | _NumericArrayT | NumericScalar | Expression
+ | ArrayOrChunkedArray[NumericScalar],
+ /, *, memory_pool: lib.MemoryPool | None = None
+) -> _NumericScalarT | _NumericArrayT | Expression: ...
+
+
+def bit_wise_not(
+ x: _NumericScalarT | _NumericArrayT | Expression, /, *,
+ memory_pool: lib.MemoryPool | None = None
+) -> _NumericScalarT | _NumericArrayT | Expression: ...
+
+
+bit_wise_or = _clone_signature(bit_wise_and)
+bit_wise_xor = _clone_signature(bit_wise_and)
+shift_left = _clone_signature(bit_wise_and)
+shift_left_checked = _clone_signature(bit_wise_and)
+shift_right = _clone_signature(bit_wise_and)
+shift_right_checked = _clone_signature(bit_wise_and)
+
+# ========================= 2.2 Rounding functions =========================
+
+
+def ceil(
+ x: _FloatScalarT | _FloatArrayT | Expression, /, *, memory_pool: lib.MemoryPool |
+ None = None) -> _FloatScalarT | _FloatArrayT | Expression: ...
+
+
+floor = _clone_signature(ceil)
+
+
+def round(
+ x: _NumericScalarT | _NumericArrayT | Expression | list,
+ /,
+ ndigits: int = 0,
+ round_mode: Literal[
+ "down",
+ "up",
+ "towards_zero",
+ "towards_infinity",
+ "half_down",
+ "half_up",
+ "half_towards_zero",
+ "half_towards_infinity",
+ "half_to_even",
+ "half_to_odd",
+ ] = "half_to_even",
+ *,
+ options: RoundOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _NumericScalarT | _NumericArrayT | Expression: ...
+
+
+def round_to_multiple(
+ x: _NumericScalarT | _NumericArrayT | list | Expression,
+ /,
+ multiple: int | float | NumericScalar = 1.0,
+ round_mode: Literal[
+ "down",
+ "up",
+ "towards_zero",
+ "towards_infinity",
+ "half_down",
+ "half_up",
+ "half_towards_zero",
+ "half_towards_infinity",
+ "half_to_even",
+ "half_to_odd",
+ ] = "half_to_even",
+ *,
+ options: RoundToMultipleOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _NumericScalarT | _NumericArrayT | Expression: ...
+
+
+def round_binary(
+ x: _NumericScalarT | _NumericArrayT | float | list | Expression,
+ s: lib.Int8Scalar
+ | lib.Int16Scalar
+ | lib.Int32Scalar
+ | lib.Int64Scalar
+ | lib.Scalar
+ | Iterable
+ | float
+ | Expression,
+ /,
+ round_mode: Literal[
+ "down",
+ "up",
+ "towards_zero",
+ "towards_infinity",
+ "half_down",
+ "half_up",
+ "half_towards_zero",
+ "half_towards_infinity",
+ "half_to_even",
+ "half_to_odd",
+ ] = "half_to_even",
+ *,
+ options: RoundBinaryOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> (
+ _NumericScalarT | lib.NumericArray[_NumericScalarT] | _NumericArrayT
+ | Expression): ...
+
+
+trunc = _clone_signature(ceil)
+
+# ========================= 2.3 Logarithmic functions =========================
+
+
+def ln(
+ x: FloatScalar | FloatArray | Expression, /, *,
+ memory_pool: lib.MemoryPool | None = None
+) -> (
+ lib.FloatScalar | lib.DoubleScalar | lib.NumericArray[lib.FloatScalar]
+ | lib.NumericArray[lib.DoubleScalar] | Expression): ...
+
+
+ln_checked = _clone_signature(ln)
+log10 = _clone_signature(ln)
+log10_checked = _clone_signature(ln)
+log1p = _clone_signature(ln)
+log1p_checked = _clone_signature(ln)
+log2 = _clone_signature(ln)
+log2_checked = _clone_signature(ln)
+
+
+def logb(
+ x: FloatScalar | FloatArray | Expression | Any,
+ b: FloatScalar | FloatArray | Expression | Any,
+ /, *, memory_pool: lib.MemoryPool | None = None
+) -> (
+ lib.FloatScalar | lib.DoubleScalar | lib.NumericArray[lib.FloatScalar]
+ | lib.NumericArray[lib.DoubleScalar] | Expression | Any): ...
+
+
+logb_checked = _clone_signature(logb)
+
+# ========================= 2.4 Trigonometric functions =========================
+acos = _clone_signature(ln)
+acos_checked = _clone_signature(ln)
+acosh = _clone_signature(ln)
+acosh_checked = _clone_signature(ln)
+asin = _clone_signature(ln)
+asin_checked = _clone_signature(ln)
+asinh = _clone_signature(ln)
+atan = _clone_signature(ln)
+atanh_checked = _clone_signature(ln)
+atanh = _clone_signature(ln)
+cos = _clone_signature(ln)
+cos_checked = _clone_signature(ln)
+cosh = _clone_signature(ln)
+sin = _clone_signature(ln)
+sin_checked = _clone_signature(ln)
+sinh = _clone_signature(ln)
+tan = _clone_signature(ln)
+tan_checked = _clone_signature(ln)
+tanh = _clone_signature(ln)
+
+
+def atan2(
+ y: FloatScalar | FloatArray | Expression | Any,
+ x: FloatScalar | FloatArray | Expression | Any,
+ /, *, memory_pool: lib.MemoryPool | None = None
+) -> (
+ lib.FloatScalar | lib.DoubleScalar | lib.NumericArray[lib.FloatScalar]
+ | lib.NumericArray[lib.DoubleScalar] | Expression): ...
+
+
+# ========================= 2.5 Comparison functions =========================
+def equal(
+ x: lib.Scalar | lib.Array | lib.ChunkedArray | list | Expression | Any,
+ y: lib.Scalar | lib.Array | lib.ChunkedArray | list | Expression | Any,
+ /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.BooleanScalar | lib.BooleanArray | Expression: ...
+
+
+greater = _clone_signature(equal)
+greater_equal = _clone_signature(equal)
+less = _clone_signature(equal)
+less_equal = _clone_signature(equal)
+not_equal = _clone_signature(equal)
+
+
+def max_element_wise(
+ *args: ScalarOrArray[_Scalar_CoT] | Expression | ScalarLike | ArrayLike,
+ skip_nulls: bool = True,
+ options: ElementWiseAggregateOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _Scalar_CoT | Expression | lib.Scalar | lib.Array: ...
+
+
+min_element_wise = _clone_signature(max_element_wise)
+
+# ========================= 2.6 Logical functions =========================
+
+
+def and_(
+ x: lib.BooleanScalar | BooleanArray | Expression | ScalarOrArray[lib.BooleanScalar],
+ y: lib.BooleanScalar | BooleanArray | Expression | ScalarOrArray[lib.BooleanScalar],
+ /, *, memory_pool: lib.MemoryPool | None = None
+) -> (
+ lib.BooleanScalar | lib.BooleanArray | Expression
+ | ScalarOrArray[lib.BooleanScalar]): ...
+
+
+and_kleene = _clone_signature(and_)
+and_not = _clone_signature(and_)
+and_not_kleene = _clone_signature(and_)
+or_ = _clone_signature(and_)
+or_kleene = _clone_signature(and_)
+xor = _clone_signature(and_)
+
+
+def invert(
+ x: lib.BooleanScalar | _BooleanArrayT | Expression, /, *,
+ memory_pool: lib.MemoryPool | None = None
+) -> lib.BooleanScalar | _BooleanArrayT | Expression: ...
+
+
+# ========================= 2.10 String predicates =========================
+def ascii_is_alnum(
+ strings: StringScalar | StringArray | Expression, /, *,
+ memory_pool: lib.MemoryPool | None = None
+) -> lib.BooleanScalar | lib.BooleanArray | Expression: ...
+
+
+ascii_is_alpha = _clone_signature(ascii_is_alnum)
+ascii_is_decimal = _clone_signature(ascii_is_alnum)
+ascii_is_lower = _clone_signature(ascii_is_alnum)
+ascii_is_printable = _clone_signature(ascii_is_alnum)
+ascii_is_space = _clone_signature(ascii_is_alnum)
+ascii_is_upper = _clone_signature(ascii_is_alnum)
+utf8_is_alnum = _clone_signature(ascii_is_alnum)
+utf8_is_alpha = _clone_signature(ascii_is_alnum)
+utf8_is_decimal = _clone_signature(ascii_is_alnum)
+utf8_is_digit = _clone_signature(ascii_is_alnum)
+utf8_is_lower = _clone_signature(ascii_is_alnum)
+utf8_is_numeric = _clone_signature(ascii_is_alnum)
+utf8_is_printable = _clone_signature(ascii_is_alnum)
+utf8_is_space = _clone_signature(ascii_is_alnum)
+utf8_is_upper = _clone_signature(ascii_is_alnum)
+ascii_is_title = _clone_signature(ascii_is_alnum)
+utf8_is_title = _clone_signature(ascii_is_alnum)
+string_is_ascii = _clone_signature(ascii_is_alnum)
+
+# ========================= 2.11 String transforms =========================
+
+
+def ascii_capitalize(
+ strings: _StringScalarT | _StringArrayT | Expression, /, *,
+ memory_pool: lib.MemoryPool | None = None
+) -> _StringScalarT | _StringArrayT | Expression: ...
+
+
+ascii_lower = _clone_signature(ascii_capitalize)
+ascii_reverse = _clone_signature(ascii_capitalize)
+ascii_swapcase = _clone_signature(ascii_capitalize)
+ascii_title = _clone_signature(ascii_capitalize)
+ascii_upper = _clone_signature(ascii_capitalize)
+
+
+def binary_length(
+ strings: ScalarOrArray[StringOrBinaryScalar] | Expression,
+ /,
+ *,
+ memory_pool: lib.MemoryPool | None = None,
+) -> (
+ lib.Int32Scalar | lib.Int64Scalar | lib.Int32Array | lib.Int64Array
+ | Expression
+): ...
+
+
+def binary_repeat(
+ strings: _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression,
+ num_repeats: int | list[int] | list[int | None],
+ /,
+ *,
+ memory_pool: lib.MemoryPool | None = None,
+) -> (
+ _StringOrBinaryScalarT | lib.Array[_StringOrBinaryScalarT] | _StringOrBinaryArrayT
+ | Expression): ...
+
+
+def binary_replace_slice(
+ strings: _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression,
+ /,
+ start: int,
+ stop: int,
+ replacement: str | bytes,
+ *,
+ options: ReplaceSliceOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression: ...
+
+
+def binary_reverse(
+ strings: _BinaryScalarT | _BinaryArrayT | Expression, /, *,
+ memory_pool: lib.MemoryPool | None = None
+) -> _BinaryScalarT | _BinaryArrayT | Expression: ...
+
+
+def replace_substring(
+ strings: _StringScalarT | _StringArrayT | Expression,
+ /,
+ pattern: str | bytes,
+ replacement: str | bytes,
+ *,
+ max_replacements: int | None = None,
+ options: ReplaceSubstringOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _StringScalarT | _StringArrayT | Expression: ...
+
+
+replace_substring_regex = _clone_signature(replace_substring)
+
+
+def utf8_capitalize(
+ strings: _StringScalarT | _StringArrayT | Expression, /, *,
+ memory_pool: lib.MemoryPool | None = None
+) -> _StringScalarT | _StringArrayT | Expression: ...
+
+
+def utf8_length(
+ strings: lib.StringScalar | lib.LargeStringScalar | lib.StringArray
+ | lib.ChunkedArray[lib.StringScalar] | lib.LargeStringArray
+ | lib.ChunkedArray[lib.LargeStringScalar] | Expression,
+ /, *, memory_pool: lib.MemoryPool | None = None
+) -> (
+ lib.Int32Scalar | lib.Int64Scalar | lib.Int32Array | lib.Int64Array
+ | Expression): ...
+
+
+utf8_lower = _clone_signature(utf8_capitalize)
+
+
+def utf8_replace_slice(
+ strings: _StringScalarT | _StringArrayT | Expression,
+ /,
+ start: int,
+ stop: int,
+ replacement: str | bytes,
+ *,
+ options: ReplaceSliceOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _StringScalarT | _StringArrayT | Expression: ...
+
+
+utf8_reverse = _clone_signature(utf8_capitalize)
+utf8_swapcase = _clone_signature(utf8_capitalize)
+utf8_title = _clone_signature(utf8_capitalize)
+utf8_upper = _clone_signature(utf8_capitalize)
+
+# ========================= 2.12 String padding =========================
+
+
+def ascii_center(
+ strings: _StringScalarT | _StringArrayT | Expression,
+ /,
+ width: int | None = None,
+ padding: str = " ",
+ lean_left_on_odd_padding: bool = True,
+ *,
+ options: PadOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _StringScalarT | _StringArrayT | Expression: ...
+
+
+ascii_lpad = _clone_signature(ascii_center)
+ascii_rpad = _clone_signature(ascii_center)
+utf8_center = _clone_signature(ascii_center)
+utf8_lpad = _clone_signature(ascii_center)
+utf8_rpad = _clone_signature(ascii_center)
+
+
+def utf8_zero_fill(
+ strings: _StringScalarT | _StringArrayT | Expression,
+ /,
+ width: int | None = None,
+ padding: str = "0",
+ *,
+ options: ZeroFillOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _StringScalarT | _StringArrayT | Expression: ...
+
+
+utf8_zfill = utf8_zero_fill
+
+# ========================= 2.13 String trimming =========================
+
+
+def ascii_ltrim(
+ strings: _StringScalarT | _StringArrayT | Expression,
+ /,
+ characters: str,
+ *,
+ options: TrimOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _StringScalarT | _StringArrayT | Expression: ...
+
+
+ascii_rtrim = _clone_signature(ascii_ltrim)
+ascii_trim = _clone_signature(ascii_ltrim)
+utf8_ltrim = _clone_signature(ascii_ltrim)
+utf8_rtrim = _clone_signature(ascii_ltrim)
+utf8_trim = _clone_signature(ascii_ltrim)
+
+
+def ascii_ltrim_whitespace(
+ strings: _StringScalarT | _StringArrayT | Expression,
+ /,
+ *,
+ options: TrimOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _StringScalarT | _StringArrayT | Expression: ...
+
+
+ascii_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace)
+ascii_trim_whitespace = _clone_signature(ascii_ltrim_whitespace)
+utf8_ltrim_whitespace = _clone_signature(ascii_ltrim_whitespace)
+utf8_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace)
+utf8_trim_whitespace = _clone_signature(ascii_ltrim_whitespace)
+
+# ========================= 2.14 String splitting =========================
+
+
+def ascii_split_whitespace(
+ strings: _StringScalarT | lib.Array[lib.Scalar[_DataTypeT]] | Expression,
+ /,
+ *,
+ max_splits: int | None = None,
+ reverse: bool = False,
+ options: SplitOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> (
+ lib.ListArray[_StringScalarT] | lib.ListArray[lib.ListScalar[_DataTypeT]]
+ | Expression): ...
+
+
+def split_pattern(
+ strings: _StringOrBinaryScalarT | lib.Array[lib.Scalar[_DataTypeT]] | Expression,
+ /,
+ pattern: str,
+ *,
+ max_splits: int | None = None,
+ reverse: bool = False,
+ options: SplitOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> (
+ lib.ListArray[_StringOrBinaryScalarT] | lib.ListArray[lib.ListScalar[_DataTypeT]]
+ | Expression): ...
+
+
+split_pattern_regex = _clone_signature(split_pattern)
+utf8_split_whitespace = _clone_signature(ascii_split_whitespace)
+
+# ========================= 2.15 String component extraction =========================
+
+
+def extract_regex(
+ strings: StringOrBinaryScalar | StringOrBinaryArray | Expression,
+ /,
+ pattern: str,
+ *,
+ options: ExtractRegexOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.StructScalar | lib.StructArray | Expression: ...
+
+
+extract_regex_span = _clone_signature(extract_regex)
+
+
+# ========================= 2.16 String join =========================
+def binary_join(
+ strings, separator, /, *, memory_pool: lib.MemoryPool | None = None
+) -> StringScalar | StringArray: ...
+
+
+def binary_join_element_wise(
+ *strings: str
+ | bytes
+ | _StringOrBinaryScalarT
+ | _StringOrBinaryArrayT
+ | Expression
+ | list,
+ null_handling: Literal["emit_null", "skip", "replace"] = "emit_null",
+ null_replacement: str = "",
+ options: JoinOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression: ...
+
+
+# ========================= 2.17 String Slicing =========================
+def binary_slice(
+ strings: _BinaryScalarT | _BinaryArrayT | Expression | lib.Scalar,
+ /,
+ start: int,
+ stop: int | None = None,
+ step: int = 1,
+ *,
+ options: SliceOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _BinaryScalarT | _BinaryArrayT | Expression: ...
+
+
+def utf8_slice_codeunits(
+ strings: _StringScalarT | _StringArrayT | Expression,
+ /,
+ start: int,
+ stop: int | None = None,
+ step: int = 1,
+ *,
+ options: SliceOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _StringScalarT | _StringArrayT | Expression: ...
+
+
+def utf8_normalize(
+ strings: _StringScalarT | _StringArrayT | Expression,
+ /,
+ form: Literal["NFC", "NFKC", "NFD", "NFKD"] = "NFC",
+ *,
+ options: Utf8NormalizeOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _StringScalarT | _StringArrayT | Expression: ...
+
+
+# ========================= 2.18 Containment tests =========================
+def count_substring(
+ strings: lib.StringScalar | lib.BinaryScalar | lib.LargeStringScalar
+ | lib.LargeBinaryScalar | lib.StringArray | lib.BinaryArray
+ | lib.ChunkedArray[lib.StringScalar] | lib.ChunkedArray[lib.BinaryScalar]
+ | lib.LargeStringArray | lib.LargeBinaryArray
+ | lib.ChunkedArray[lib.LargeStringScalar] | lib.ChunkedArray[lib.LargeBinaryScalar]
+ | Expression,
+ /,
+ pattern: str,
+ *,
+ ignore_case: bool = False,
+ options: MatchSubstringOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> (
+ lib.Int32Scalar | lib.Int64Scalar | lib.Int32Array | lib.Int64Array
+ | Expression): ...
+
+
+count_substring_regex = _clone_signature(count_substring)
+
+
+def ends_with(
+ strings: StringScalar | BinaryScalar | StringArray | BinaryArray | Expression,
+ /,
+ pattern: str,
+ *,
+ ignore_case: bool = False,
+ options: MatchSubstringOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.BooleanScalar | lib.BooleanArray | Expression: ...
+
+
+find_substring = _clone_signature(count_substring)
+find_substring_regex = _clone_signature(count_substring)
+
+
+def index_in(
+ values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression,
+ /,
+ value_set: lib.Array | lib.ChunkedArray | Expression,
+ *,
+ skip_nulls: bool = False,
+ options: SetLookupOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int32Scalar | lib.Int32Array | Expression: ...
+
+
+def index_in_meta_binary(
+ values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression,
+ value_set: lib.Array | lib.ChunkedArray | Expression,
+ /,
+ *,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int32Scalar | lib.Int32Array | Expression: ...
+
+
+def is_in(
+ values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression,
+ /,
+ value_set: lib.Array | lib.ChunkedArray | Expression,
+ *,
+ skip_nulls: bool = False,
+ options: SetLookupOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.BooleanScalar | lib.BooleanArray | Expression: ...
+
+
+is_in_meta_binary = _clone_signature(index_in_meta_binary)
+match_like = _clone_signature(ends_with)
+match_substring = _clone_signature(ends_with)
+match_substring_regex = _clone_signature(ends_with)
+starts_with = _clone_signature(ends_with)
+
+# ========================= 2.19 Categorizations =========================
+
+
+def is_finite(
+ values: NumericScalar | lib.NullScalar | NumericArray | lib.NullArray | Expression,
+ /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.BooleanScalar | lib.BooleanArray | Expression: ...
+
+
+is_inf = _clone_signature(is_finite)
+is_nan = _clone_signature(is_finite)
+
+
+def is_null(
+ values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression,
+ /,
+ *,
+ nan_is_null: bool = False,
+ options: NullOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.BooleanScalar | lib.BooleanArray | Expression: ...
+
+
+def is_valid(
+ values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression | ArrayLike,
+ /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.BooleanScalar | lib.BooleanArray | Expression: ...
+
+
+true_unless_null = _clone_signature(is_valid)
+
+# ========================= 2.20 Selecting / multiplexing =========================
+
+
+def case_when(
+ cond: lib.StructScalar
+ | lib.StructArray
+ | lib.ChunkedArray[lib.StructScalar]
+ | Expression,
+ /,
+ *cases: _ScalarOrArrayT | ArrayLike, memory_pool: lib.MemoryPool | None = None
+) -> _ScalarOrArrayT | lib.Array | Expression: ...
+
+
+def choose(
+ indices: ArrayLike | ScalarLike,
+ /,
+ *values: ArrayLike | ScalarLike,
+ memory_pool: lib.MemoryPool | None = None,
+) -> ArrayLike | ScalarLike: ...
+
+
+def coalesce(
+ *values: _ScalarOrArrayT | Expression, memory_pool: lib.MemoryPool | None = None
+) -> _ScalarOrArrayT | Expression: ...
+
+
+def fill_null(
+ values: _ScalarOrArrayT | ScalarLike, fill_value: ArrayLike | ScalarLike
+) -> _ScalarOrArrayT | ScalarLike: ...
+
+
+def if_else(
+ cond: ArrayLike | ScalarLike,
+ left: ArrayLike | ScalarLike,
+ right: ArrayLike | ScalarLike,
+ /,
+ *,
+ memory_pool: lib.MemoryPool | None = None,
+) -> ArrayLike | ScalarLike: ...
+
+
+# ========================= 2.21 Structural transforms =========================
+
+def list_value_length(
+ lists: _ListArray[Any] | _LargeListArray[Any] | ListArray[Any] | Expression,
+ /,
+ *,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int32Array | lib.Int64Array | Expression: ...
+
+
+def make_struct(
+ *args: lib.Scalar | lib.Array | lib.ChunkedArray | Expression | ArrayLike,
+ field_names: list[str] | tuple[str, ...] = (),
+ field_nullability: bool | None = None,
+ field_metadata: list[lib.KeyValueMetadata] | None = None,
+ options: MakeStructOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.StructScalar | lib.StructArray | Expression: ...
+
+
+# ========================= 2.22 Conversions =========================
+def ceil_temporal(
+ timestamps: _TemporalScalarT | _TemporalArrayT | Expression,
+ /,
+ multiple: int = 1,
+ unit: Literal[
+ "year",
+ "quarter",
+ "month",
+ "week",
+ "day",
+ "hour",
+ "minute",
+ "second",
+ "millisecond",
+ "microsecond",
+ "nanosecond",
+ ] = "day",
+ *,
+ week_starts_monday: bool = True,
+ ceil_is_strictly_greater: bool = False,
+ calendar_based_origin: bool = False,
+ options: RoundTemporalOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _TemporalScalarT | _TemporalArrayT | Expression: ...
+
+
+floor_temporal = _clone_signature(ceil_temporal)
+round_temporal = _clone_signature(ceil_temporal)
+
+
+def cast(
+ arr: lib.Scalar | lib.Array | lib.ChunkedArray | lib.Table,
+ target_type: _DataTypeT | str | None = None,
+ safe: bool | None = None,
+ options: CastOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> (
+ lib.Scalar[_DataTypeT] | lib.Scalar[Any] | lib.Array[lib.Scalar[_DataTypeT]]
+ | lib.Array[lib.Scalar[Any]] | lib.ChunkedArray[lib.Scalar[_DataTypeT]]
+ | lib.ChunkedArray[lib.Scalar[Any]] | lib.Table
+): ...
+
+
+def strftime(
+ timestamps: TemporalScalar | TemporalArray | Expression,
+ /,
+ format: str = "%Y-%m-%dT%H:%M:%S",
+ locale: str = "C",
+ *,
+ options: StrftimeOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.StringScalar | lib.StringArray | Expression: ...
+
+
+def strptime(
+ strings: StringScalar | StringArray | Expression,
+ /,
+ format: str,
+ unit: TimeUnit,
+ error_is_null: bool = False,
+ *,
+ options: StrptimeOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.TimestampScalar | lib.TimestampArray | Expression: ...
+
+
+# ========================= 2.23 Temporal component extraction =========================
+def day(
+ values: TemporalScalar | TemporalArray | Expression, /, *,
+ memory_pool: lib.MemoryPool | None = None) -> (
+ lib.Int64Scalar | lib.Int64Array | Expression
+): ...
+
+
+def day_of_week(
+ values: TemporalScalar | TemporalArray | Expression,
+ /,
+ *,
+ count_from_zero: bool = True,
+ week_start: int = 1,
+ options: DayOfWeekOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int64Scalar | lib.Int64Array | Expression: ...
+
+
+day_of_year = _clone_signature(day)
+
+
+def hour(
+ values: lib.TimestampScalar[Any] | lib.Time32Scalar[Any] | lib.Time64Scalar[Any]
+ | lib.TimestampArray[Any] | lib.Time32Array[Any] | lib.Time64Array[Any]
+ | lib.ChunkedArray[lib.TimestampScalar[Any]]
+ | lib.ChunkedArray[lib.Time32Scalar[Any]]
+ | lib.ChunkedArray[lib.Time64Scalar[Any]] | Expression,
+ /,
+ *,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int64Scalar | lib.Int64Array | Expression: ...
+
+
+def is_dst(
+ values: lib.TimestampScalar | lib.TimestampArray[Any]
+ | lib.ChunkedArray[lib.TimestampScalar] | Expression,
+ /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.BooleanScalar | lib.BooleanArray | Expression: ...
+
+
+def iso_week(
+ values: lib.TimestampScalar | lib.TimestampArray[Any]
+ | lib.ChunkedArray[lib.TimestampScalar[Any]] | Expression,
+ /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.Int64Scalar | lib.Int64Array | Expression: ...
+
+
+iso_year = _clone_signature(iso_week)
+
+
+def is_leap_year(
+ values: lib.TimestampScalar[Any] | lib.Date32Scalar | lib.Date64Scalar
+ | lib.TimestampArray
+ | lib.Date32Array
+ | lib.Date64Array
+ | lib.ChunkedArray[lib.TimestampScalar]
+ | lib.ChunkedArray[lib.Date32Scalar]
+ | lib.ChunkedArray[lib.Date64Scalar] | Expression,
+ /,
+ *,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.BooleanScalar | lib.BooleanArray | Expression: ...
+
+
+microsecond = _clone_signature(iso_week)
+millisecond = _clone_signature(iso_week)
+minute = _clone_signature(iso_week)
+month = _clone_signature(day_of_week)
+nanosecond = _clone_signature(hour)
+quarter = _clone_signature(day_of_week)
+second = _clone_signature(hour)
+subsecond = _clone_signature(hour)
+us_week = _clone_signature(iso_week)
+us_year = _clone_signature(iso_week)
+year = _clone_signature(iso_week)
+
+
+def week(
+ values: lib.TimestampScalar | lib.TimestampArray
+ | lib.ChunkedArray[lib.TimestampScalar] | Expression,
+ /,
+ *,
+ week_starts_monday: bool = True,
+ count_from_zero: bool = False,
+ first_week_is_fully_in_year: bool = False,
+ options: WeekOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int64Scalar | lib.Int64Array | Expression: ...
+
+
+def year_month_day(
+ values: TemporalScalar | TemporalArray | Expression, /, *,
+ memory_pool: lib.MemoryPool | None = None
+) -> lib.StructScalar | lib.StructArray | Expression: ...
+
+
+iso_calendar = _clone_signature(year_month_day)
+
+
+# ========================= 2.24 Temporal difference =========================
+def day_time_interval_between(start, end, /, *,
+ memory_pool: lib.MemoryPool | None = None): ...
+
+
+def days_between(
+ start, end, /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.Int64Scalar | lib.Int64Array: ...
+
+
+hours_between = _clone_signature(days_between)
+microseconds_between = _clone_signature(days_between)
+milliseconds_between = _clone_signature(days_between)
+minutes_between = _clone_signature(days_between)
+
+
+def month_day_nano_interval_between(
+ start, end, /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.MonthDayNanoIntervalScalar | lib.MonthDayNanoIntervalArray: ...
+
+
+def month_interval_between(start, end, /, *,
+ memory_pool: lib.MemoryPool | None = None): ...
+
+
+nanoseconds_between = _clone_signature(days_between)
+quarters_between = _clone_signature(days_between)
+seconds_between = _clone_signature(days_between)
+
+
+def weeks_between(
+ start,
+ end,
+ /,
+ *,
+ count_from_zero: bool = True,
+ week_start: int = 1,
+ options: DayOfWeekOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int64Scalar | lib.Int64Array: ...
+
+
+years_between = _clone_signature(days_between)
+
+# ========================= 2.25 Timezone handling =========================
+
+
+def assume_timezone(
+ timestamps: lib.TimestampScalar | lib.Scalar[lib.TimestampType] | lib.TimestampArray
+ | lib.ChunkedArray[lib.TimestampScalar] | Expression,
+ /,
+ timezone: str | None = None,
+ *,
+ ambiguous: Literal["raise", "earliest", "latest"] = "raise",
+ nonexistent: Literal["raise", "earliest", "latest"] = "raise",
+ options: AssumeTimezoneOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> (
+ lib.TimestampScalar | lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar]
+ | Expression
+): ...
+
+
+def local_timestamp(
+ timestamps: lib.TimestampScalar | lib.TimestampArray
+ | lib.ChunkedArray[lib.TimestampScalar] | Expression,
+ /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.TimestampScalar | lib.TimestampArray | Expression: ...
+
+
+# ========================= 2.26 Random number generation =========================
+def random(
+ n: int,
+ *,
+ initializer: Hashable = "system",
+ options: RandomOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.DoubleArray: ...
+
+
+# ========================= 3. Array-wise (“vector”) functions =========================
+
+# ========================= 3.1 Cumulative Functions =========================
+def cumulative_sum(
+ values: _NumericArrayT | ArrayLike | Expression,
+ /,
+ start: int | float | lib.Scalar | None = None,
+ *,
+ skip_nulls: bool = False,
+ options: CumulativeSumOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _NumericArrayT | Expression | lib.Array: ...
+
+
+cumulative_sum_checked = _clone_signature(cumulative_sum)
+cumulative_prod = _clone_signature(cumulative_sum)
+cumulative_prod_checked = _clone_signature(cumulative_sum)
+cumulative_max = _clone_signature(cumulative_sum)
+cumulative_min = _clone_signature(cumulative_sum)
+cumulative_mean = _clone_signature(cumulative_sum)
+# ========================= 3.2 Associative transforms =========================
+
+
+def dictionary_encode(
+ array: _ScalarOrArrayT | Expression,
+ /,
+ null_encoding: Literal["mask", "encode"] = "mask",
+ *,
+ options=None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _ScalarOrArrayT | Expression: ...
+
+
+def dictionary_decode(
+ array: _ScalarOrArrayT | Expression,
+ /,
+ *,
+ options=None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _ScalarOrArrayT | Expression: ...
+
+
+def unique(array: _ArrayT | Expression, /, *, memory_pool: lib.MemoryPool |
+ None = None) -> _ArrayT | Expression: ...
+
+
+def value_counts(
+ array: lib.Array | lib.ChunkedArray | Expression, /, *,
+ memory_pool: lib.MemoryPool | None = None
+) -> lib.StructArray | Expression: ...
+
+# ========================= 3.3 Selections =========================
+
+
+def array_filter(
+ array: _ArrayT | Expression,
+ selection_filter: list[bool] | list[bool | None] | BooleanArray,
+ /,
+ null_selection_behavior: Literal["drop", "emit_null"] = "drop",
+ *,
+ options: FilterOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _ArrayT | Expression: ...
+
+
+def drop_null(input: _ArrayT | Expression, /, *, memory_pool: lib.MemoryPool |
+ None = None) -> _ArrayT | Expression: ...
+
+
+filter = array_filter
+take = array_take
+
+# ========================= 3.4 Containment tests =========================
+
+
+def indices_nonzero(
+ values: lib.BooleanArray
+ | lib.NullArray
+ | NumericArray
+ | lib.Decimal128Array
+ | lib.Decimal256Array | Expression,
+ /,
+ *,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.UInt64Array | Expression: ...
+
+
+# ========================= 3.5 Sorts and partitions =========================
+def array_sort_indices(
+ array: lib.Array | lib.ChunkedArray | Expression,
+ /,
+ order: _Order = "ascending",
+ *,
+ null_placement: _Placement = "at_end",
+ options: ArraySortOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.UInt64Array | Expression: ...
+
+
+def partition_nth_indices(
+ array: lib.Array | lib.ChunkedArray | Expression | Iterable,
+ /,
+ pivot: int,
+ *,
+ null_placement: _Placement = "at_end",
+ options: PartitionNthOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.UInt64Array | Expression: ...
+
+
+def pivot_wider(
+ keys: lib.Array | lib.ChunkedArray | Sequence[str],
+ values: lib.Array | lib.ChunkedArray | Sequence[Any],
+ /,
+ key_names: Sequence[str] | None = None,
+ *,
+ unexpected_key_behavior: Literal["ignore", "raise"] = "ignore",
+ options: PivotWiderOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.StructScalar: ...
+
+
+def rank(
+ input: lib.Array | lib.ChunkedArray,
+ /,
+ sort_keys: _Order = "ascending",
+ *,
+ null_placement: _Placement = "at_end",
+ tiebreaker: Literal["min", "max", "first", "dense"] = "first",
+ options: RankOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.UInt64Array: ...
+
+
+def rank_quantile(
+ input: lib.Array | lib.ChunkedArray,
+ /,
+ sort_keys: _Order = "ascending",
+ *,
+ null_placement: _Placement = "at_end",
+ options: RankQuantileOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.DoubleArray: ...
+
+
+def rank_normal(
+ input: lib.Array | lib.ChunkedArray,
+ /,
+ sort_keys: _Order = "ascending",
+ *,
+ null_placement: _Placement = "at_end",
+ options: RankQuantileOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.DoubleArray: ...
+
+
+def select_k_unstable(
+ input: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table | Expression,
+ /,
+ k: int | None = None,
+ sort_keys: Sequence[tuple[str | Expression, str]] | None = None,
+ *,
+ options: SelectKOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.UInt64Array | Expression: ...
+
+
+def sort_indices(
+ input: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table | Expression,
+ /,
+ sort_keys: Sequence[tuple[str | Expression, _Order]] | None = None,
+ *,
+ null_placement: _Placement = "at_end",
+ options: SortOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.UInt64Array | Expression: ...
+
+
+# ========================= 3.6 Structural transforms =========================
+def list_element(
+ lists: lib.Array[ListScalar[_DataTypeT]] | lib.ChunkedArray[ListScalar[_DataTypeT]]
+ | ListScalar[_DataTypeT] | Expression,
+ index: ScalarLike, /, *, memory_pool: lib.MemoryPool | None = None
+) -> (lib.Array[lib.Scalar[_DataTypeT]] | lib.ChunkedArray[lib.Scalar[_DataTypeT]]
+      | lib.Scalar[_DataTypeT] | Expression): ...
+
+
+def list_flatten(
+ lists: ArrayOrChunkedArray[ListScalar[Any]] | Expression,
+ /,
+ recursive: bool = False,
+ *,
+ options: ListFlattenOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.ListArray[Any] | Expression: ...
+
+
+def list_parent_indices(
+ lists: ArrayOrChunkedArray[Any] | Expression, /, *,
+ memory_pool: lib.MemoryPool | None = None
+) -> lib.Int64Array | Expression: ...
+
+
+def list_slice(
+ lists: ArrayOrChunkedArray[Any] | Expression,
+ /,
+ start: int,
+ stop: int | None = None,
+ step: int = 1,
+ return_fixed_size_list: bool | None = None,
+ *,
+ options: ListSliceOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> lib.ListArray[Any] | Expression: ...
+
+
+def map_lookup(
+ container,
+ /,
+ query_key,
+ occurrence: str,
+ *,
+ options: MapLookupOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+): ...
+
+
+def struct_field(
+ values,
+ /,
+ indices,
+ *,
+ options: StructFieldOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+): ...
+
+
+def fill_null_backward(
+ values: _ScalarOrArrayT | ScalarLike | Expression,
+ /,
+ *,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _ScalarOrArrayT | ScalarLike | Expression: ...
+
+
+def fill_null_forward(
+ values: _ScalarOrArrayT | ScalarLike | Expression,
+ /,
+ *,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _ScalarOrArrayT | ScalarLike | Expression: ...
+
+
+def replace_with_mask(
+ values: _ScalarOrArrayT | Expression,
+ mask: list[bool] | list[bool | None] | BooleanArray,
+ replacements,
+ /,
+ *,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _ScalarOrArrayT | Expression: ...
+
+
+# ========================= 3.7 Pairwise functions =========================
+def pairwise_diff(
+ input: _NumericOrTemporalArrayT | Expression,
+ /,
+ period: int = 1,
+ *,
+ options: PairwiseOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> _NumericOrTemporalArrayT | Expression: ...
+
+
+def run_end_encode(
+ input: _NumericOrTemporalArrayT | Expression,
+ /,
+ *,
+ run_end_type: _RunEndType | None = None,
+ options: RunEndEncodeOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None
+) -> _NumericOrTemporalArrayT | Expression: ...
+
+
+def run_end_decode(
+ input: _NumericOrTemporalArrayT | Expression,
+ /,
+ *,
+ memory_pool: lib.MemoryPool | None = None
+) -> _NumericOrTemporalArrayT | Expression: ...
+
+
+pairwise_diff_checked = _clone_signature(pairwise_diff)
diff --git a/python/pyarrow-stubs/pyarrow/config.pyi b/python/pyarrow-stubs/pyarrow/config.pyi
new file mode 100644
index 00000000000..069b70e553a
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/config.pyi
@@ -0,0 +1,72 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from typing import NamedTuple
+
+
+class VersionInfo(NamedTuple):
+ major: int
+ minor: int
+ patch: int
+
+
+class CppBuildInfo(NamedTuple):
+ version: str
+ version_info: VersionInfo
+ so_version: str
+ full_so_version: str
+ compiler_id: str
+ compiler_version: str
+ compiler_flags: str
+ git_id: str
+ git_description: str
+ package_kind: str
+ build_type: str
+
+
+class BuildInfo(NamedTuple):
+ build_type: str
+ cpp_build_info: CppBuildInfo
+
+
+class RuntimeInfo(NamedTuple):
+ simd_level: str
+ detected_simd_level: str
+
+
+build_info: BuildInfo
+cpp_build_info: CppBuildInfo
+cpp_version: str
+cpp_version_info: VersionInfo
+
+
+def runtime_info() -> RuntimeInfo: ...
+def set_timezone_db_path(path: str) -> None: ...
+
+
+__all__ = [
+ "VersionInfo",
+ "BuildInfo",
+ "CppBuildInfo",
+ "RuntimeInfo",
+ "build_info",
+ "cpp_build_info",
+ "cpp_version",
+ "cpp_version_info",
+ "runtime_info",
+ "set_timezone_db_path",
+]
diff --git a/python/pyarrow-stubs/pyarrow/csv.pyi b/python/pyarrow-stubs/pyarrow/csv.pyi
new file mode 100644
index 00000000000..a7abd413aab
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/csv.pyi
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from pyarrow._csv import (
+ ISO8601,
+ ConvertOptions,
+ CSVStreamingReader,
+ CSVWriter,
+ InvalidRow,
+ ParseOptions,
+ ReadOptions,
+ WriteOptions,
+ open_csv,
+ read_csv,
+ write_csv,
+)
+
+__all__ = [
+ "ISO8601",
+ "ConvertOptions",
+ "CSVStreamingReader",
+ "CSVWriter",
+ "InvalidRow",
+ "ParseOptions",
+ "ReadOptions",
+ "WriteOptions",
+ "open_csv",
+ "read_csv",
+ "write_csv",
+]
diff --git a/python/pyarrow-stubs/pyarrow/cuda.pyi b/python/pyarrow-stubs/pyarrow/cuda.pyi
new file mode 100644
index 00000000000..0394965bb73
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/cuda.pyi
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from pyarrow._cuda import (
+ BufferReader,
+ BufferWriter,
+ Context,
+ CudaBuffer,
+ HostBuffer,
+ IpcMemHandle,
+ new_host_buffer,
+ read_message,
+ read_record_batch,
+ serialize_record_batch,
+)
+
+__all__ = [
+ "BufferReader",
+ "BufferWriter",
+ "Context",
+ "CudaBuffer",
+ "HostBuffer",
+ "IpcMemHandle",
+ "new_host_buffer",
+ "read_message",
+ "read_record_batch",
+ "serialize_record_batch",
+]
diff --git a/python/pyarrow-stubs/pyarrow/dataset.pyi b/python/pyarrow-stubs/pyarrow/dataset.pyi
new file mode 100644
index 00000000000..66d86b14a25
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/dataset.pyi
@@ -0,0 +1,199 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from collections.abc import Callable, Iterable, Sequence
+from typing import Literal, TypeAlias, Any
+
+from _typeshed import StrPath
+from pyarrow._dataset import (
+ CsvFileFormat,
+ CsvFragmentScanOptions,
+ Dataset,
+ DatasetFactory,
+ DirectoryPartitioning,
+ FeatherFileFormat,
+ FileFormat,
+ FileFragment,
+ FilenamePartitioning,
+ FileSystemDataset,
+ FileSystemDatasetFactory,
+ FileSystemFactoryOptions,
+ FileWriteOptions,
+ Fragment,
+ FragmentScanOptions,
+ HivePartitioning,
+ InMemoryDataset,
+ IpcFileFormat,
+ IpcFileWriteOptions,
+ JsonFileFormat,
+ JsonFragmentScanOptions,
+ Partitioning,
+ PartitioningFactory,
+ Scanner,
+ TaggedRecordBatch,
+ UnionDataset,
+ UnionDatasetFactory,
+ WrittenFile,
+ get_partition_keys,
+)
+from pyarrow._dataset_orc import OrcFileFormat
+from pyarrow._dataset_parquet import (
+ ParquetDatasetFactory,
+ ParquetFactoryOptions,
+ ParquetFileFormat,
+ ParquetFileFragment,
+ ParquetFileWriteOptions,
+ ParquetFragmentScanOptions,
+ ParquetReadOptions,
+ RowGroupInfo,
+)
+from pyarrow._dataset_parquet_encryption import (
+ ParquetDecryptionConfig,
+ ParquetEncryptionConfig,
+)
+from pyarrow.compute import Expression, field, scalar
+from pyarrow.lib import Array, RecordBatch, RecordBatchReader, Schema, Table
+
+from ._fs import SupportedFileSystem
+
+_orc_available: bool
+_parquet_available: bool
+
+__all__ = [
+ "CsvFileFormat",
+ "CsvFragmentScanOptions",
+ "Dataset",
+ "DatasetFactory",
+ "DirectoryPartitioning",
+ "FeatherFileFormat",
+ "FileFormat",
+ "FileFragment",
+ "FilenamePartitioning",
+ "FileSystemDataset",
+ "FileSystemDatasetFactory",
+ "FileSystemFactoryOptions",
+ "FileWriteOptions",
+ "Fragment",
+ "FragmentScanOptions",
+ "HivePartitioning",
+ "InMemoryDataset",
+ "IpcFileFormat",
+ "IpcFileWriteOptions",
+ "JsonFileFormat",
+ "JsonFragmentScanOptions",
+ "Partitioning",
+ "PartitioningFactory",
+ "Scanner",
+ "TaggedRecordBatch",
+ "UnionDataset",
+ "UnionDatasetFactory",
+ "WrittenFile",
+ "get_partition_keys",
+ # Orc
+ "OrcFileFormat",
+ # Parquet
+ "ParquetDatasetFactory",
+ "ParquetFactoryOptions",
+ "ParquetFileFormat",
+ "ParquetFileFragment",
+ "ParquetFileWriteOptions",
+ "ParquetFragmentScanOptions",
+ "ParquetReadOptions",
+ "RowGroupInfo",
+ # Parquet Encryption
+ "ParquetDecryptionConfig",
+ "ParquetEncryptionConfig",
+ # Compute
+ "Expression",
+ "field",
+ "scalar",
+ # Dataset
+ "partitioning",
+ "parquet_dataset",
+ "write_dataset",
+]
+
+_DatasetFormat: TypeAlias = (
+    Literal["parquet", "ipc", "arrow", "feather", "csv", "json", "orc"] | str
+)
+
+
+def partitioning(
+    schema: Schema | None = None,
+    *,
+    field_names: list[str] | None = None,
+    flavor: Literal["hive"] | None = None,
+ dictionaries: dict[str, Array] | Literal["infer"] | None = None,
+) -> Partitioning | PartitioningFactory: ...
+
+
+def parquet_dataset(
+ metadata_path: StrPath,
+ schema: Schema | None = None,
+ filesystem: SupportedFileSystem | None = None,
+ format: ParquetFileFormat | None = None,
+ partitioning: Partitioning | PartitioningFactory | str | None = None,
+ partition_base_dir: str | None = None,
+) -> FileSystemDataset: ...
+
+
+def dataset(
+ source: StrPath
+ | Sequence[Dataset]
+ | Sequence[StrPath]
+ | Iterable[RecordBatch]
+ | Iterable[Table]
+ | RecordBatchReader
+ | RecordBatch
+ | Table,
+ schema: Schema | None = None,
+ format: FileFormat | _DatasetFormat | None = None,
+ filesystem: SupportedFileSystem | str | None = None,
+ partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None,
+ partition_base_dir: str | None = None,
+ exclude_invalid_files: bool | None = None,
+ ignore_prefixes: list[str] | None = None,
+) -> FileSystemDataset | UnionDataset | InMemoryDataset | Dataset: ...
+
+
+def write_dataset(
+ data: Any | Dataset | Table | RecordBatch | RecordBatchReader | list[Table]
+ | Iterable[RecordBatch] | Scanner,
+ base_dir: StrPath,
+ *,
+ basename_template: str | None = None,
+ format: FileFormat | _DatasetFormat | None = None,
+ partitioning: Partitioning | PartitioningFactory | list[str] | None = None,
+ partitioning_flavor: str | None = None,
+ schema: Schema | None = None,
+ filesystem: SupportedFileSystem | str | None = None,
+ file_options: FileWriteOptions | None = None,
+ use_threads: bool | None = True,
+ max_partitions: int = 1024,
+ max_open_files: int = 1024,
+ max_rows_per_file: int = 0,
+ min_rows_per_group: int = 0,
+ max_rows_per_group: int = 1024 * 1024, # noqa: Y011
+ file_visitor: Callable[[str], None] | None = None,
+ existing_data_behavior:
+ Literal["error", "overwrite_or_ignore", "delete_matching"] = "error",
+ create_dir: bool = True,
+ preserve_order: bool | None = None,
+): ...
+
+
+def _get_partition_keys(partition_expression: Expression) -> dict[str, Any]: ...
diff --git a/python/pyarrow-stubs/pyarrow/device.pyi b/python/pyarrow-stubs/pyarrow/device.pyi
new file mode 100644
index 00000000000..7787ac44deb
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/device.pyi
@@ -0,0 +1,66 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import enum
+
+from pyarrow.lib import _Weakrefable
+
+
+class DeviceAllocationType(enum.Enum):
+ CPU = enum.auto()
+ CUDA = enum.auto()
+ CUDA_HOST = enum.auto()
+ OPENCL = enum.auto()
+ VULKAN = enum.auto()
+ METAL = enum.auto()
+ VPI = enum.auto()
+ ROCM = enum.auto()
+ ROCM_HOST = enum.auto()
+ EXT_DEV = enum.auto()
+ CUDA_MANAGED = enum.auto()
+ ONEAPI = enum.auto()
+ WEBGPU = enum.auto()
+ HEXAGON = enum.auto()
+
+
+class Device(_Weakrefable):
+ @property
+ def type_name(self) -> str: ...
+
+ @property
+ def device_id(self) -> int: ...
+
+ @property
+ def is_cpu(self) -> bool: ...
+
+ @property
+ def device_type(self) -> DeviceAllocationType: ...
+
+
+class MemoryManager(_Weakrefable):
+ @property
+ def device(self) -> Device: ...
+
+ @property
+ def is_cpu(self) -> bool: ...
+
+
+def default_cpu_memory_manager() -> MemoryManager: ...
+
+
+__all__ = ["DeviceAllocationType", "Device",
+ "MemoryManager", "default_cpu_memory_manager"]
diff --git a/python/pyarrow-stubs/pyarrow/error.pyi b/python/pyarrow-stubs/pyarrow/error.pyi
new file mode 100644
index 00000000000..eac936afcb5
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/error.pyi
@@ -0,0 +1,104 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+
+if sys.version_info >= (3, 11):
+ from typing import Self
+else:
+ from typing_extensions import Self
+
+
+class ArrowException(Exception):
+ ...
+
+
+class ArrowInvalid(ValueError, ArrowException):
+ ...
+
+
+class ArrowMemoryError(MemoryError, ArrowException):
+ ...
+
+
+class ArrowKeyError(KeyError, ArrowException):
+ ...
+
+
+class ArrowTypeError(TypeError, ArrowException):
+ ...
+
+
+class ArrowNotImplementedError(NotImplementedError, ArrowException):
+ ...
+
+
+class ArrowCapacityError(ArrowException):
+ ...
+
+
+class ArrowIndexError(IndexError, ArrowException):
+ ...
+
+
+class ArrowSerializationError(ArrowException):
+ ...
+
+
+class ArrowCancelled(ArrowException):
+ signum: int | None
+ def __init__(self, message: str, signum: int | None = None) -> None: ...
+
+
+ArrowIOError = IOError
+
+
+class StopToken:
+ ...
+
+
+def enable_signal_handlers(enable: bool) -> None: ...
+
+
+have_signal_refcycle: bool
+
+
+class SignalStopHandler:
+ def __enter__(self) -> Self: ...
+ def __exit__(self, exc_type, exc_value, exc_tb) -> None: ...
+ def __dealloc__(self) -> None: ...
+ @property
+ def stop_token(self) -> StopToken: ...
+
+
+__all__ = [
+ "ArrowException",
+ "ArrowInvalid",
+ "ArrowMemoryError",
+ "ArrowKeyError",
+ "ArrowTypeError",
+ "ArrowNotImplementedError",
+ "ArrowCapacityError",
+ "ArrowIndexError",
+ "ArrowSerializationError",
+ "ArrowCancelled",
+ "ArrowIOError",
+ "StopToken",
+ "enable_signal_handlers",
+ "have_signal_refcycle",
+ "SignalStopHandler",
+]
diff --git a/python/pyarrow-stubs/pyarrow/feather.pyi b/python/pyarrow-stubs/pyarrow/feather.pyi
new file mode 100644
index 00000000000..cf9d3402091
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/feather.pyi
@@ -0,0 +1,81 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from collections.abc import Iterable
+from typing import IO, Literal
+
+import pandas as pd
+
+from pyarrow import lib
+from pyarrow.lib import Table
+from pyarrow._typing import StrPath
+from ._feather import FeatherError
+
+
+class FeatherDataset:
+ path_or_paths: str | list[str]
+ validate_schema: bool
+
+ def __init__(self, path_or_paths: str |
+ list[str], validate_schema: bool = True) -> None: ...
+
+ def read_table(self, columns: list[str] | None = None) -> Table: ...
+ def validate_schemas(self, piece, table: Table) -> None: ...
+
+ def read_pandas(
+ self, columns: list[str] | None = None, use_threads: bool = True
+ ) -> pd.DataFrame: ...
+
+
+def check_chunked_overflow(name: str, col) -> None: ...
+
+
+def write_feather(
+ df: pd.DataFrame | Table | lib.ChunkedArray,
+ dest: StrPath | IO,
+    compression: Literal["zstd", "lz4", "uncompressed"] | None = None,
+ compression_level: int | None = None,
+ chunksize: int | None = None,
+ version: Literal[1, 2] = 2,
+) -> None: ...
+
+
+def read_feather(
+ source: StrPath | IO | lib.NativeFile,
+ columns: list[str] | None = None,
+ use_threads: bool = True,
+ memory_map: bool = False,
+ **kwargs,
+) -> pd.DataFrame: ...
+
+
+def read_table(
+ source: StrPath | IO | lib.NativeFile,
+ columns: list[str | int] | Iterable[str | int] | None = None,
+ memory_map: bool = False,
+ use_threads: bool = True,
+) -> Table: ...
+
+
+__all__ = [
+ "FeatherError",
+ "FeatherDataset",
+ "check_chunked_overflow",
+ "write_feather",
+ "read_feather",
+ "read_table",
+]
diff --git a/python/pyarrow-stubs/pyarrow/flight.pyi b/python/pyarrow-stubs/pyarrow/flight.pyi
new file mode 100644
index 00000000000..dcc6ee2244b
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/flight.pyi
@@ -0,0 +1,112 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from pyarrow._flight import (
+ Action,
+ ActionType,
+ BasicAuth,
+ CallInfo,
+ CertKeyPair,
+ ClientAuthHandler,
+ ClientMiddleware,
+ ClientMiddlewareFactory,
+ DescriptorType,
+ FlightCallOptions,
+ FlightCancelledError,
+ FlightClient,
+ FlightDataStream,
+ FlightDescriptor,
+ FlightEndpoint,
+ FlightError,
+ FlightInfo,
+ FlightInternalError,
+ FlightMetadataReader,
+ FlightMetadataWriter,
+ FlightMethod,
+ FlightServerBase,
+ FlightServerError,
+ FlightStreamChunk,
+ FlightStreamReader,
+ FlightStreamWriter,
+ FlightTimedOutError,
+ FlightUnauthenticatedError,
+ FlightUnauthorizedError,
+ FlightUnavailableError,
+ FlightWriteSizeExceededError,
+ GeneratorStream,
+ Location,
+ MetadataRecordBatchReader,
+ MetadataRecordBatchWriter,
+ RecordBatchStream,
+ Result,
+ SchemaResult,
+ ServerAuthHandler,
+ ServerCallContext,
+ ServerMiddleware,
+ ServerMiddlewareFactory,
+ Ticket,
+ TracingServerMiddlewareFactory,
+ connect,
+)
+
+__all__ = [
+ "Action",
+ "ActionType",
+ "BasicAuth",
+ "CallInfo",
+ "CertKeyPair",
+ "ClientAuthHandler",
+ "ClientMiddleware",
+ "ClientMiddlewareFactory",
+ "DescriptorType",
+ "FlightCallOptions",
+ "FlightCancelledError",
+ "FlightClient",
+ "FlightDataStream",
+ "FlightDescriptor",
+ "FlightEndpoint",
+ "FlightError",
+ "FlightInfo",
+ "FlightInternalError",
+ "FlightMetadataReader",
+ "FlightMetadataWriter",
+ "FlightMethod",
+ "FlightServerBase",
+ "FlightServerError",
+ "FlightStreamChunk",
+ "FlightStreamReader",
+ "FlightStreamWriter",
+ "FlightTimedOutError",
+ "FlightUnauthenticatedError",
+ "FlightUnauthorizedError",
+ "FlightUnavailableError",
+ "FlightWriteSizeExceededError",
+ "GeneratorStream",
+ "Location",
+ "MetadataRecordBatchReader",
+ "MetadataRecordBatchWriter",
+ "RecordBatchStream",
+ "Result",
+ "SchemaResult",
+ "ServerAuthHandler",
+ "ServerCallContext",
+ "ServerMiddleware",
+ "ServerMiddlewareFactory",
+ "Ticket",
+ "TracingServerMiddlewareFactory",
+ "connect",
+]
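+
+# Illustrative only (not part of the stub contract; the endpoint is a
+# hypothetical placeholder): the re-exported names keep common Flight client
+# code type-checkable, e.g.
+#
+#   from pyarrow import flight
+#   client = flight.connect("grpc://localhost:8815")
+#   info = client.get_flight_info(flight.FlightDescriptor.for_path("example"))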
diff --git a/python/pyarrow-stubs/pyarrow/fs.pyi b/python/pyarrow-stubs/pyarrow/fs.pyi
new file mode 100644
index 00000000000..77bf9193900
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/fs.pyi
@@ -0,0 +1,112 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from pyarrow._fs import (
+ FileSelector,
+ FileType,
+ FileInfo,
+ FileSystem,
+ LocalFileSystem,
+ SubTreeFileSystem,
+ _MockFileSystem,
+ FileSystemHandler,
+ PyFileSystem,
+ SupportedFileSystem,
+)
+from pyarrow._azurefs import AzureFileSystem
+from pyarrow._hdfs import HadoopFileSystem
+from pyarrow._gcsfs import GcsFileSystem
+from pyarrow._s3fs import (
+ AwsDefaultS3RetryStrategy,
+ AwsStandardS3RetryStrategy,
+ S3FileSystem,
+ S3LogLevel,
+ S3RetryStrategy,
+ ensure_s3_initialized,
+ finalize_s3,
+ ensure_s3_finalized,
+ initialize_s3,
+ resolve_s3_region,
+)
+
+FileStats = FileInfo
+
+
+def copy_files(
+ source: str,
+ destination: str,
+ source_filesystem: SupportedFileSystem | None = None,
+ destination_filesystem: SupportedFileSystem | None = None,
+ *,
+ chunk_size: int = 1024 * 1024, # noqa: Y011
+ use_threads: bool = True,
+) -> None: ...
+
+
+def _ensure_filesystem(
+ filesystem: FileSystem | str | object,
+ *,
+ use_mmap: bool = False
+) -> FileSystem: ...
+
+
+def _resolve_filesystem_and_path(
+ path: str | object,
+ filesystem: FileSystem | str | object | None = None,
+ *,
+ memory_map: bool = False
+) -> tuple[FileSystem, str]: ...
+
+
+class FSSpecHandler(FileSystemHandler): # type: ignore[misc] # All abstract methods implemented via fsspec delegation # noqa: E501
+ fs: SupportedFileSystem
+ def __init__(self, fs: SupportedFileSystem) -> None: ...
+
+
+__all__ = [
+ # _fs
+ "FileSelector",
+ "FileType",
+ "FileInfo",
+ "FileSystem",
+ "LocalFileSystem",
+ "SubTreeFileSystem",
+ "_MockFileSystem",
+ "FileSystemHandler",
+ "PyFileSystem",
+ # _azurefs
+ "AzureFileSystem",
+ # _hdfs
+ "HadoopFileSystem",
+ # _gcsfs
+ "GcsFileSystem",
+ # _s3fs
+ "AwsDefaultS3RetryStrategy",
+ "AwsStandardS3RetryStrategy",
+ "S3FileSystem",
+ "S3LogLevel",
+ "S3RetryStrategy",
+ "ensure_s3_initialized",
+ "finalize_s3",
+ "ensure_s3_finalized",
+ "initialize_s3",
+ "resolve_s3_region",
+ # fs
+ "FileStats",
+ "copy_files",
+ "FSSpecHandler",
+]
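+
+# Illustrative only (not part of the stub contract; "/tmp" is a placeholder):
+# typical filesystem usage covered by these annotations, e.g.
+#
+#   from pyarrow import fs
+#   local = fs.LocalFileSystem()
+#   infos = local.get_file_info(fs.FileSelector("/tmp", recursive=True))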
diff --git a/python/pyarrow-stubs/pyarrow/gandiva.pyi b/python/pyarrow-stubs/pyarrow/gandiva.pyi
new file mode 100644
index 00000000000..7e129d3ed1d
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/gandiva.pyi
@@ -0,0 +1,110 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from collections.abc import Iterable
+from typing import Literal
+
+from .lib import Array, DataType, Field, MemoryPool, RecordBatch, Schema, _Weakrefable
+
+
+class Node(_Weakrefable):
+ def return_type(self) -> DataType: ...
+
+
+class Expression(_Weakrefable):
+ def root(self) -> Node: ...
+ def result(self) -> Field: ...
+
+
+class Condition(_Weakrefable):
+ def root(self) -> Node: ...
+ def result(self) -> Field: ...
+
+
+class SelectionVector(_Weakrefable):
+ def to_array(self) -> Array: ...
+
+
+class Projector(_Weakrefable):
+ @property
+ def llvm_ir(self): ...
+
+ def evaluate(
+ self, batch: RecordBatch, selection: SelectionVector | None = None
+ ) -> list[Array]: ...
+
+
+class Filter(_Weakrefable):
+ @property
+ def llvm_ir(self): ...
+
+ def evaluate(
+ self, batch: RecordBatch, pool: MemoryPool, dtype: DataType | str = "int32"
+ ) -> SelectionVector: ...
+
+
+class TreeExprBuilder(_Weakrefable):
+ def make_literal(self, value: float | str | bytes |
+ bool, dtype: DataType | str | None) -> Node: ...
+
+ def make_expression(
+ self, root_node: Node | None, return_field: Field) -> Expression: ...
+
+ def make_function(
+ self, name: str, children: list[Node | None],
+ return_type: DataType) -> Node: ...
+
+ def make_field(self, field: Field | None) -> Node: ...
+
+ def make_if(
+ self, condition: Node, this_node: Node | None,
+ else_node: Node | None, return_type: DataType | None
+ ) -> Node: ...
+ def make_and(self, children: list[Node | None]) -> Node: ...
+ def make_or(self, children: list[Node | None]) -> Node: ...
+ def make_in_expression(self, node: Node | None, values: Iterable,
+ dtype: DataType) -> Node: ...
+
+ def make_condition(self, condition: Node | None) -> Condition: ...
+
+
+class Configuration(_Weakrefable):
+ def __init__(self, optimize: bool = True, dump_ir: bool = False) -> None: ...
+
+
+def make_projector(
+ schema: Schema,
+ children: list[Expression | None],
+ pool: MemoryPool | None = None,
+ selection_mode: Literal["NONE", "UINT16", "UINT32", "UINT64"] = "NONE",
+ configuration: Configuration | None = None,
+) -> Projector: ...
+
+
+def make_filter(
+ schema: Schema, condition: Condition | None,
+ configuration: Configuration | None = None
+) -> Filter: ...
+
+
+class FunctionSignature(_Weakrefable):
+ def return_type(self) -> DataType: ...
+ def param_types(self) -> list[DataType]: ...
+ def name(self) -> str: ...
+
+
+def get_registered_function_signatures() -> list[FunctionSignature]: ...
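+
+
+# Illustrative only (not part of the stub contract; `field`, `out_field`,
+# `schema` and `pool` are placeholders): the builder/projector flow described
+# by the declarations above, e.g.
+#
+#   builder = TreeExprBuilder()
+#   expr = builder.make_expression(builder.make_field(field), out_field)
+#   projector = make_projector(schema, [expr], pool)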
diff --git a/python/pyarrow-stubs/pyarrow/interchange/__init__.pyi b/python/pyarrow-stubs/pyarrow/interchange/__init__.pyi
new file mode 100644
index 00000000000..fd5ae83c569
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/interchange/__init__.pyi
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from .from_dataframe import from_dataframe as from_dataframe
+
+__all__ = ["from_dataframe"]
diff --git a/python/pyarrow-stubs/pyarrow/interchange/buffer.pyi b/python/pyarrow-stubs/pyarrow/interchange/buffer.pyi
new file mode 100644
index 00000000000..e1d8ae949c9
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/interchange/buffer.pyi
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import enum
+
+from pyarrow.lib import Buffer
+
+
+class DlpackDeviceType(enum.IntEnum):
+ CPU = 1
+ CUDA = 2
+ CPU_PINNED = 3
+ OPENCL = 4
+ VULKAN = 7
+ METAL = 8
+ VPI = 9
+ ROCM = 10
+
+
+class _PyArrowBuffer:
+ def __init__(self, x: Buffer, allow_copy: bool = True) -> None: ...
+ @property
+ def bufsize(self) -> int: ...
+ @property
+ def ptr(self) -> int: ...
+ def __dlpack__(self): ...
+ def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: ...
diff --git a/python/pyarrow-stubs/pyarrow/interchange/column.pyi b/python/pyarrow-stubs/pyarrow/interchange/column.pyi
new file mode 100644
index 00000000000..67508ac0689
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/interchange/column.pyi
@@ -0,0 +1,93 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import enum
+
+from collections.abc import Iterable
+from typing import Any, TypeAlias, TypedDict
+
+from pyarrow.lib import Array, ChunkedArray
+
+from .buffer import _PyArrowBuffer
+
+
+class DtypeKind(enum.IntEnum):
+ INT = 0
+ UINT = 1
+ FLOAT = 2
+ BOOL = 20
+ STRING = 21 # UTF-8
+ DATETIME = 22
+ CATEGORICAL = 23
+
+
+Dtype: TypeAlias = tuple[DtypeKind, int, str, str]
+
+
+class ColumnNullType(enum.IntEnum):
+ NON_NULLABLE = 0
+ USE_NAN = 1
+ USE_SENTINEL = 2
+ USE_BITMASK = 3
+ USE_BYTEMASK = 4
+
+
+class ColumnBuffers(TypedDict):
+ data: tuple[_PyArrowBuffer, Dtype]
+ validity: tuple[_PyArrowBuffer, Dtype] | None
+ offsets: tuple[_PyArrowBuffer, Dtype] | None
+
+
+class CategoricalDescription(TypedDict):
+ is_ordered: bool
+ is_dictionary: bool
+ categories: _PyArrowColumn | None
+
+
+class Endianness(enum.Enum):
+ LITTLE = "<"
+ BIG = ">"
+ NATIVE = "="
+ NA = "|"
+
+
+class NoBufferPresent(Exception):
+ ...
+
+
+class _PyArrowColumn:
+ _col: Array | ChunkedArray
+
+ def __init__(self, column: Array | ChunkedArray,
+ allow_copy: bool = True) -> None: ...
+
+ def size(self) -> int: ...
+ @property
+ def offset(self) -> int: ...
+ @property
+ def dtype(self) -> tuple[DtypeKind, int, str, str]: ...
+ @property
+ def describe_categorical(self) -> CategoricalDescription: ...
+ @property
+ def describe_null(self) -> tuple[ColumnNullType, Any]: ...
+ @property
+ def null_count(self) -> int: ...
+ @property
+ def metadata(self) -> dict[str, Any]: ...
+ def num_chunks(self) -> int: ...
+ def get_chunks(self, n_chunks: int | None = None) -> Iterable[_PyArrowColumn]: ...
+ def get_buffers(self) -> ColumnBuffers: ...
diff --git a/python/pyarrow-stubs/pyarrow/interchange/dataframe.pyi b/python/pyarrow-stubs/pyarrow/interchange/dataframe.pyi
new file mode 100644
index 00000000000..419b3e2cdb3
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/interchange/dataframe.pyi
@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+
+if sys.version_info >= (3, 11):
+ from typing import Self
+else:
+ from typing_extensions import Self
+from collections.abc import Iterable, Sequence
+from typing import Any
+
+from pyarrow.interchange.column import _PyArrowColumn
+from pyarrow.lib import RecordBatch, Table
+
+
+class _PyArrowDataFrame:
+ def __init__(
+ self,
+ df: Table | RecordBatch,
+ nan_as_null: bool = False,
+ allow_copy: bool = True) -> None: ...
+
+ def __dataframe__(
+ self, nan_as_null: bool = False, allow_copy: bool = True
+ ) -> _PyArrowDataFrame: ...
+ @property
+ def metadata(self) -> dict[str, Any]: ...
+ def num_columns(self) -> int: ...
+ def num_rows(self) -> int: ...
+ def num_chunks(self) -> int: ...
+ def column_names(self) -> Iterable[str]: ...
+ def get_column(self, i: int) -> _PyArrowColumn: ...
+ def get_column_by_name(self, name: str) -> _PyArrowColumn: ...
+ def get_columns(self) -> Iterable[_PyArrowColumn]: ...
+ def select_columns(self, indices: Sequence[int]) -> Self: ...
+ def select_columns_by_name(self, names: Sequence[str]) -> Self: ...
+ def get_chunks(self, n_chunks: int | None = None) -> Iterable[Self]: ...
diff --git a/python/pyarrow-stubs/pyarrow/interchange/from_dataframe.pyi b/python/pyarrow-stubs/pyarrow/interchange/from_dataframe.pyi
new file mode 100644
index 00000000000..d6ad272dfc6
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/interchange/from_dataframe.pyi
@@ -0,0 +1,92 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from typing import Any, Protocol, TypeAlias
+
+from pyarrow.lib import Array, Buffer, DataType, DictionaryArray, RecordBatch, Table
+
+from .column import (
+ ColumnBuffers,
+ ColumnNullType,
+ Dtype,
+ DtypeKind,
+)
+
+
+class DataFrameObject(Protocol):
+ def __dataframe__(self, nan_as_null: bool = False,
+ allow_copy: bool = True) -> Any: ...
+
+
+ColumnObject: TypeAlias = Any
+
+
+def from_dataframe(df: DataFrameObject, allow_copy: bool = True) -> Table: ...
+
+
+def _from_dataframe(df: DataFrameObject, allow_copy: bool = True) -> Table: ...
+
+
+def protocol_df_chunk_to_pyarrow(
+ df: DataFrameObject, allow_copy: bool = True) -> RecordBatch: ...
+
+
+def column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: ...
+
+
+def bool_column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: ...
+
+
+def categorical_column_to_dictionary(
+ col: ColumnObject, allow_copy: bool = True
+) -> DictionaryArray: ...
+
+
+def parse_datetime_format_str(format_str: str) -> tuple[str, str]: ...
+
+
+def map_date_type(data_type: tuple[DtypeKind, int, str, str]) -> DataType: ...
+
+
+def buffers_to_array(
+ buffers: ColumnBuffers,
+ data_type: tuple[DtypeKind, int, str, str],
+ length: int,
+ describe_null: ColumnNullType,
+ offset: int = 0,
+ allow_copy: bool = True,
+) -> Array: ...
+
+
+def validity_buffer_from_mask(
+ validity_buff: Buffer,
+ validity_dtype: Dtype,
+ describe_null: ColumnNullType,
+ length: int,
+ offset: int = 0,
+ allow_copy: bool = True,
+) -> Buffer: ...
+
+
+def validity_buffer_nan_sentinel(
+ data_pa_buffer: Buffer,
+ data_type: Dtype,
+ describe_null: ColumnNullType,
+ length: int,
+ offset: int = 0,
+ allow_copy: bool = True,
+) -> Buffer: ...
diff --git a/python/pyarrow-stubs/pyarrow/io.pyi b/python/pyarrow-stubs/pyarrow/io.pyi
new file mode 100644
index 00000000000..be6a07d5418
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/io.pyi
@@ -0,0 +1,430 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+
+from collections.abc import Callable
+from io import IOBase
+
+from _typeshed import StrPath
+
+import numpy as np
+
+if sys.version_info >= (3, 11):
+ from typing import Self
+else:
+ from typing_extensions import Self
+if sys.version_info >= (3, 10):
+ from typing import TypeAlias
+else:
+ from typing_extensions import TypeAlias
+
+from typing import Any, Literal, SupportsIndex
+import builtins
+
+from pyarrow._stubs_typing import Compression, SupportPyBuffer
+from pyarrow.lib import MemoryPool, _Weakrefable
+
+from .device import Device, DeviceAllocationType, MemoryManager
+from ._types import KeyValueMetadata
+
+
+def have_libhdfs() -> bool: ...
+
+
+def io_thread_count() -> int: ...
+
+
+def set_io_thread_count(count: int) -> None: ...
+
+
+Mode: TypeAlias = Literal["rb", "wb", "rb+", "ab"]
+
+
+class NativeFile(_Weakrefable):
+ _default_chunk_size: int
+
+ def __enter__(self) -> Self: ...
+ def __exit__(self, *args) -> None: ...
+ @property
+ def mode(self) -> Mode: ...
+
+ def readable(self) -> bool: ...
+ def seekable(self) -> bool: ...
+ def isatty(self) -> bool: ...
+ def fileno(self) -> int: ...
+
+ @property
+ def closed(self) -> bool: ...
+ def close(self) -> None: ...
+ def size(self) -> int: ...
+
+ def metadata(self) -> KeyValueMetadata: ...
+
+ def tell(self) -> int: ...
+
+ def seek(self, position: int, whence: int = 0) -> int: ...
+
+ def flush(self) -> None: ...
+
+ def write(self, data: bytes | SupportPyBuffer) -> int: ...
+
+ def read(self, nbytes: int | None = None) -> bytes: ...
+
+ def get_stream(self, file_offset: int, nbytes: int) -> Self: ...
+
+ def read_at(self, nbytes: int, offset: int) -> bytes: ...
+
+ def read1(self, nbytes: int | None = None) -> bytes: ...
+
+ def readall(self) -> bytes: ...
+ def readinto(self, b: SupportPyBuffer) -> int: ...
+
+ def readline(self, size: int | None = None) -> bytes: ...
+
+ def readlines(self, hint: int | None = None) -> list[bytes]: ...
+
+ def __iter__(self) -> Self: ...
+
+ def __next__(self) -> bytes: ...
+ def read_buffer(self, nbytes: int | None = None) -> Buffer: ...
+
+ def truncate(self, pos: int | None = None) -> int: ...
+
+ def writelines(self, lines: list[bytes]) -> None: ...
+
+ def download(self, stream_or_path: StrPath | IOBase,
+ buffer_size: int | None = None) -> None: ...
+
+ def upload(self, stream: IOBase, buffer_size: int | None = None) -> None: ...
+
+ def writable(self) -> bool: ...
+
+# ----------------------------------------------------------------------
+# Python file-like objects
+
+
+class PythonFile(NativeFile):
+ def __init__(self, handle: IOBase,
+ mode: Literal["r", "w"] | None = None) -> None: ...
+
+
+class MemoryMappedFile(NativeFile):
+ @classmethod
+ def create(cls, path: str, size: float) -> Self: ...
+
+ def _open(self, path: str,
+ mode: Literal["r", "rb", "w", "wb", "r+", "r+b", "rb+"] = "r"): ...
+
+ def resize(self, new_size: int) -> None: ...
+
+
+def memory_map(
+ path: str, mode: Literal["r", "rb", "w", "wb", "r+", "r+b", "rb+"] = "r"
+) -> MemoryMappedFile: ...
+
+
+create_memory_map = MemoryMappedFile.create
+
+
+class OSFile(NativeFile):
+ name: str
+
+ def __init__(
+ self,
+ path: str,
+ mode: Literal["r", "rb", "w", "wb", "a", "ab"] = "r",
+ memory_pool: MemoryPool | None = None,
+ ) -> None: ...
+
+
+class FixedSizeBufferWriter(NativeFile):
+ def __init__(self, buffer: Buffer) -> None: ...
+ def set_memcopy_threads(self, num_threads: int) -> None: ...
+
+ def set_memcopy_blocksize(self, blocksize: int) -> None: ...
+
+ def set_memcopy_threshold(self, threshold: int) -> None: ...
+
+
+# ----------------------------------------------------------------------
+# Arrow buffers
+
+class Buffer(_Weakrefable):
+ def __len__(self) -> int: ...
+
+ def _assert_cpu(self) -> None: ...
+ @property
+ def size(self) -> int: ...
+
+ @property
+ def address(self) -> int: ...
+
+ def hex(self) -> bytes: ...
+
+ @property
+ def is_mutable(self) -> bool: ...
+
+ @property
+ def is_cpu(self) -> bool: ...
+
+ @property
+ def device(self) -> Device: ...
+
+ @property
+ def memory_manager(self) -> MemoryManager: ...
+
+ @property
+ def device_type(self) -> DeviceAllocationType: ...
+
+ @property
+ def parent(self) -> Buffer | None: ...
+
+ def __getitem__(self, key: int | builtins.slice) -> int | Self: ...
+
+ def slice(self, offset: int = 0, length: int | None = None) -> Self: ...
+
+ def equals(self, other: Self) -> bool: ...
+
+ def __buffer__(self, flags: int) -> memoryview: ...
+
+ def __reduce_ex__(self, protocol: SupportsIndex) -> str | tuple[Any, ...]: ...
+ def to_pybytes(self) -> bytes: ...
+
+
+class ResizableBuffer(Buffer):
+ def resize(self, new_size: int, shrink_to_fit: bool = False) -> None: ...
+
+
+def allocate_buffer(
+ size: int,
+ memory_pool: MemoryPool | None = None,
+ resizable: Literal[False] | Literal[True] | None = None # noqa: Y030
+) -> Buffer | ResizableBuffer: ...
+
+
+# ----------------------------------------------------------------------
+# Arrow Stream
+class BufferOutputStream(NativeFile):
+ def __init__(self, memory_pool: MemoryPool | None = None) -> None: ...
+ def getvalue(self) -> Buffer: ...
+
+
+class MockOutputStream(NativeFile):
+ ...
+
+
+class BufferReader(NativeFile):
+ def __init__(self, obj) -> None: ...
+
+
+class CompressedInputStream(NativeFile):
+ def __init__(
+ self,
+ stream: StrPath | NativeFile | IOBase,
+ compression: str | None,
+ ) -> None: ...
+
+
+class CompressedOutputStream(NativeFile):
+ def __init__(
+ self,
+ stream: StrPath | NativeFile | IOBase,
+ compression: str,
+ ) -> None: ...
+
+
+class BufferedInputStream(NativeFile):
+ def __init__(self, stream: NativeFile, buffer_size: int,
+ memory_pool: MemoryPool | None = None) -> None: ...
+
+ def detach(self) -> NativeFile: ...
+
+
+class BufferedOutputStream(NativeFile):
+ def __init__(self, stream: NativeFile, buffer_size: int,
+ memory_pool: MemoryPool | None = None) -> None: ...
+
+ def detach(self) -> NativeFile: ...
+
+
+class TransformInputStream(NativeFile):
+ def __init__(self, stream: NativeFile,
+ transform_func: Callable[[Buffer], Any]) -> None: ...
+
+
+class Transcoder:
+ def __init__(self, decoder, encoder) -> None: ...
+ def __call__(self, buf: Buffer): ...
+
+
+def transcoding_input_stream(
+ stream: NativeFile, src_encoding: str, dest_encoding: str
+) -> TransformInputStream: ...
+
+
+def py_buffer(obj: SupportPyBuffer | np.ndarray) -> Buffer: ...
+
+
+def foreign_buffer(address: int, size: int, base: Any | None = None) -> Buffer: ...
+
+
+def as_buffer(o: Buffer | SupportPyBuffer) -> Buffer: ...
+
+# ---------------------------------------------------------------------
+
+
+class CacheOptions(_Weakrefable):
+ hole_size_limit: int
+ range_size_limit: int
+ lazy: bool
+ prefetch_limit: int
+
+ def __init__(
+ self,
+ *,
+ hole_size_limit: int | None = None,
+ range_size_limit: int | None = None,
+ lazy: bool = True,
+ prefetch_limit: int = 0,
+ ) -> None: ...
+
+ @classmethod
+ def from_network_metrics(
+ cls,
+ time_to_first_byte_millis: int,
+ transfer_bandwidth_mib_per_sec: int,
+ ideal_bandwidth_utilization_frac: float = 0.9,
+ max_ideal_request_size_mib: int = 64,
+ ) -> Self: ...
+
+
+class Codec(_Weakrefable):
+ def __init__(self, compression: Compression | str | None,
+ compression_level: int | None = None) -> None: ...
+
+ @classmethod
+ def detect(cls, path: StrPath) -> Self: ...
+
+ @staticmethod
+ def is_available(compression: Compression | str) -> bool: ...
+
+ @staticmethod
+ def supports_compression_level(compression: Compression) -> int: ...
+
+ @staticmethod
+ def default_compression_level(compression: Compression) -> int: ...
+
+ @staticmethod
+ def minimum_compression_level(compression: Compression) -> int: ...
+
+ @staticmethod
+ def maximum_compression_level(compression: Compression) -> int: ...
+
+ @property
+ def name(self) -> Compression: ...
+
+ @property
+ def compression_level(self) -> int: ...
+
+ def compress(
+ self,
+ buf: Buffer | bytes | SupportPyBuffer,
+ *,
+ asbytes: Literal[False] | Literal[True] | None = None, # noqa: Y030
+ memory_pool: MemoryPool | None = None,
+ ) -> Buffer | bytes: ...
+
+ def decompress(
+ self,
+ buf: Buffer | bytes | SupportPyBuffer,
+ decompressed_size: int | None = None,
+ *,
+ asbytes: Literal[False] | Literal[True] | None = None, # noqa: Y030
+ memory_pool: MemoryPool | None = None,
+ ) -> Buffer | bytes: ...
+
+
+def compress(
+ buf: Buffer | bytes | SupportPyBuffer,
+ codec: Compression = "lz4",
+ *,
+ asbytes: Literal[False] | Literal[True] | None = None, # noqa: Y030
+ memory_pool: MemoryPool | None = None,
+) -> Buffer | bytes: ...
+
+
+def decompress(
+ buf: Buffer | bytes | SupportPyBuffer,
+ decompressed_size: int | None = None,
+ codec: Compression = "lz4",
+ *,
+ asbytes: Literal[False] | Literal[True] | None = None, # noqa: Y030
+ memory_pool: MemoryPool | None = None,
+) -> Buffer | bytes: ...
+
+
+def input_stream(
+ source: StrPath | Buffer | NativeFile | IOBase | SupportPyBuffer,
+ compression:
+ Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] | None = "detect",
+ buffer_size: int | str | None = None,
+) -> BufferReader: ...
+
+
+def output_stream(
+ source: StrPath | Buffer | NativeFile | IOBase | SupportPyBuffer,
+ compression:
+ Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] | None = "detect",
+ buffer_size: int | None = None,
+) -> NativeFile: ...
+
+
+__all__ = [
+ "have_libhdfs",
+ "io_thread_count",
+ "set_io_thread_count",
+ "NativeFile",
+ "PythonFile",
+ "MemoryMappedFile",
+ "memory_map",
+ "create_memory_map",
+ "OSFile",
+ "FixedSizeBufferWriter",
+ "Buffer",
+ "ResizableBuffer",
+ "allocate_buffer",
+ "BufferOutputStream",
+ "MockOutputStream",
+ "BufferReader",
+ "CompressedInputStream",
+ "CompressedOutputStream",
+ "BufferedInputStream",
+ "BufferedOutputStream",
+ "TransformInputStream",
+ "Transcoder",
+ "transcoding_input_stream",
+ "py_buffer",
+ "foreign_buffer",
+ "as_buffer",
+ "CacheOptions",
+ "Codec",
+ "compress",
+ "decompress",
+ "input_stream",
+ "output_stream",
+]
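+
+# Illustrative only (not part of the stub contract): common buffer / stream
+# usage covered by these annotations, e.g.
+#
+#   import pyarrow as pa
+#   sink = pa.BufferOutputStream()
+#   sink.write(b"hello")
+#   buf = sink.getvalue()          # -> Buffer
+#   reader = pa.input_stream(buf)  # readable stream over the buffer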
diff --git a/python/pyarrow-stubs/pyarrow/ipc.pyi b/python/pyarrow-stubs/pyarrow/ipc.pyi
new file mode 100644
index 00000000000..d153ab0f46a
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/ipc.pyi
@@ -0,0 +1,162 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from io import IOBase
+from typing import Any
+
+from _typeshed import StrPath
+import pandas as pd
+import pyarrow.lib as lib
+
+from pyarrow.lib import (
+ Alignment,
+ IpcReadOptions,
+ IpcWriteOptions,
+ Message,
+ MessageReader,
+ MetadataVersion,
+ ReadStats,
+ RecordBatchReader,
+ WriteStats,
+ _ReadPandasMixin,
+ get_record_batch_size,
+ get_tensor_size,
+ read_message,
+ read_record_batch,
+ read_schema,
+ read_tensor,
+ write_tensor,
+)
+
+
+class RecordBatchStreamReader(lib._RecordBatchStreamReader):
+ def __init__(
+ self,
+ source: bytes | lib.Buffer | lib.NativeFile | IOBase,
+ *,
+ options: IpcReadOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+ ) -> None: ...
+
+
+class RecordBatchStreamWriter(lib._RecordBatchStreamWriter):
+ def __init__(
+ self,
+ sink: str | lib.NativeFile | IOBase,
+ schema: lib.Schema,
+ *,
+ use_legacy_format: bool | None = None,
+ options: IpcWriteOptions | None = None,
+ ) -> None: ...
+
+
+class RecordBatchFileReader(lib._RecordBatchFileReader):
+ def __init__(
+ self,
+ source: bytes | lib.Buffer | lib.NativeFile | IOBase,
+ footer_offset: int | None = None,
+ *,
+ options: IpcReadOptions | None = None,
+ memory_pool: lib.MemoryPool | None = None,
+ ) -> None: ...
+
+
+class RecordBatchFileWriter(lib._RecordBatchFileWriter):
+ def __init__(
+ self,
+ sink: str | lib.NativeFile | IOBase,
+ schema: lib.Schema,
+ *,
+ use_legacy_format: bool | None = None,
+ options: IpcWriteOptions | None = None,
+ ) -> None: ...
+
+
+def new_stream(
+ sink: str | lib.NativeFile | IOBase,
+ schema: lib.Schema,
+ *,
+ use_legacy_format: bool | None = None,
+ options: IpcWriteOptions | None = None,
+) -> RecordBatchStreamWriter: ...
+
+
+def open_stream(
+ source: bytes | int | lib.Buffer | lib.NativeFile | IOBase,
+ *,
+ options: Any = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> RecordBatchStreamReader: ...
+
+
+def new_file(
+ sink: str | lib.NativeFile | IOBase,
+ schema: lib.Schema,
+ *,
+ use_legacy_format: bool | None = None,
+ options: IpcWriteOptions | None = None,
+ metadata: lib.KeyValueMetadata | dict[bytes, bytes] | None = None,
+) -> RecordBatchFileWriter: ...
+
+
+def open_file(
+ source: StrPath | bytes | lib.Buffer | lib.NativeFile | IOBase,
+ footer_offset: int | None = None,
+ *,
+ options: Any = None,
+ memory_pool: lib.MemoryPool | None = None,
+) -> RecordBatchFileReader: ...
+
+
+def serialize_pandas(
+ df: pd.DataFrame, *, nthreads: int | None = None, preserve_index: bool | None = None
+) -> lib.Buffer: ...
+
+
+def deserialize_pandas(
+ buf: lib.Buffer, *, use_threads: bool = True) -> pd.DataFrame: ...
+
+
+__all__ = [
+ "Alignment",
+ "IpcReadOptions",
+ "IpcWriteOptions",
+ "Message",
+ "MessageReader",
+ "MetadataVersion",
+ "ReadStats",
+ "RecordBatchReader",
+ "WriteStats",
+ "_ReadPandasMixin",
+ "get_record_batch_size",
+ "get_tensor_size",
+ "read_message",
+ "read_record_batch",
+ "read_schema",
+ "read_tensor",
+ "write_tensor",
+ "RecordBatchStreamReader",
+ "RecordBatchStreamWriter",
+ "RecordBatchFileReader",
+ "RecordBatchFileWriter",
+ "new_stream",
+ "open_stream",
+ "new_file",
+ "open_file",
+ "serialize_pandas",
+ "deserialize_pandas",
+]
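+
+# Illustrative only (not part of the stub contract; `table` is a placeholder):
+# an IPC stream round-trip, e.g.
+#
+#   import pyarrow as pa, pyarrow.ipc as ipc
+#   sink = pa.BufferOutputStream()
+#   with ipc.new_stream(sink, table.schema) as writer:
+#       writer.write_table(table)
+#   reader = ipc.open_stream(sink.getvalue())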
diff --git a/python/pyarrow-stubs/pyarrow/json.pyi b/python/pyarrow-stubs/pyarrow/json.pyi
new file mode 100644
index 00000000000..67768db42e4
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/json.pyi
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from pyarrow._json import ParseOptions, ReadOptions, open_json, read_json
+
+__all__ = ["ParseOptions", "ReadOptions", "read_json", "open_json"]
diff --git a/python/pyarrow-stubs/pyarrow/lib.pyi b/python/pyarrow-stubs/pyarrow/lib.pyi
new file mode 100644
index 00000000000..6bd9b7857bf
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/lib.pyi
@@ -0,0 +1,133 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+from .array import * # noqa: F401, F403
+from .builder import * # noqa: F401, F403
+from .compat import * # noqa: F401, F403
+from .config import * # noqa: F401, F403
+from .device import * # noqa: F401, F403
+from .error import * # noqa: F401, F403
+from .io import * # noqa: F401, F403
+from ._ipc import * # noqa: F401, F403
+from .memory import * # noqa: F401, F403
+from .pandas_shim import * # noqa: F401, F403
+from .scalar import * # noqa: F401, F403
+from .table import * # noqa: F401, F403
+from .tensor import * # noqa: F401, F403
+from ._types import * # noqa: F401, F403
+from .memory import MemoryPool
+from .array import Array
+from ._types import DataType
+
+
+class MonthDayNano(tuple):
+ months: int
+ days: int
+ nanoseconds: int
+
+ def __new__(
+ cls,
+ sequence: tuple[int, int, int] | list[int] = ...,
+ ) -> MonthDayNano: ...
+
+
+def cpu_count() -> int: ...
+
+
+def set_cpu_count(count: int) -> None: ...
+
+
+def is_threading_enabled() -> bool: ...
+
+
+def arange(
+ start: int, stop: int, step: int = 1, *, memory_pool: MemoryPool | None = None
+) -> Array: ...
+
+
+def is_boolean_value(obj: object) -> bool: ...
+
+
+def is_integer_value(obj: object) -> bool: ...
+
+
+def is_float_value(obj: object) -> bool: ...
+
+
+def tzinfo_to_string(tz: object) -> str: ...
+
+
+def string_to_tzinfo(tz: str) -> object: ...
+
+
+def _ndarray_to_arrow_type(values: object, type_: object) -> object: ...
+
+
+def _is_primitive(type_id: int) -> bool: ...
+
+
+def ensure_type(ty: object) -> DataType: ...
+
+
+Type_NA: int
+Type_BOOL: int
+Type_UINT8: int
+Type_INT8: int
+Type_UINT16: int
+Type_INT16: int
+Type_UINT32: int
+Type_INT32: int
+Type_UINT64: int
+Type_INT64: int
+Type_HALF_FLOAT: int
+Type_FLOAT: int
+Type_DOUBLE: int
+Type_DECIMAL32: int
+Type_DECIMAL64: int
+Type_DECIMAL128: int
+Type_DECIMAL256: int
+Type_DATE32: int
+Type_DATE64: int
+Type_TIMESTAMP: int
+Type_TIME32: int
+Type_TIME64: int
+Type_DURATION: int
+Type_INTERVAL_MONTHS: int
+Type_INTERVAL_DAY_TIME: int
+Type_INTERVAL_MONTH_DAY_NANO: int
+Type_BINARY: int
+Type_STRING: int
+Type_LARGE_BINARY: int
+Type_LARGE_STRING: int
+Type_FIXED_SIZE_BINARY: int
+Type_BINARY_VIEW: int
+Type_STRING_VIEW: int
+Type_LIST: int
+Type_LARGE_LIST: int
+Type_LIST_VIEW: int
+Type_LARGE_LIST_VIEW: int
+Type_MAP: int
+Type_FIXED_SIZE_LIST: int
+Type_STRUCT: int
+Type_SPARSE_UNION: int
+Type_DENSE_UNION: int
+Type_DICTIONARY: int
+Type_RUN_END_ENCODED: int
+UnionMode_SPARSE: int
+UnionMode_DENSE: int
diff --git a/python/pyarrow-stubs/pyarrow/memory.pyi b/python/pyarrow-stubs/pyarrow/memory.pyi
new file mode 100644
index 00000000000..f80e01ab21c
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/memory.pyi
@@ -0,0 +1,94 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from pyarrow.lib import _Weakrefable
+
+
+class MemoryPool(_Weakrefable):
+ def release_unused(self) -> None: ...
+
+ def bytes_allocated(self) -> int: ...
+
+ def total_bytes_allocated(self) -> int: ...
+
+ def max_memory(self) -> int | None: ...
+
+ def num_allocations(self) -> int: ...
+
+ def print_stats(self) -> None: ...
+
+ @property
+ def backend_name(self) -> str: ...
+
+
+class LoggingMemoryPool(MemoryPool):
+ ...
+
+
+class ProxyMemoryPool(MemoryPool):
+ ...
+
+
+def default_memory_pool() -> MemoryPool: ...
+
+
+def proxy_memory_pool(parent: MemoryPool) -> ProxyMemoryPool: ...
+
+
+def logging_memory_pool(parent: MemoryPool) -> LoggingMemoryPool: ...
+
+
+def system_memory_pool() -> MemoryPool: ...
+
+
+def jemalloc_memory_pool() -> MemoryPool: ...
+
+
+def mimalloc_memory_pool() -> MemoryPool: ...
+
+
+def set_memory_pool(pool: MemoryPool) -> None: ...
+
+
+def log_memory_allocations(enable: bool = True) -> None: ...
+
+
+def total_allocated_bytes() -> int: ...
+
+
+def jemalloc_set_decay_ms(decay_ms: int) -> None: ...
+
+
+def supported_memory_backends() -> list[str]: ...
+
+
+__all__ = [
+ "MemoryPool",
+ "LoggingMemoryPool",
+ "ProxyMemoryPool",
+ "default_memory_pool",
+ "proxy_memory_pool",
+ "logging_memory_pool",
+ "system_memory_pool",
+ "jemalloc_memory_pool",
+ "mimalloc_memory_pool",
+ "set_memory_pool",
+ "log_memory_allocations",
+ "total_allocated_bytes",
+ "jemalloc_set_decay_ms",
+ "supported_memory_backends",
+]
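+
+# Illustrative only (not part of the stub contract): inspecting the active
+# memory pool, e.g.
+#
+#   import pyarrow as pa
+#   pool = pa.default_memory_pool()
+#   print(pool.backend_name, pool.bytes_allocated())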
diff --git a/python/pyarrow-stubs/pyarrow/orc.pyi b/python/pyarrow-stubs/pyarrow/orc.pyi
new file mode 100644
index 00000000000..f16350d0ffc
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/orc.pyi
@@ -0,0 +1,146 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+
+if sys.version_info >= (3, 11):
+ from typing import Self
+else:
+ from typing_extensions import Self
+from typing import IO, Any, Literal
+
+from _typeshed import StrPath
+
+from . import _orc
+from ._fs import SupportedFileSystem
+from .lib import KeyValueMetadata, NativeFile, RecordBatch, Schema, Table
+
+
+class ORCFile:
+ reader: _orc.ORCReader
+ def __init__(self, source: StrPath | NativeFile | IO) -> None: ...
+ @property
+ def metadata(self) -> KeyValueMetadata: ...
+
+ @property
+ def schema(self) -> Schema: ...
+
+ @property
+ def nrows(self) -> int: ...
+
+ @property
+ def nstripes(self) -> int: ...
+
+ @property
+ def file_version(self) -> str: ...
+
+ @property
+ def software_version(self) -> str: ...
+
+ @property
+ def compression(self) -> Literal["UNCOMPRESSED",
+ "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: ...
+
+ @property
+ def compression_size(self) -> int: ...
+
+ @property
+ def writer(self) -> str: ...
+
+ @property
+ def writer_version(self) -> str: ...
+
+ @property
+ def row_index_stride(self) -> int: ...
+
+ @property
+ def nstripe_statistics(self) -> int: ...
+
+ @property
+ def content_length(self) -> int: ...
+
+ @property
+ def stripe_statistics_length(self) -> int: ...
+
+ @property
+ def file_footer_length(self) -> int: ...
+
+ @property
+ def file_postscript_length(self) -> int: ...
+
+ @property
+ def file_length(self) -> int: ...
+
+ def read_stripe(
+ self, n: int, columns: list[str | int] | None = None
+ ) -> RecordBatch: ...
+
+ def read(self, columns: list[str | int] | None = None) -> Table: ...
+
+
+class ORCWriter:
+ writer: _orc.ORCWriter
+ is_open: bool
+
+ def __init__(
+ self,
+ where: StrPath | NativeFile | IO,
+ *,
+ file_version: Any = "0.12",
+ batch_size: Any = 1024,
+ stripe_size: Any = 64 * 1024 * 1024, # noqa: Y011
+ compression: Any = "UNCOMPRESSED",
+ compression_block_size: Any = 65536,
+ compression_strategy: Any = "SPEED",
+ row_index_stride: Any = 10000,
+ padding_tolerance: Any = 0.0,
+ dictionary_key_size_threshold: Any = 0.0,
+ bloom_filter_columns: Any = None,
+ bloom_filter_fpp: Any = 0.05,
+ ): ...
+ def __enter__(self) -> Self: ...
+ def __exit__(self, *args, **kwargs) -> None: ...
+ def __getattr__(self, name: str) -> Any: ...
+ def write(self, table: Table) -> None: ...
+
+ def close(self) -> None: ...
+
+
+def read_table(
+ source: StrPath | NativeFile | IO,
+ columns: list[str | int] | None = None,
+ filesystem: SupportedFileSystem | str | None = None,
+) -> Table: ...
+
+
+# TODO: should these parameters use more specific types than Any?
+def write_table(
+ table: Table,
+ where: StrPath | NativeFile | IO,
+ *,
+ file_version: Any = "0.12",
+ batch_size: Any = 1024,
+ stripe_size: Any = 64 * 1024 * 1024, # noqa: Y011
+ compression: Any = "UNCOMPRESSED",
+ compression_block_size: Any = 65536,
+ compression_strategy: Any = "SPEED",
+ row_index_stride: Any = 10000,
+ padding_tolerance: Any = 0.0,
+ dictionary_key_size_threshold: Any = 0.0,
+ bloom_filter_columns: Any = None,
+ bloom_filter_fpp: Any = 0.05,
+) -> None: ...
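+
+
+# Illustrative only (not part of the stub contract; `table` and the path are
+# placeholders): an ORC read/write round-trip, e.g.
+#
+#   from pyarrow import orc
+#   orc.write_table(table, "data.orc", compression="ZSTD")
+#   t = orc.read_table("data.orc", columns=["a"])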
diff --git a/python/pyarrow-stubs/pyarrow/pandas_compat.pyi b/python/pyarrow-stubs/pyarrow/pandas_compat.pyi
new file mode 100644
index 00000000000..4e614c58a3f
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/pandas_compat.pyi
@@ -0,0 +1,92 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from typing import Any, TypedDict, TypeVar
+
+import numpy as np
+import pandas as pd
+
+from pandas import DatetimeTZDtype
+
+from .lib import Array, DataType, Schema, Table, _pandas_api
+
+_T = TypeVar("_T")
+
+
+def get_logical_type_map() -> dict[int, str]: ...
+def get_logical_type(arrow_type: DataType) -> str: ...
+def get_numpy_logical_type_map() -> dict[type[np.generic], str]: ...
+def get_logical_type_from_numpy(pandas_collection) -> str: ...
+def get_extension_dtype_info(column) -> tuple[str, dict[str, Any]]: ...
+
+
+class _ColumnMetadata(TypedDict):
+ name: str
+ field_name: str
+ pandas_type: str
+ numpy_type: str
+ metadata: dict | None
+
+
+def get_column_metadata(
+ column: pd.Series | pd.Index, name: str, arrow_type: DataType, field_name: str
+) -> _ColumnMetadata: ...
+
+
+def construct_metadata(
+ columns_to_convert: list[pd.Series],
+ df: pd.DataFrame,
+ column_names: list[str],
+ index_levels: list[pd.Index],
+ index_descriptors: list[dict],
+ preserve_index: bool,
+ types: list[DataType],
+ column_field_names: list[str] = ...,
+) -> dict[bytes, bytes]: ...
+
+
+def dataframe_to_types(
+ df: pd.DataFrame, preserve_index: bool | None, columns: list[str] | None = None
+) -> tuple[list[str], list[DataType], dict[bytes, bytes]]: ...
+
+
+def dataframe_to_arrays(
+ df: pd.DataFrame,
+ schema: Schema,
+ preserve_index: bool | None,
+ nthreads: int = 1,
+ columns: list[str] | None = None,
+ safe: bool = True,
+) -> tuple[Array, Schema, int]: ...
+def get_datetimetz_type(values: _T, dtype, type_) -> tuple[_T, DataType]: ...
+def make_datetimetz(unit: str, tz: str) -> DatetimeTZDtype: ...
+
+
+def table_to_dataframe(
+ options,
+ table: Table,
+ categories=None,
+ ignore_metadata: bool = False,
+ types_mapper=None) -> pd.DataFrame: ...
+
+
+def make_tz_aware(series: pd.Series, tz: str) -> pd.Series: ...
+
+
+__all__ = [
+ "_pandas_api",
+]
diff --git a/python/pyarrow-stubs/pyarrow/pandas_shim.pyi b/python/pyarrow-stubs/pyarrow/pandas_shim.pyi
new file mode 100644
index 00000000000..181d78e7a0c
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/pandas_shim.pyi
@@ -0,0 +1,73 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import types as stdlib_types
+from collections.abc import Iterable
+from typing import Any, TypeGuard
+
+from pandas import Categorical, DatetimeTZDtype, Index, Series, DataFrame
+
+from numpy import dtype
+from pandas.core.dtypes.base import ExtensionDtype
+
+
+class _PandasAPIShim:
+ has_sparse: bool
+
+ def series(self, *args, **kwargs) -> Series: ...
+ def data_frame(self, *args, **kwargs) -> DataFrame: ...
+ @property
+ def have_pandas(self) -> bool: ...
+ @property
+ def compat(self) -> stdlib_types.ModuleType: ...
+ @property
+ def pd(self) -> stdlib_types.ModuleType: ...
+ def infer_dtype(self, obj: Iterable) -> str: ...
+ def pandas_dtype(self, dtype: str) -> dtype: ...
+ @property
+ def loose_version(self) -> Any: ...
+ @property
+ def version(self) -> str: ...
+ def is_v1(self) -> bool: ...
+ def is_ge_v21(self) -> bool: ...
+ def is_ge_v23(self) -> bool: ...
+ def is_ge_v3(self) -> bool: ...
+ def uses_string_dtype(self) -> bool: ...
+ @property
+ def categorical_type(self) -> type[Categorical]: ...
+ @property
+ def datetimetz_type(self) -> type[DatetimeTZDtype]: ...
+ @property
+ def extension_dtype(self) -> type[ExtensionDtype]: ...
+
+ def is_array_like(
+ self, obj: Any
+ ) -> TypeGuard[Series | Index | Categorical | ExtensionDtype]: ...
+ def is_categorical(self, obj: Any) -> TypeGuard[Categorical]: ...
+ def is_datetimetz(self, obj: Any) -> TypeGuard[DatetimeTZDtype]: ...
+ def is_extension_array_dtype(self, obj: Any) -> TypeGuard[ExtensionDtype]: ...
+ def is_sparse(self, obj: Any) -> bool: ...
+ def is_data_frame(self, obj: Any) -> TypeGuard[DataFrame]: ...
+ def is_series(self, obj: Any) -> TypeGuard[Series]: ...
+ def is_index(self, obj: Any) -> TypeGuard[Index]: ...
+ def get_values(self, obj: Any) -> Any: ...
+ def get_rangeindex_attribute(self, level, name): ...
+
+
+_pandas_api: _PandasAPIShim
+
+__all__ = ["_PandasAPIShim", "_pandas_api"]
diff --git a/python/pyarrow-stubs/pyarrow/parquet/__init__.pyi b/python/pyarrow-stubs/pyarrow/parquet/__init__.pyi
new file mode 100644
index 00000000000..5329bd6c66a
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/parquet/__init__.pyi
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from .core import * # noqa: F401, F403
diff --git a/python/pyarrow-stubs/pyarrow/parquet/core.pyi b/python/pyarrow-stubs/pyarrow/parquet/core.pyi
new file mode 100644
index 00000000000..83326c717ae
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/parquet/core.pyi
@@ -0,0 +1,372 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+
+from pathlib import Path
+
+if sys.version_info >= (3, 11):
+ from typing import Self
+else:
+ from typing_extensions import Self
+from collections.abc import Callable, Iterator, Iterable, Sequence
+from typing import IO, Literal
+
+if sys.version_info >= (3, 10):
+ from typing import TypeAlias
+else:
+ from typing_extensions import TypeAlias
+
+from pyarrow import _parquet
+from pyarrow._compute import Expression
+from pyarrow._fs import FileSystem, SupportedFileSystem
+from pyarrow._parquet import (
+ ColumnChunkMetaData,
+ ColumnSchema,
+ FileDecryptionProperties,
+ FileEncryptionProperties,
+ FileMetaData,
+ ParquetLogicalType,
+ ParquetReader,
+ ParquetSchema,
+ RowGroupMetaData,
+ SortingColumn,
+ Statistics,
+)
+from pyarrow._stubs_typing import FilterTuple, SingleOrList
+from pyarrow.dataset import ParquetFileFragment, Partitioning, PartitioningFactory
+from pyarrow.lib import Buffer, NativeFile, RecordBatch, Schema, Table, ChunkedArray
+from typing_extensions import deprecated
+
+__all__ = (
+ "ColumnChunkMetaData",
+ "ColumnSchema",
+ "FileDecryptionProperties",
+ "FileEncryptionProperties",
+ "FileMetaData",
+ "ParquetDataset",
+ "ParquetFile",
+ "ParquetLogicalType",
+ "ParquetReader",
+ "ParquetSchema",
+ "ParquetWriter",
+ "RowGroupMetaData",
+ "SortingColumn",
+ "Statistics",
+ "read_metadata",
+ "read_pandas",
+ "read_schema",
+ "read_table",
+ "write_metadata",
+ "write_table",
+ "write_to_dataset",
+ "_filters_to_expression",
+ "filters_to_expression",
+)
+
+
+def filters_to_expression(
+ filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ...
+
+
+@deprecated("use filters_to_expression")
+def _filters_to_expression(
+ filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ...
+
+
+_Compression: TypeAlias = Literal["gzip", "bz2",
+ "brotli", "lz4", "zstd", "snappy", "none"]
+
+
+class ParquetFile:
+ reader: ParquetReader
+ common_metadata: FileMetaData
+
+ def __init__(
+ self,
+ source: str | Path | Buffer | NativeFile | IO,
+ *,
+ metadata: FileMetaData | None = None,
+ common_metadata: FileMetaData | None = None,
+ read_dictionary: list[str] | None = None,
+ memory_map: bool = False,
+ buffer_size: int = 0,
+ pre_buffer: bool = False,
+ coerce_int96_timestamp_unit: str | None = None,
+ decryption_properties: FileDecryptionProperties | None = None,
+ thrift_string_size_limit: int | None = None,
+ thrift_container_size_limit: int | None = None,
+ filesystem: SupportedFileSystem | None = None,
+ page_checksum_verification: bool = False,
+ ): ...
+ def __enter__(self) -> Self: ...
+ def __exit__(self, *args, **kwargs) -> None: ...
+ @property
+ def metadata(self) -> FileMetaData: ...
+ @property
+ def schema(self) -> ParquetSchema: ...
+ @property
+ def schema_arrow(self) -> Schema: ...
+ @property
+ def num_row_groups(self) -> int: ...
+ def close(self, force: bool = False) -> None: ...
+ @property
+ def closed(self) -> bool: ...
+
+ def read_row_group(
+ self,
+ i: int,
+ columns: Sequence[str | int] | None = None,
+ use_threads: bool = True,
+ use_pandas_metadata: bool = False,
+ ) -> Table: ...
+
+ def read_row_groups(
+ self,
+ row_groups: Sequence[int],
+ columns: Iterable[str | int] | None = None,
+ use_threads: bool = True,
+ use_pandas_metadata: bool = False,
+ ) -> Table: ...
+
+ def iter_batches(
+ self,
+ batch_size: int = 65536,
+ row_groups: Sequence[int] | None = None,
+ columns: Iterable[str | int] | None = None,
+ use_threads: bool = True,
+ use_pandas_metadata: bool = False,
+ ) -> Iterator[RecordBatch]: ...
+
+ def read(
+ self,
+ columns: Sequence[str | int] | None = None,
+ use_threads: bool = True,
+ use_pandas_metadata: bool = False,
+ ) -> Table: ...
+
+ def scan_contents(
+ self, columns: Iterable[str | int] | None = None, batch_size: int = 65536
+ ) -> int: ...
+
+
+class ParquetWriter:
+ flavor: str
+ schema_changed: bool
+ schema: ParquetSchema
+ where: str | Path | IO
+ file_handler: NativeFile | None
+ writer: _parquet.ParquetWriter
+ is_open: bool
+
+ def __init__(
+ self,
+ where: str | Path | IO | NativeFile,
+ schema: Schema,
+ filesystem: SupportedFileSystem | None = None,
+ flavor: str | None = None,
+ version: Literal["1.0", "2.4", "2.6"] = ...,
+ use_dictionary: bool = True,
+ compression: _Compression | dict[str, _Compression] = "snappy",
+ write_statistics: bool | list = True,
+ use_deprecated_int96_timestamps: bool | None = None,
+ compression_level: int | dict | None = None,
+ use_byte_stream_split: bool | list = False,
+ column_encoding: str | dict | None = None,
+ writer_engine_version=None,
+ data_page_version: Literal["1.0", "2.0"] = ...,
+ use_compliant_nested_type: bool = True,
+ encryption_properties: FileEncryptionProperties | None = None,
+ write_batch_size: int | None = None,
+ dictionary_pagesize_limit: int | None = None,
+ store_schema: bool = True,
+ write_page_index: bool = False,
+ write_page_checksum: bool = False,
+ sorting_columns: Sequence[SortingColumn] | None = None,
+ store_decimal_as_integer: bool = False,
+ max_rows_per_page: int | None = None,
+ **options,
+ ) -> None: ...
+ def __enter__(self) -> Self: ...
+ def __exit__(self, *args, **kwargs) -> Literal[False]: ...
+
+ def write(
+ self, table_or_batch: RecordBatch | Table, row_group_size: int | None = None
+ ) -> None: ...
+ def write_batch(self, batch: RecordBatch,
+ row_group_size: int | None = None) -> None: ...
+
+ def write_table(self, table: Table, row_group_size: int | None = None) -> None: ...
+ def close(self) -> None: ...
+ def add_key_value_metadata(self, key_value_metadata: dict[str, str]) -> None: ...
+
+
+class ParquetDataset:
+ def __init__(
+ self,
+ path_or_paths: SingleOrList[str]
+ | SingleOrList[Path]
+ | SingleOrList[NativeFile]
+ | SingleOrList[IO],
+ filesystem: SupportedFileSystem | None = None,
+ schema: Schema | None = None,
+ *,
+ filters: Expression
+ | FilterTuple
+ | list[FilterTuple]
+ | list[list[FilterTuple]]
+ | None = None,
+ read_dictionary: list[str] | None = None,
+ memory_map: bool = False,
+ buffer_size: int = 0,
+ partitioning: str
+ | list[str]
+ | Partitioning
+ | PartitioningFactory
+ | None = "hive",
+ ignore_prefixes: list[str] | None = None,
+ pre_buffer: bool = True,
+ coerce_int96_timestamp_unit: str | None = None,
+ decryption_properties: FileDecryptionProperties | None = None,
+ thrift_string_size_limit: int | None = None,
+ thrift_container_size_limit: int | None = None,
+ page_checksum_verification: bool = False,
+ ): ...
+ def equals(self, other: ParquetDataset) -> bool: ...
+ @property
+ def schema(self) -> Schema: ...
+
+ def read(
+ self,
+ columns: list[str] | None = None,
+ use_threads: bool = True,
+ use_pandas_metadata: bool = False,
+ ) -> Table: ...
+ def read_pandas(self, **kwargs) -> Table: ...
+ @property
+ def fragments(self) -> list[ParquetFileFragment]: ...
+ @property
+ def files(self) -> list[str]: ...
+ @property
+ def filesystem(self) -> FileSystem: ...
+ @property
+ def partitioning(self) -> Partitioning: ...
+
+
+def read_table(
+ source: SingleOrList[str]
+ | SingleOrList[Path] | SingleOrList[NativeFile] | SingleOrList[IO] | Buffer,
+ *,
+ columns: list | None = None,
+ use_threads: bool = True,
+ schema: Schema | None = None,
+ use_pandas_metadata: bool = False,
+ read_dictionary: list[str] | None = None,
+ memory_map: bool = False,
+ buffer_size: int = 0,
+ partitioning: str | list[str] | Partitioning | PartitioningFactory | None = "hive",
+ filesystem: SupportedFileSystem | str | None = None,
+ filters: Expression
+ | FilterTuple
+ | list[FilterTuple]
+ | Sequence[Sequence[tuple]]
+ | None = None,
+ ignore_prefixes: list[str] | None = None,
+ pre_buffer: bool = True,
+ coerce_int96_timestamp_unit: str | None = None,
+ decryption_properties: FileDecryptionProperties | None = None,
+ thrift_string_size_limit: int | None = None,
+ thrift_container_size_limit: int | None = None,
+ page_checksum_verification: bool = False,
+) -> Table: ...
+
+
+def read_pandas(
+ source: str | Path | NativeFile | IO | Buffer, columns: list | None = None, **kwargs
+) -> Table: ...
+
+
+def write_table(
+ table: Table,
+ where: str | Path | NativeFile | IO,
+ row_group_size: int | None = None,
+ version: Literal["1.0", "2.4", "2.6"] = "2.6",
+ use_dictionary: bool = True,
+ compression: _Compression | dict[str, _Compression] = "snappy",
+ write_statistics: bool | list = True,
+ use_deprecated_int96_timestamps: bool | None = None,
+ coerce_timestamps: str | None = None,
+ allow_truncated_timestamps: bool = False,
+ data_page_size: int | None = None,
+ flavor: str | None = None,
+ filesystem: SupportedFileSystem | str | None = None,
+ compression_level: int | dict | None = None,
+ use_byte_stream_split: bool = False,
+ column_encoding: str | dict | None = None,
+ data_page_version: Literal["1.0", "2.0"] = ...,
+ use_compliant_nested_type: bool = True,
+ encryption_properties: FileEncryptionProperties | None = None,
+ write_batch_size: int | None = None,
+ dictionary_pagesize_limit: int | None = None,
+ store_schema: bool = True,
+ write_page_index: bool = False,
+ write_page_checksum: bool = False,
+ sorting_columns: Sequence[SortingColumn] | None = None,
+ store_decimal_as_integer: bool = False,
+ **kwargs,
+) -> None: ...
+
+
+def write_to_dataset(
+ table: Table | ChunkedArray,
+ root_path: str | Path,
+ partition_cols: list[str] | None = None,
+ filesystem: SupportedFileSystem | None = None,
+ schema: Schema | None = None,
+ partitioning: Partitioning | list[str] | None = None,
+ basename_template: str | None = None,
+ use_threads: bool | None = None,
+ file_visitor: Callable[[str], None] | None = None,
+ existing_data_behavior: Literal["overwrite_or_ignore", "error", "delete_matching"]
+ | None = None,
+ **kwargs,
+) -> None: ...
+
+
+def write_metadata(
+ schema: Schema,
+ where: str | NativeFile,
+ metadata_collector: list[FileMetaData] | None = None,
+ filesystem: SupportedFileSystem | None = None,
+ **kwargs,
+) -> None: ...
+
+
+def read_metadata(
+ where: str | Path | IO | NativeFile,
+ memory_map: bool = False,
+ decryption_properties: FileDecryptionProperties | None = None,
+ filesystem: SupportedFileSystem | str | None = None,
+) -> FileMetaData: ...
+
+
+def read_schema(
+ where: str | Path | IO | NativeFile,
+ memory_map: bool = False,
+ decryption_properties: FileDecryptionProperties | None = None,
+ filesystem: SupportedFileSystem | str | None = None,
+) -> Schema: ...
diff --git a/python/pyarrow-stubs/pyarrow/parquet/encryption.pyi b/python/pyarrow-stubs/pyarrow/parquet/encryption.pyi
new file mode 100644
index 00000000000..7add1c6fa53
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/parquet/encryption.pyi
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from pyarrow._parquet_encryption import (
+ CryptoFactory,
+ DecryptionConfiguration,
+ EncryptionConfiguration,
+ FileSystemKeyMaterialStore,
+ KmsClient,
+ KmsConnectionConfig,
+)
+
+__all__ = [
+ "CryptoFactory",
+ "DecryptionConfiguration",
+ "EncryptionConfiguration",
+ "FileSystemKeyMaterialStore",
+ "KmsClient",
+ "KmsConnectionConfig",
+]
diff --git a/python/pyarrow-stubs/pyarrow/scalar.pyi b/python/pyarrow-stubs/pyarrow/scalar.pyi
new file mode 100644
index 00000000000..70b2ea2b347
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/scalar.pyi
@@ -0,0 +1,466 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import collections.abc
+import datetime as dt
+import sys
+
+if sys.version_info >= (3, 11):
+ from typing import Self
+else:
+ from typing_extensions import Self
+from collections.abc import Iterator
+from typing import Any, Generic, Literal
+
+import numpy as np
+
+from pyarrow._compute import CastOptions
+from pyarrow.lib import Array, Buffer, MemoryPool, MonthDayNano, Tensor, _Weakrefable
+from pyarrow.table import ArrayOrChunkedArray
+from typing_extensions import TypeVar
+
+from ._types import ( # noqa: F401
+ DataType,
+ Decimal128Type,
+ Date32Type,
+ Date64Type,
+ Time32Type,
+ Time64Type,
+ TimestampType,
+ Decimal256Type,
+ NullType,
+ BoolType,
+ UInt8Type,
+ Int8Type,
+ DurationType, MonthDayNanoIntervalType, BinaryType, LargeBinaryType,
+ FixedSizeBinaryType, StringType, LargeStringType, BinaryViewType, StringViewType,
+ FixedSizeListType,
+ Float16Type, Float32Type, Float64Type, Decimal32Type, Decimal64Type,
+ LargeListType,
+ LargeListViewType,
+ ListType,
+ ListViewType,
+ OpaqueType, DictionaryType, MapType, _BasicDataType,
+ StructType, RunEndEncodedType,
+ UInt16Type, Int16Type, UInt32Type, Int32Type, UInt64Type, Int64Type,
+ UnionType, ExtensionType, BaseExtensionType, Bool8Type, UuidType, JsonType,
+ _BasicValueT,
+ _DataTypeT,
+ _IndexT,
+ _K,
+ _Precision,
+ _RunEndType,
+ _Scale,
+ _Size,
+ _Time32Unit,
+ _Time64Unit,
+ _Tz,
+ _Unit,
+ _ValueT,
+)
+
+_AsPyTypeK = TypeVar("_AsPyTypeK")
+_AsPyTypeV = TypeVar("_AsPyTypeV")
+_DataType_co = TypeVar("_DataType_co", bound=DataType, covariant=True)
+
+
+class Scalar(_Weakrefable, Generic[_DataType_co]):
+ @property
+ def type(self) -> _DataType_co: ...
+
+ @property
+ def is_valid(self) -> bool: ...
+
+ def cast(
+ self,
+ target_type: None | _DataTypeT | str,
+ safe: bool = True,
+ options: CastOptions | None = None,
+ memory_pool: MemoryPool | None = None,
+ ) -> Self | Scalar[_DataTypeT] | Scalar[Any]: ...
+
+ def validate(self, *, full: bool = False) -> None: ...
+
+ def equals(self, other: Scalar | ArrayOrChunkedArray) -> bool: ...
+
+ def __hash__(self) -> int: ...
+
+ def as_py(self: Scalar[Any], *, maps_as_pydicts: Literal["lossy",
+ "strict"] | None = None) -> Any: ...
+
+ def as_buffer(self) -> Buffer | None: ...
+
+ # Buffer protocol support
+ def __buffer__(self, flags: int) -> memoryview: ...
+
+ # Methods for structured types (StructScalar, MapScalar, ListScalar, etc.)
+ def __len__(self) -> int: ...
+
+ def __iter__(self) -> Iterator[Any]: ...
+
+ def __getitem__(self, key: int | str) -> Any: ...
+
+ def __contains__(self, key: object) -> bool: ...
+
+ def keys(self) -> Iterator[str]: ...
+
+ def items(self) -> Iterator[tuple[str, Any]]: ...
+
+ @property
+ def values(self) -> Any: ...
+
+ # Methods for compatibility with array-like interface
+ def to_pylist(self) -> list: ...
+ def tolist(self) -> list: ...
+ def to_numpy(self, zero_copy_only: bool = True, writable: bool = False) -> Any: ...
+
+
+_NULL: NullScalar
+NA: NullScalar
+
+
+class NullScalar(Scalar[NullType]):
+ ...
+
+
+class BooleanScalar(Scalar[BoolType]):
+ ...
+
+
+class UInt8Scalar(Scalar[UInt8Type]):
+ ...
+
+
+class Int8Scalar(Scalar[Int8Type]):
+ ...
+
+
+class UInt16Scalar(Scalar[UInt16Type]):
+ ...
+
+
+class Int16Scalar(Scalar[Int16Type]):
+ ...
+
+
+class UInt32Scalar(Scalar[UInt32Type]):
+ ...
+
+
+class Int32Scalar(Scalar[Int32Type]):
+ ...
+
+
+class UInt64Scalar(Scalar[UInt64Type]):
+ ...
+
+
+class Int64Scalar(Scalar[Int64Type]):
+ ...
+
+
+class HalfFloatScalar(Scalar[Float16Type]):
+ ...
+
+
+class FloatScalar(Scalar[Float32Type]):
+ ...
+
+
+class DoubleScalar(Scalar[Float64Type]):
+ ...
+
+
+class Decimal32Scalar(Scalar[Decimal32Type[_Precision, _Scale]]):
+ ...
+
+
+class Decimal64Scalar(Scalar[Decimal64Type[_Precision, _Scale]]):
+ ...
+
+
+class Decimal128Scalar(Scalar[Decimal128Type[_Precision, _Scale]]):
+ ...
+
+
+class Decimal256Scalar(Scalar[Decimal256Type[_Precision, _Scale]]):
+ ...
+
+
+class Date32Scalar(Scalar[Date32Type]):
+ ...
+
+
+class Date64Scalar(Scalar[Date64Type]):
+ @property
+ def value(self) -> dt.date | None: ...
+
+
+class Time32Scalar(Scalar[Time32Type[_Time32Unit]]):
+ @property
+ def value(self) -> dt.time | None: ...
+
+
+class Time64Scalar(Scalar[Time64Type[_Time64Unit]]):
+ @property
+ def value(self) -> dt.time | None: ...
+
+
+class TimestampScalar(Scalar[TimestampType[_Unit, _Tz]]):
+ @property
+ def value(self) -> int | None: ...
+
+
+class DurationScalar(Scalar[DurationType[_Unit]]):
+ @property
+ def value(self) -> dt.timedelta | None: ...
+
+
+class MonthDayNanoIntervalScalar(Scalar[MonthDayNanoIntervalType]):
+ @property
+ def value(self) -> MonthDayNano | None: ...
+
+
+class BinaryScalar(Scalar[BinaryType]):
+ def as_buffer(self) -> Buffer: ...
+
+
+class LargeBinaryScalar(Scalar[LargeBinaryType]):
+ def as_buffer(self) -> Buffer: ...
+
+
+class FixedSizeBinaryScalar(Scalar[FixedSizeBinaryType]):
+ def as_buffer(self) -> Buffer: ...
+
+
+class StringScalar(Scalar[StringType]):
+ def as_buffer(self) -> Buffer: ...
+
+
+class LargeStringScalar(Scalar[LargeStringType]):
+ def as_buffer(self) -> Buffer: ...
+
+
+class BinaryViewScalar(Scalar[BinaryViewType]):
+ def as_buffer(self) -> Buffer: ...
+
+
+class StringViewScalar(Scalar[StringViewType]):
+ def as_buffer(self) -> Buffer: ...
+
+
+class ListScalar(Scalar[ListType[_DataTypeT]]):
+ @property
+ def values(self) -> Array | None: ...
+ def __len__(self) -> int: ...
+
+ def __getitem__(self, i: int | str) -> Scalar[_DataTypeT]: ...
+
+ def __iter__(self) -> Iterator[Array]: ...
+
+
+class FixedSizeListScalar(Scalar[FixedSizeListType[_DataTypeT, _Size]]):
+ @property
+ def values(self) -> Array | None: ...
+ def __len__(self) -> int: ...
+
+ def __getitem__(self, i: int | str) -> Scalar[_DataTypeT]: ...
+
+ def __iter__(self) -> Iterator[Array]: ...
+
+
+class LargeListScalar(Scalar[LargeListType[_DataTypeT]]):
+ @property
+ def values(self) -> Array | None: ...
+ def __len__(self) -> int: ...
+
+ def __getitem__(self, i: int | str) -> Scalar[_DataTypeT]: ...
+
+ def __iter__(self) -> Iterator[Array]: ...
+
+
+class ListViewScalar(Scalar[ListViewType[_DataTypeT]]):
+ @property
+ def values(self) -> Array | None: ...
+ def __len__(self) -> int: ...
+
+ def __getitem__(self, i: int | str) -> Scalar[_DataTypeT]: ...
+
+ def __iter__(self) -> Iterator[Array]: ...
+
+
+class LargeListViewScalar(Scalar[LargeListViewType[_DataTypeT]]):
+ @property
+ def values(self) -> Array | None: ...
+ def __len__(self) -> int: ...
+
+ def __getitem__(self, i: int | str) -> Scalar[_DataTypeT]: ...
+
+ def __iter__(self) -> Iterator[Array]: ...
+
+
+class StructScalar(Scalar[StructType], collections.abc.Mapping[str, Scalar]):
+ def __len__(self) -> int: ...
+
+ def __iter__(self) -> Iterator[str]: ...
+
+ def __getitem__(self, key: int | str) -> Scalar[Any]: ...
+
+ def keys(self) -> collections.abc.KeysView[str]: # type: ignore[override]
+ ...
+
+ def items(self) -> collections.abc.ItemsView[str, Scalar[Any]]: # type: ignore[override] # noqa: E501
+ ...
+
+ def _as_py_tuple(self) -> list[tuple[str, Any]]: ...
+
+
+class MapScalar(Scalar[MapType[_K, _ValueT]]):
+ @property
+ def values(self) -> Array | None: ...
+ def __len__(self) -> int: ...
+
+ def __getitem__(self, i: int | str) -> (
+ tuple[Scalar[_K], _ValueT, Any] | Scalar[Any]): ...
+
+ def __iter__(self: Scalar[
+ MapType[_BasicDataType[_AsPyTypeK], _BasicDataType[_AsPyTypeV]]]
+ | Scalar[MapType[Any, _BasicDataType[_AsPyTypeV]]]
+ | Scalar[MapType[_BasicDataType[_AsPyTypeK], Any]]) -> (
+ Iterator[tuple[_AsPyTypeK, _AsPyTypeV]]
+ | Iterator[tuple[Any, _AsPyTypeV]]
+ | Iterator[tuple[_AsPyTypeK, Any]]
+ ): ...
+
+
+class DictionaryScalar(Scalar[DictionaryType[_IndexT, _BasicValueT]]):
+ @property
+ def index(self) -> Scalar[_IndexT]: ...
+
+ @property
+ def value(self) -> Scalar[_BasicValueT]: ...
+
+ @property
+ def dictionary(self) -> Array: ...
+
+
+class RunEndEncodedScalar(Scalar[RunEndEncodedType[_RunEndType, _BasicValueT]]):
+ @property
+ def value(self) -> tuple[int, _BasicValueT] | None: ...
+
+
+class UnionScalar(Scalar[UnionType]):
+ @property
+ def value(self) -> Any | None: ...
+
+ @property
+ def type_code(self) -> int: ...
+
+
+class ExtensionScalar(Scalar[ExtensionType]):
+ @property
+ def value(self) -> Any | None: ...
+
+ @staticmethod
+ def from_storage(typ: BaseExtensionType, value) -> ExtensionScalar: ...
+
+
+class Bool8Scalar(Scalar[Bool8Type]):
+ ...
+
+
+class UuidScalar(Scalar[UuidType]):
+ ...
+
+
+class JsonScalar(Scalar[JsonType]):
+ ...
+
+
+class OpaqueScalar(Scalar[OpaqueType]):
+ ...
+
+
+class FixedShapeTensorScalar(ExtensionScalar):
+ def to_numpy(self, zero_copy_only: bool = True, writable: bool = False) -> (
+ np.ndarray): ... # type: ignore[override]
+
+ def to_tensor(self) -> Tensor: ...
+
+
+def scalar(
+ value: Any,
+ type: _DataTypeT | str | None = None,
+ *,
+ from_pandas: bool | None = None,
+ memory_pool: MemoryPool | None = None,
+) -> Scalar[_DataTypeT] | Scalar[Any]: ...
+
+
+__all__ = [
+ "Scalar",
+ "_NULL",
+ "NA",
+ "NullScalar",
+ "BooleanScalar",
+ "UInt8Scalar",
+ "Int8Scalar",
+ "UInt16Scalar",
+ "Int16Scalar",
+ "UInt32Scalar",
+ "Int32Scalar",
+ "UInt64Scalar",
+ "Int64Scalar",
+ "HalfFloatScalar",
+ "FloatScalar",
+ "DoubleScalar",
+ "Decimal32Scalar",
+ "Decimal64Scalar",
+ "Decimal128Scalar",
+ "Decimal256Scalar",
+ "Date32Scalar",
+ "Date64Scalar",
+ "Time32Scalar",
+ "Time64Scalar",
+ "TimestampScalar",
+ "DurationScalar",
+ "MonthDayNanoIntervalScalar",
+ "BinaryScalar",
+ "LargeBinaryScalar",
+ "FixedSizeBinaryScalar",
+ "StringScalar",
+ "LargeStringScalar",
+ "BinaryViewScalar",
+ "StringViewScalar",
+ "ListScalar",
+ "FixedSizeListScalar",
+ "LargeListScalar",
+ "ListViewScalar",
+ "LargeListViewScalar",
+ "StructScalar",
+ "MapScalar",
+ "DictionaryScalar",
+ "RunEndEncodedScalar",
+ "UnionScalar",
+ "ExtensionScalar",
+ "FixedShapeTensorScalar",
+ "Bool8Scalar",
+ "UuidScalar",
+ "JsonScalar",
+ "OpaqueScalar",
+ "scalar",
+]
diff --git a/python/pyarrow-stubs/pyarrow/substrait.pyi b/python/pyarrow-stubs/pyarrow/substrait.pyi
new file mode 100644
index 00000000000..b78bbd8aebd
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/substrait.pyi
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from pyarrow._substrait import (
+ BoundExpressions,
+ SubstraitSchema,
+ deserialize_expressions,
+ deserialize_schema,
+ get_supported_functions,
+ run_query,
+ serialize_expressions,
+ serialize_schema,
+)
+
+__all__ = [
+ "BoundExpressions",
+ "get_supported_functions",
+ "run_query",
+ "deserialize_expressions",
+ "serialize_expressions",
+ "deserialize_schema",
+ "serialize_schema",
+ "SubstraitSchema",
+]
diff --git a/python/pyarrow-stubs/pyarrow/table.pyi b/python/pyarrow-stubs/pyarrow/table.pyi
new file mode 100644
index 00000000000..6dd61674d40
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/table.pyi
@@ -0,0 +1,686 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+
+if sys.version_info >= (3, 11):
+ from typing import Self
+else:
+ from typing_extensions import Self
+if sys.version_info >= (3, 10):
+ from typing import TypeAlias
+else:
+ from typing_extensions import TypeAlias
+from collections.abc import (
+ Collection, Generator, Iterable, Iterator, Sequence, Mapping)
+from typing import Any, Generic, Literal, TypeVar
+import builtins
+
+import numpy as np
+import pandas as pd
+
+from numpy.typing import NDArray
+from pyarrow._compute import (
+ CastOptions,
+ CountOptions,
+ FunctionOptions,
+ ScalarAggregateOptions,
+ TDigestOptions,
+ VarianceOptions,
+)
+from pyarrow._stubs_typing import (
+ Indices,
+ Mask,
+ NullEncoding,
+ NullSelectionBehavior,
+ Order,
+ SupportArrowArray,
+ SupportArrowDeviceArray,
+ SupportArrowStream,
+)
+from pyarrow.compute import Expression
+from pyarrow.interchange.dataframe import _PyArrowDataFrame
+from pyarrow.lib import Device, MemoryManager, MemoryPool, Schema
+from pyarrow.lib import Field as _Field
+
+from .array import Array, StructArray, _CastAs, _PandasConvertible
+from .device import DeviceAllocationType
+from .io import Buffer
+from ._ipc import RecordBatchReader
+from .scalar import BooleanScalar, Int64Scalar, Scalar, StructScalar
+from .tensor import Tensor
+from ._stubs_typing import NullableCollection
+from ._types import DataType, _AsPyType, _BasicDataType, _DataTypeT
+
+Field: TypeAlias = _Field[DataType]
+_ScalarT = TypeVar("_ScalarT", bound=Scalar)
+_Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True)
+ArrayOrChunkedArray: TypeAlias = Array[_Scalar_co] | ChunkedArray[_Scalar_co]
+
+_Aggregation: TypeAlias = Literal[
+ "all",
+ "any",
+ "approximate_median",
+ "count",
+ "count_all",
+ "count_distinct",
+ "distinct",
+ "first",
+ "first_last",
+ "last",
+ "list",
+ "max",
+ "mean",
+ "min",
+ "min_max",
+ "one",
+ "product",
+ "stddev",
+ "sum",
+ "tdigest",
+ "variance",
+]
+_AggregationPrefixed: TypeAlias = Literal[
+ "hash_all",
+ "hash_any",
+ "hash_approximate_median",
+ "hash_count",
+ "hash_count_all",
+ "hash_count_distinct",
+ "hash_distinct",
+ "hash_first",
+ "hash_first_last",
+ "hash_last",
+ "hash_list",
+ "hash_max",
+ "hash_mean",
+ "hash_min",
+ "hash_min_max",
+ "hash_one",
+ "hash_product",
+ "hash_stddev",
+ "hash_sum",
+ "hash_tdigest",
+ "hash_variance",
+]
+Aggregation: TypeAlias = _Aggregation | _AggregationPrefixed | str
+AggregateOptions: TypeAlias = (ScalarAggregateOptions | CountOptions
+ | TDigestOptions | VarianceOptions | FunctionOptions)
+
+UnarySelector: TypeAlias = str
+NullarySelector: TypeAlias = tuple[()]
+NarySelector: TypeAlias = list[str] | tuple[str, ...]
+ColumnSelector: TypeAlias = UnarySelector | NullarySelector | NarySelector
+
+
+class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_co]):
+
+ def as_py(self) -> list[Any]: ...
+
+ @property
+ def data(self) -> Self: ...
+ @property
+ def type(self: ChunkedArray[Scalar[_DataTypeT]]) -> _DataTypeT: ...
+
+ # Private attribute used internally for column names
+ _name: str | None
+
+ def length(self) -> int: ...
+
+ __len__ = length
+
+ def to_string(
+ self,
+ *,
+ indent: int = 0,
+ window: int = 5,
+ container_window: int = 2,
+ skip_new_lines: bool = False,
+ ) -> str: ...
+
+ format = to_string
+ def validate(self, *, full: bool = False) -> None: ...
+
+ @property
+ def null_count(self) -> int: ...
+
+ @property
+ def nbytes(self) -> int: ...
+
+ def get_total_buffer_size(self) -> int: ...
+
+ def __sizeof__(self) -> int: ...
+
+ def __getitem__(
+ self, key: int | np.integer | builtins.slice) -> _Scalar_co | Self: ...
+
+ def getitem(self, i: int) -> Scalar: ...
+ def is_null(self, *, nan_is_null: bool = False) -> ChunkedArray[BooleanScalar]: ...
+
+ def is_nan(self) -> ChunkedArray[BooleanScalar]: ...
+
+ def is_valid(self) -> ChunkedArray[BooleanScalar]: ...
+
+ def cast(
+ self, target_type: _CastAs | str | None, safe: bool = True,
+ options: CastOptions | None = None,
+ memory_pool: MemoryPool | None = None
+ ) -> Self | ChunkedArray[Scalar[_CastAs]]: ...
+
+ def fill_null(self, fill_value: Scalar[_DataTypeT] | Any) -> Self: ...
+
+ def equals(self, other: Self | Any) -> bool: ...
+
+ def to_numpy(self, zero_copy_only: bool = False) -> np.ndarray: ...
+
+ def __array__(self, dtype: np.dtype | None = None,
+ copy: bool | None = None) -> np.ndarray: ...
+
+ def dictionary_encode(self, null_encoding: NullEncoding = "mask") -> Self: ...
+
+ def flatten(self, memory_pool: MemoryPool |
+ None = None) -> list[ChunkedArray[Any]]: ...
+
+ def combine_chunks(self, memory_pool: MemoryPool |
+ None = None) -> Array[_Scalar_co]: ...
+
+ def unique(self) -> ChunkedArray[_Scalar_co]: ...
+
+ def value_counts(self) -> StructArray: ...
+
+ def slice(self, offset: int = 0, length: int | None = None) -> Self: ...
+
+ def filter(self, mask: Mask,
+ null_selection_behavior: NullSelectionBehavior = "drop") -> Self: ...
+
+ def index(
+ self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]],
+ value: Scalar[_DataTypeT] | _AsPyType,
+ start: int | None = None,
+ end: int | None = None,
+ *,
+ memory_pool: MemoryPool | None = None,
+ ) -> Int64Scalar: ...
+
+ def take(self, indices: Indices) -> Self: ...
+
+ def drop_null(self) -> Self: ...
+
+ def sort(self, order: Order = "ascending", **kwargs) -> Self: ...
+
+ def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: ...
+
+ @property
+ def num_chunks(self) -> int: ...
+
+ def chunk(self, i: int) -> Array[_Scalar_co]: ...
+
+ @property
+ def chunks(self) -> list[Array[_Scalar_co]]: ...
+
+ def iterchunks(
+ self: ArrayOrChunkedArray[_ScalarT],
+ ) -> Generator[Array, None, None]: ...
+
+ def __iter__(self) -> Iterator[_Scalar_co]: ...
+
+ def to_pylist(
+ self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]],
+ *,
+ maps_as_pydicts: Literal["lossy", "strict"] | None = None,
+ ) -> list[_AsPyType | None]: ...
+
+ def __arrow_c_stream__(self, requested_schema=None) -> Any: ...
+
+ @classmethod
+ def _import_from_c_capsule(cls, stream) -> Self: ...
+
+ @property
+ def is_cpu(self) -> bool: ...
+
+
+def chunked_array(
+ arrays: Iterable[NullableCollection[Any]]
+ | Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray]
+ | Iterable[Array[_ScalarT]] | Array[_ScalarT]
+ | SupportArrowArray | SupportArrowStream,
+ type: DataType | str | None = None,
+) -> ChunkedArray[Scalar[Any]] | ChunkedArray[_ScalarT]: ...
+
+
+_ColumnT = TypeVar("_ColumnT", bound=ArrayOrChunkedArray[Any])
+
+
+class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]):
+ def __array__(self, dtype: np.dtype | None = None,
+ copy: bool | None = None) -> np.ndarray: ...
+
+ def __dataframe__(
+ self, nan_as_null: bool = False, allow_copy: bool = True
+ ) -> _PyArrowDataFrame: ...
+
+ def __getitem__(self, key: int | str | slice) -> _ColumnT | Self: ...
+
+ def __len__(self) -> int: ...
+ def column(self, i: int | str) -> _ColumnT: ...
+
+ @property
+ def column_names(self) -> list[str]: ...
+
+ @property
+ def columns(self) -> list[_ColumnT]: ...
+
+ def drop_null(self) -> Self: ...
+
+ def field(self, i: int | str) -> Field: ...
+
+ @classmethod
+ def from_pydict(
+ cls,
+ mapping:
+ Mapping[Any, ArrayOrChunkedArray[Any] | list[Any] | np.ndarray | range],
+ schema: Schema | None = None,
+ metadata: Mapping[str | bytes, str | bytes] | None = None,
+ ) -> Self: ...
+
+ @classmethod
+ def from_pylist(
+ cls,
+ mapping: Sequence[Mapping[str, Any]],
+ schema: Schema | None = None,
+ metadata: Mapping[str | bytes, str | bytes] | None = None,
+ ) -> Self: ...
+
+ def itercolumns(self) -> Generator[_ColumnT, None, None]: ...
+
+ @property
+ def num_columns(self) -> int: ...
+ @property
+ def num_rows(self) -> int: ...
+ @property
+ def shape(self) -> tuple[int, int]: ...
+
+ @property
+ def schema(self) -> Schema: ...
+ @property
+ def nbytes(self) -> int: ...
+ def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> Self: ...
+
+ def take(self, indices: Indices) -> Self: ...
+
+ def filter(
+ self,
+ mask: Mask | Expression,
+ null_selection_behavior: NullSelectionBehavior = "drop") -> Self: ...
+
+ def to_pydict(
+ self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None
+ ) -> dict[str, list[Any]]: ...
+
+ def to_pylist(
+ self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None
+ ) -> list[dict[str, Any]]: ...
+
+ def to_string(self, *, show_metadata: bool = False,
+ preview_cols: int = 0) -> str: ...
+
+ def remove_column(self, i: int) -> Self: ...
+ def drop_columns(self, columns: str | list[str]) -> Self: ...
+
+ def add_column(self, i: int, field_: str | Field,
+ column: ArrayOrChunkedArray[Any] | list[list[Any]]) -> Self: ...
+
+ def append_column(
+ self, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]]
+ ) -> Self: ...
+
+
+class RecordBatch(_Tabular[Array]):
+ def validate(self, *, full: bool = False) -> None: ...
+
+ def replace_schema_metadata(
+ self,
+ metadata: dict[str, str]
+ | dict[bytes, bytes]
+ | dict[bytes, str]
+ | dict[str, bytes]
+ | None = None
+ ) -> Self: ...
+
+ @property
+ def num_columns(self) -> int: ...
+
+ @property
+ def num_rows(self) -> int: ...
+
+ @property
+ def schema(self) -> Schema: ...
+
+ @property
+ def nbytes(self) -> int: ...
+
+ def get_total_buffer_size(self) -> int: ...
+
+ def __sizeof__(self) -> int: ...
+
+ def add_column(
+ self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list
+ ) -> Self: ...
+
+ def remove_column(self, i: int) -> Self: ...
+
+ def set_column(self, i: int, field_: str | Field, column: Array | list) -> Self: ...
+
+ def rename_columns(self, names: list[str] | dict[str, str]) -> Self: ...
+
+ def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: ...
+
+ def slice(self, offset: int = 0, length: int | None = None) -> Self: ...
+
+ def equals(self, other: Self | Any, check_metadata: bool = False) -> bool: ...
+
+ def select(self, columns: Iterable[str] |
+ Iterable[int] | NDArray[np.str_]) -> Self: ...
+
+ def cast(self, target_schema: Schema, safe: bool | None = None,
+ options: CastOptions | None = None) -> Self: ...
+
+ @classmethod
+ def from_arrays(
+ cls,
+ arrays: Iterable[Any],
+ names: list[str] | tuple[str, ...] | None = None,
+ schema: Schema | None = None,
+ metadata: Mapping[bytes, bytes]
+ | Mapping[str, str]
+ | Mapping[bytes, str]
+ | Mapping[str, bytes]
+ | None = None,
+ ) -> Self: ...
+
+ @classmethod
+ def from_pandas(
+ cls,
+ df: pd.DataFrame,
+ schema: Schema | None = None,
+ preserve_index: bool | None = None,
+ nthreads: int | None = None,
+ columns: Sequence[str | int] | None = None,
+ ) -> Self: ...
+
+ @classmethod
+ def from_struct_array(
+ cls, struct_array: StructArray | ChunkedArray[StructScalar]
+ ) -> Self: ...
+
+ def to_struct_array(self) -> StructArray: ...
+
+ def to_tensor(
+ self,
+ null_to_nan: bool = False,
+ row_major: bool = True,
+ memory_pool: MemoryPool | None = None,
+ ) -> Tensor: ...
+
+ def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0): ...
+
+ @classmethod
+ def _import_from_c(cls, in_ptr: int, schema: Schema) -> Self: ...
+
+ def __arrow_c_array__(self, requested_schema=None): ...
+
+ def __arrow_c_stream__(self, requested_schema=None): ...
+
+ @classmethod
+ def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: ...
+
+ def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ...
+
+ @classmethod
+ def _import_from_c_device(cls, in_ptr: int, schema: Schema) -> Self: ...
+
+ def __arrow_c_device_array__(self, requested_schema=None, **kwargs): ...
+
+ @classmethod
+ def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: ...
+
+ @property
+ def device_type(self) -> DeviceAllocationType: ...
+
+ @property
+ def is_cpu(self) -> bool: ...
+
+ def copy_to(self, destination: MemoryManager | Device) -> Self: ...
+
+
+def table_to_blocks(options, table: Table, categories, extension_columns): ...
+
+
+JoinType: TypeAlias = Literal[
+ "left semi",
+ "right semi",
+ "left anti",
+ "right anti",
+ "inner",
+ "left outer",
+ "right outer",
+ "full outer",
+]
+
+
+class Table(_Tabular[ChunkedArray[Any]]):
+ def validate(self, *, full: bool = False) -> None: ...
+
+ def slice(self, offset: int = 0, length: int | None = None) -> Self: ...
+
+ def select(self, columns: Iterable[str] |
+ Iterable[int] | NDArray[np.str_]) -> Self: ...
+
+ def replace_schema_metadata(
+ self, metadata: dict[str, str]
+ | dict[bytes, bytes]
+ | dict[bytes, str]
+ | dict[str, bytes]
+ | None = None
+ ) -> Self: ...
+
+ def flatten(self, memory_pool: MemoryPool | None = None) -> Self: ...
+
+ def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Self: ...
+
+ def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: ...
+
+ def equals(self, other: Self | Any, check_metadata: bool = False) -> bool: ...
+
+ def cast(self, target_schema: Schema, safe: bool | None = None,
+ options: CastOptions | None = None) -> Self: ...
+
+ @classmethod
+ def from_pandas(
+ cls,
+ df: pd.DataFrame,
+ schema: Schema | None = None,
+ preserve_index: bool | None = None,
+ nthreads: int | None = None,
+ columns: Sequence[str | int] | None = None,
+ safe: bool = True,
+ ) -> Self: ...
+
+ @classmethod
+ def from_arrays(
+ cls,
+ arrays:
+ Collection[ArrayOrChunkedArray[Any] | Collection[NDArray[Any]] | list[Any]],
+ names: list[str] | tuple[str, ...] | None = None,
+ schema: Schema | None = None,
+ metadata: Mapping[bytes, bytes]
+ | Mapping[str, str]
+ | Mapping[bytes, str]
+ | Mapping[str, bytes] | None = None,
+ ) -> Self: ...
+
+ @classmethod
+ def from_struct_array(
+ cls, struct_array: StructArray | ChunkedArray[StructScalar]
+ ) -> Self: ...
+
+ def to_struct_array(
+ self, max_chunksize: int | None = None
+ ) -> ChunkedArray[StructScalar]: ...
+
+ @classmethod
+ def from_batches(cls, batches: Iterable[RecordBatch],
+ schema: Schema | None = None) -> Self: ...
+
+ def to_batches(self, max_chunksize: int | None = None) -> list[RecordBatch]: ...
+
+ def to_reader(self, max_chunksize: int | None = None) -> RecordBatchReader: ...
+
+ @property
+ def schema(self) -> Schema: ...
+
+ @property
+ def num_columns(self) -> int: ...
+
+ @property
+ def num_rows(self) -> int: ...
+
+ @property
+ def nbytes(self) -> int: ...
+
+ def get_total_buffer_size(self) -> int: ...
+
+ def __sizeof__(self) -> int: ...
+
+ def add_column(self, i: int, field_: str | Field,
+ column: ArrayOrChunkedArray[Any] | list[list[Any]]) -> Self: ...
+
+ def remove_column(self, i: int) -> Self: ...
+
+ def set_column(self, i: int, field_: str | Field,
+ column: ArrayOrChunkedArray[Any] | list[list[Any]]) -> Self: ...
+
+ def rename_columns(self, names: list[str] | dict[str, str]) -> Self: ...
+
+ def drop(self, columns: str | list[str]) -> Self: ...
+
+ def group_by(self, keys: str | list[str],
+ use_threads: bool = True) -> TableGroupBy: ...
+
+ def join(
+ self,
+ right_table: Self,
+ keys: str | list[str],
+ right_keys: str | list[str] | None = None,
+ join_type: JoinType = "left outer",
+ left_suffix: str | None = None,
+ right_suffix: str | None = None,
+ coalesce_keys: bool = True,
+ use_threads: bool = True,
+ ) -> Self: ...
+
+ def join_asof(
+ self,
+ right_table: Self,
+ on: str,
+ by: str | list[str],
+ tolerance: int,
+ right_on: str | list[str] | None = None,
+ right_by: str | list[str] | None = None,
+ ) -> Self: ...
+
+ def __arrow_c_stream__(self, requested_schema=None): ...
+
+ @property
+ def is_cpu(self) -> bool: ...
+
+
+def record_batch(
+ data: Mapping[str, list[Any] | Array[Any]]
+ | Collection[Array[Any] | ChunkedArray[Any] | list[Any]]
+ | pd.DataFrame
+ | SupportArrowArray
+ | SupportArrowDeviceArray,
+ names: list[str] | Schema | None = None,
+ schema: Schema | None = None,
+ metadata: Mapping[str | bytes, str | bytes] | None = None,
+) -> RecordBatch: ...
+
+
+def table(
+ data: Collection[ArrayOrChunkedArray[Any] | list[Any] | range | str]
+ | pd.DataFrame
+ | SupportArrowArray
+ | SupportArrowStream
+ | SupportArrowDeviceArray
+ | Mapping[str, list[Any] | Array[Any] | ChunkedArray[Any] | range]
+ | Mapping[str, Any],
+ names: list[str] | Schema | None = None,
+ schema: Schema | None = None,
+ metadata: Mapping[str | bytes, str | bytes] | None = None,
+ nthreads: int | None = None,
+) -> Table: ...
+
+
+def concat_tables(
+ tables: Iterable[Table],
+ memory_pool: MemoryPool | None = None,
+ promote_options: Literal["none", "default", "permissive"] = "none",
+ **kwargs: Any,
+) -> Table: ...
+
+
+class TableGroupBy:
+
+ keys: str | list[str]
+
+ def __init__(self, table: Table, keys: str |
+ list[str], use_threads: bool = True): ...
+
+ def aggregate(
+ self,
+ aggregations: Iterable[
+ tuple[ColumnSelector, Aggregation]
+ | tuple[ColumnSelector, Aggregation, AggregateOptions | None]
+ ],
+ ) -> Table: ...
+
+ def _table(self) -> Table: ...
+ @property
+ def _use_threads(self) -> bool: ...
+
+
+def concat_batches(
+ recordbatches: Iterable[RecordBatch], memory_pool: MemoryPool | None = None
+) -> RecordBatch: ...
+
+
+__all__ = [
+ "ChunkedArray",
+ "chunked_array",
+ "_Tabular",
+ "RecordBatch",
+ "table_to_blocks",
+ "Table",
+ "record_batch",
+ "table",
+ "concat_tables",
+ "TableGroupBy",
+ "concat_batches",
+ "Aggregation",
+ "AggregateOptions",
+]
diff --git a/python/pyarrow-stubs/pyarrow/tensor.pyi b/python/pyarrow-stubs/pyarrow/tensor.pyi
new file mode 100644
index 00000000000..ba40c7b299d
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/tensor.pyi
@@ -0,0 +1,268 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+
+if sys.version_info >= (3, 11):
+ from typing import Self
+else:
+ from typing_extensions import Self
+
+from collections.abc import Sequence
+import numpy as np
+
+from pyarrow.lib import _Weakrefable
+from pyarrow._types import DataType
+from scipy.sparse import coo_matrix, csc_matrix, csr_matrix
+from sparse import COO # type: ignore[import-untyped, import-not-found]
+
+
+class Tensor(_Weakrefable):
+ @classmethod
+ def from_numpy(cls, obj: np.ndarray,
+ dim_names: Sequence[str] | None = None) -> Self: ...
+
+ def to_numpy(self) -> np.ndarray: ...
+
+ def equals(self, other: Tensor) -> bool: ...
+
+ def dim_name(self, i: int) -> str: ...
+
+ @property
+ def dim_names(self) -> list[str]: ...
+
+ @property
+ def is_mutable(self) -> bool: ...
+
+ @property
+ def is_contiguous(self) -> bool: ...
+
+ @property
+ def ndim(self) -> int: ...
+
+ @property
+ def size(self) -> int: ...
+
+ @property
+ def shape(self) -> tuple[int, ...]: ...
+
+ @property
+ def strides(self) -> tuple[int, ...]: ...
+
+ @property
+ def type(self) -> DataType: ...
+
+
+class SparseCOOTensor(_Weakrefable):
+ @classmethod
+ def from_dense_numpy(cls, obj: np.ndarray,
+ dim_names: list[str] | None = None) -> Self: ...
+
+ @classmethod
+ def from_numpy(
+ cls,
+ data: np.ndarray,
+ coords: np.ndarray,
+ shape: Sequence[int],
+ dim_names: Sequence[str] | None = None,
+ ) -> Self: ...
+
+ @classmethod
+ def from_scipy(cls, obj: coo_matrix,
+ dim_names: Sequence[str] | None = None) -> Self: ...
+
+ @classmethod
+ def from_pydata_sparse(
+ cls, obj: COO, dim_names: Sequence[str] | None = None) -> Self: ...
+
+ @classmethod
+ def from_tensor(cls, obj: Tensor) -> Self: ...
+
+ def to_numpy(self) -> tuple[np.ndarray, np.ndarray]: ...
+
+ def to_scipy(self) -> coo_matrix: ...
+
+ def to_pydata_sparse(self) -> COO: ...
+
+ def to_tensor(self) -> Tensor: ...
+
+ def equals(self, other: Self) -> bool: ...
+
+ @property
+ def is_mutable(self) -> bool: ...
+ @property
+ def ndim(self) -> int: ...
+ @property
+ def size(self) -> int: ...
+ @property
+ def shape(self) -> tuple[int, ...]: ...
+ def dim_name(self, i: int) -> str: ...
+
+ @property
+ def dim_names(self) -> list[str]: ...
+ @property
+ def non_zero_length(self) -> int: ...
+ @property
+ def has_canonical_format(self) -> bool: ...
+ @property
+ def type(self) -> DataType: ...
+
+
+class SparseCSRMatrix(_Weakrefable):
+ @classmethod
+ def from_dense_numpy(cls, obj: np.ndarray,
+ dim_names: list[str] | None = None) -> Self: ...
+
+ @classmethod
+ def from_numpy(
+ cls,
+ data: np.ndarray,
+ indptr: np.ndarray,
+ indices: np.ndarray,
+ shape: Sequence[int],
+ dim_names: Sequence[str] | None = None,
+ ) -> Self: ...
+
+ @classmethod
+ def from_scipy(cls, obj: csr_matrix,
+ dim_names: Sequence[str] | None = None) -> Self: ...
+
+ @classmethod
+ def from_tensor(cls, obj: Tensor) -> Self: ...
+
+ def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ...
+
+ def to_scipy(self) -> csr_matrix: ...
+
+ def to_tensor(self) -> Tensor: ...
+
+ def equals(self, other: Self) -> bool: ...
+
+ @property
+ def is_mutable(self) -> bool: ...
+ @property
+ def ndim(self) -> int: ...
+ @property
+ def size(self) -> int: ...
+ @property
+ def shape(self) -> tuple[int, ...]: ...
+ def dim_name(self, i: int) -> str: ...
+
+ @property
+ def dim_names(self) -> list[str]: ...
+ @property
+ def non_zero_length(self) -> int: ...
+ @property
+ def type(self) -> DataType: ...
+
+
+class SparseCSCMatrix(_Weakrefable):
+ @classmethod
+ def from_dense_numpy(cls, obj: np.ndarray,
+ dim_names: list[str] | None = None) -> Self: ...
+
+ @classmethod
+ def from_numpy(
+ cls,
+ data: np.ndarray,
+ indptr: np.ndarray,
+ indices: np.ndarray,
+ shape: tuple[int, ...],
+ dim_names: list[str] | None = None,
+ ) -> Self: ...
+
+ @classmethod
+ def from_scipy(cls, obj: csc_matrix,
+ dim_names: list[str] | None = None) -> Self: ...
+
+ @classmethod
+ def from_tensor(cls, obj: Tensor) -> Self: ...
+
+ def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ...
+
+ def to_scipy(self) -> csc_matrix: ...
+
+ def to_tensor(self) -> Tensor: ...
+
+ def equals(self, other: Self) -> bool: ...
+
+ @property
+ def is_mutable(self) -> bool: ...
+ @property
+ def ndim(self) -> int: ...
+ @property
+ def size(self) -> int: ...
+ @property
+ def shape(self) -> tuple[int, ...]: ...
+ def dim_name(self, i: int) -> str: ...
+
+ @property
+ def dim_names(self) -> list[str]: ...
+ @property
+ def non_zero_length(self) -> int: ...
+
+
+class SparseCSFTensor(_Weakrefable):
+ @classmethod
+ def from_dense_numpy(cls, obj: np.ndarray,
+ dim_names: Sequence[str] | None = None) -> Self: ...
+
+ @classmethod
+ def from_numpy(
+ cls,
+ data: np.ndarray,
+ indptr: Sequence[np.ndarray],
+ indices: Sequence[np.ndarray],
+ shape: tuple[int, ...],
+ axis_order: Sequence[int] | None = None,
+ dim_names: Sequence[str] | None = None,
+ ) -> Self: ...
+
+ @classmethod
+ def from_tensor(cls, obj: Tensor) -> Self: ...
+
+ def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ...
+
+ def to_tensor(self) -> Tensor: ...
+
+ def equals(self, other: Self) -> bool: ...
+
+ @property
+ def is_mutable(self) -> bool: ...
+ @property
+ def ndim(self) -> int: ...
+ @property
+ def size(self) -> int: ...
+ @property
+ def shape(self) -> tuple[int, ...]: ...
+ def dim_name(self, i: int) -> str: ...
+
+ @property
+ def dim_names(self) -> list[str]: ...
+ @property
+ def non_zero_length(self) -> int: ...
+ @property
+ def type(self) -> DataType: ...
+
+
+__all__ = [
+ "Tensor",
+ "SparseCOOTensor",
+ "SparseCSRMatrix",
+ "SparseCSCMatrix",
+ "SparseCSFTensor",
+]
diff --git a/python/pyarrow-stubs/pyarrow/tests/util.pyi b/python/pyarrow-stubs/pyarrow/tests/util.pyi
new file mode 100644
index 00000000000..5ceb784588a
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/tests/util.pyi
@@ -0,0 +1,93 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections.abc import Callable
+from contextlib import AbstractContextManager
+from decimal import Decimal
+from os import PathLike
+from typing import Any, Literal
+import socket
+
+import pyarrow.fs
+
+
+def randsign() -> int: ...
+def random_seed(seed: int) -> AbstractContextManager[None]: ...
+def randdecimal(precision: int, scale: int) -> Decimal: ...
+def random_ascii(length: int) -> bytes: ...
+def rands(nchars: int) -> str: ...
+def get_modified_env_with_pythonpath() -> dict[str, str]: ...
+def invoke_script(script_name: str, *args: str) -> None: ...
+def changed_environ(name: str, value: str) -> AbstractContextManager[None]: ...
+def change_cwd(path: str | PathLike[str]) -> AbstractContextManager[None]: ...
+def disabled_gc() -> AbstractContextManager[None]: ...
+def _filesystem_uri(path: str) -> str: ...
+
+
+def memory_leak_check(
+ f: Callable[[], Any],
+ metric: Literal['rss', 'vms', 'shared'] = 'rss',
+ threshold: int = 131072,
+ iterations: int = 10,
+ check_interval: int = 1
+) -> None: ...
+
+
+class FSProtocolClass:
+ def __init__(self, path: str | PathLike[str]) -> None: ...
+ def __fspath__(self) -> str: ...
+
+
+class ProxyHandler(pyarrow.fs.FileSystemHandler):
+ _fs: pyarrow.fs.FileSystem
+ def __init__(self, fs: pyarrow.fs.FileSystem) -> None: ...
+ def __eq__(self, other: object) -> bool: ...
+ def __ne__(self, other: object) -> bool: ...
+ def get_type_name(self) -> str: ...
+ def normalize_path(self, path: str) -> str: ...
+ def get_file_info(self, paths: list[str]) -> list[pyarrow.fs.FileInfo]: ...
+ def get_file_info_selector(
+ self, selector: pyarrow.fs.FileSelector) -> list[pyarrow.fs.FileInfo]: ...
+
+ def create_dir(self, path: str, recursive: bool) -> None: ...
+ def delete_dir(self, path: str) -> None: ...
+ def delete_dir_contents(self, path: str, missing_dir_ok: bool = False) -> None: ...
+ def delete_root_dir_contents(self) -> None: ...
+ def delete_file(self, path: str) -> None: ...
+ def move(self, src: str, dest: str) -> None: ...
+ def copy_file(self, src: str, dest: str) -> None: ...
+ def open_input_stream(self, path: str) -> Any: ...
+ def open_input_file(self, path: str) -> Any: ...
+ def open_output_stream(self, path: str, metadata: dict[str, str]) -> Any: ...
+ def open_append_stream(self, path: str, metadata: dict[str, str]) -> Any: ...
+
+
+def _ensure_minio_component_version(component: str, minimum_year: int) -> bool: ...
+def _run_mc_command(mcdir: str, *args: str) -> None: ...
+def windows_has_tzdata() -> bool: ...
+def running_on_musllinux() -> bool: ...
+
+
+def signal_wakeup_fd(
+ *, warn_on_full_buffer: bool = False) -> AbstractContextManager[socket.socket]: ...
+
+
+def _configure_s3_limited_user(
+ s3_server: dict[str, Any], policy: str, username: str, password: str) -> None: ...
+
+
+def _wait_for_minio_startup(
+ mcdir: str, address: str, access_key: str, secret_key: str) -> None: ...
diff --git a/python/pyarrow-stubs/pyarrow/types.pyi b/python/pyarrow-stubs/pyarrow/types.pyi
new file mode 100644
index 00000000000..9e5a0568db0
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/types.pyi
@@ -0,0 +1,227 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+from enum import IntEnum
+
+from typing import Any
+
+if sys.version_info >= (3, 13):
+ from typing import TypeIs
+else:
+ from typing_extensions import TypeIs
+if sys.version_info >= (3, 10):
+ from typing import TypeAlias
+else:
+ from typing_extensions import TypeAlias
+
+import pyarrow.lib as lib
+
+from pyarrow.lib import (
+ BinaryType,
+ BinaryViewType,
+ BoolType,
+ DataType,
+ Date32Type,
+ Date64Type,
+ Decimal32Type,
+ Decimal64Type,
+ Decimal128Type,
+ Decimal256Type,
+ DenseUnionType,
+ DictionaryType,
+ DurationType,
+ FixedSizeBinaryType,
+ FixedSizeListType,
+ Float16Type,
+ Float32Type,
+ Float64Type,
+ Int8Type,
+ Int16Type,
+ Int32Type,
+ Int64Type,
+ LargeBinaryType,
+ LargeListType,
+ LargeListViewType,
+ LargeStringType,
+ ListType,
+ ListViewType,
+ MapType,
+ MonthDayNanoIntervalType,
+ NullType,
+ RunEndEncodedType,
+ SparseUnionType,
+ StringType,
+ StringViewType,
+ StructType,
+ Time32Type,
+ Time64Type,
+ TimestampType,
+ UInt8Type,
+ UInt16Type,
+ UInt32Type,
+ UInt64Type,
+)
+
+_SignedInteger: TypeAlias = Int8Type | Int16Type | Int32Type | Int64Type
+_UnsignedInteger: TypeAlias = UInt8Type | UInt16Type | UInt32Type | UInt64Type
+_Integer: TypeAlias = _SignedInteger | _UnsignedInteger
+_Floating: TypeAlias = Float16Type | Float32Type | Float64Type
+_Decimal: TypeAlias = (
+ Decimal32Type[Any, Any]
+ | Decimal64Type[Any, Any]
+ | Decimal128Type[Any, Any]
+ | Decimal256Type[Any, Any]
+)
+_Date: TypeAlias = Date32Type | Date64Type
+_Time: TypeAlias = Time32Type[Any] | Time64Type[Any]
+_Interval: TypeAlias = MonthDayNanoIntervalType
+_Temporal: TypeAlias = (TimestampType[Any, Any]
+ | DurationType[Any] | _Time | _Date | _Interval)
+_Union: TypeAlias = SparseUnionType | DenseUnionType
+_Nested: TypeAlias = (
+ ListType[Any]
+ | FixedSizeListType[Any, Any]
+ | LargeListType[Any]
+ | ListViewType[Any]
+ | LargeListViewType[Any]
+ | StructType
+ | MapType[Any, Any, Any]
+ | _Union
+)
+
+
+def is_null(t: DataType) -> TypeIs[NullType]: ...
+def is_boolean(t: DataType) -> TypeIs[BoolType]: ...
+def is_integer(t: DataType) -> TypeIs[_Integer]: ...
+def is_signed_integer(t: DataType) -> TypeIs[_SignedInteger]: ...
+def is_unsigned_integer(t: DataType) -> TypeIs[_UnsignedInteger]: ...
+def is_int8(t: DataType) -> TypeIs[Int8Type]: ...
+def is_int16(t: DataType) -> TypeIs[Int16Type]: ...
+def is_int32(t: DataType) -> TypeIs[Int32Type]: ...
+def is_int64(t: DataType) -> TypeIs[Int64Type]: ...
+def is_uint8(t: DataType) -> TypeIs[UInt8Type]: ...
+def is_uint16(t: DataType) -> TypeIs[UInt16Type]: ...
+def is_uint32(t: DataType) -> TypeIs[UInt32Type]: ...
+def is_uint64(t: DataType) -> TypeIs[UInt64Type]: ...
+def is_floating(t: DataType) -> TypeIs[_Floating]: ...
+def is_float16(t: DataType) -> TypeIs[Float16Type]: ...
+def is_float32(t: DataType) -> TypeIs[Float32Type]: ...
+def is_float64(t: DataType) -> TypeIs[Float64Type]: ...
+def is_list(t: DataType) -> TypeIs[ListType[Any]]: ...
+def is_large_list(t: DataType) -> TypeIs[LargeListType[Any]]: ...
+def is_fixed_size_list(t: DataType) -> TypeIs[FixedSizeListType[Any, Any]]: ...
+def is_list_view(t: DataType) -> TypeIs[ListViewType[Any]]: ...
+def is_large_list_view(t: DataType) -> TypeIs[LargeListViewType[Any]]: ...
+def is_struct(t: DataType) -> TypeIs[StructType]: ...
+def is_union(t: DataType) -> TypeIs[_Union]: ...
+def is_nested(t: DataType) -> TypeIs[_Nested]: ...
+def is_run_end_encoded(t: DataType) -> TypeIs[RunEndEncodedType[Any, Any]]: ...
+def is_temporal(t: DataType) -> TypeIs[_Temporal]: ...
+def is_timestamp(t: DataType) -> TypeIs[TimestampType[Any, Any]]: ...
+def is_duration(t: DataType) -> TypeIs[DurationType[Any]]: ...
+def is_time(t: DataType) -> TypeIs[_Time]: ...
+def is_time32(t: DataType) -> TypeIs[Time32Type[Any]]: ...
+def is_time64(t: DataType) -> TypeIs[Time64Type[Any]]: ...
+def is_binary(t: DataType) -> TypeIs[BinaryType]: ...
+def is_large_binary(t: DataType) -> TypeIs[LargeBinaryType]: ...
+def is_unicode(t: DataType) -> TypeIs[StringType]: ...
+def is_string(t: DataType) -> TypeIs[StringType]: ...
+def is_large_unicode(t: DataType) -> TypeIs[LargeStringType]: ...
+def is_large_string(t: DataType) -> TypeIs[LargeStringType]: ...
+def is_fixed_size_binary(t: DataType) -> TypeIs[FixedSizeBinaryType]: ...
+def is_binary_view(t: DataType) -> TypeIs[BinaryViewType]: ...
+def is_string_view(t: DataType) -> TypeIs[StringViewType]: ...
+def is_date(t: DataType) -> TypeIs[_Date]: ...
+def is_date32(t: DataType) -> TypeIs[Date32Type]: ...
+def is_date64(t: DataType) -> TypeIs[Date64Type]: ...
+def is_map(t: DataType) -> TypeIs[MapType[Any, Any, Any]]: ...
+def is_decimal(t: DataType) -> TypeIs[_Decimal]: ...
+def is_decimal32(t: DataType) -> TypeIs[Decimal32Type[Any, Any]]: ...
+def is_decimal64(t: DataType) -> TypeIs[Decimal64Type[Any, Any]]: ...
+def is_decimal128(t: DataType) -> TypeIs[Decimal128Type[Any, Any]]: ...
+def is_decimal256(t: DataType) -> TypeIs[Decimal256Type[Any, Any]]: ...
+def is_dictionary(t: DataType) -> TypeIs[DictionaryType[Any, Any, Any]]: ...
+def is_interval(t: DataType) -> TypeIs[_Interval]: ...
+def is_primitive(t: DataType) -> bool: ...
+def is_boolean_value(obj: Any) -> bool: ...
+def is_integer_value(obj: Any) -> bool: ...
+def is_float_value(obj: Any) -> bool: ...
+
+
+__all__ = [
+ "lib",
+ "is_binary",
+ "is_binary_view",
+ "is_boolean",
+ "is_date",
+ "is_date32",
+ "is_date64",
+ "is_decimal",
+ "is_decimal128",
+ "is_decimal256",
+ "is_decimal32",
+ "is_decimal64",
+ "is_dictionary",
+ "is_duration",
+ "is_fixed_size_binary",
+ "is_fixed_size_list",
+ "is_float16",
+ "is_float32",
+ "is_float64",
+ "is_floating",
+ "is_int16",
+ "is_int32",
+ "is_int64",
+ "is_int8",
+ "is_integer",
+ "is_interval",
+ "is_large_binary",
+ "is_large_list",
+ "is_large_list_view",
+ "is_large_string",
+ "is_large_unicode",
+ "is_list",
+ "is_list_view",
+ "is_map",
+ "is_nested",
+ "is_null",
+ "is_primitive",
+ "is_run_end_encoded",
+ "is_signed_integer",
+ "is_string",
+ "is_string_view",
+ "is_struct",
+ "is_temporal",
+ "is_time",
+ "is_time32",
+ "is_time64",
+ "is_timestamp",
+ "is_uint16",
+ "is_uint32",
+ "is_uint64",
+ "is_uint8",
+ "is_unicode",
+ "is_union",
+ "is_unsigned_integer",
+]
+
+
+class TypesEnum(IntEnum):
+ INTERVAL_MONTHS = 0
+ INTERVAL_DAY_TIME = 1
+ INTERVAL_MONTH_DAY_NANO = 2
diff --git a/python/pyarrow-stubs/pyarrow/util.pyi b/python/pyarrow-stubs/pyarrow/util.pyi
new file mode 100644
index 00000000000..c3317960c81
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/util.pyi
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from collections.abc import Callable, Sequence
+from os import PathLike
+from typing import Any, Protocol, TypeVar
+
+_F = TypeVar("_F", bound=Callable)
+_N = TypeVar("_N")
+
+
+class _DocStringComponents(Protocol):
+ _docstring_components: list[str]
+
+
+def doc(
+ *docstrings: str | _DocStringComponents | Callable | None, **params: Any
+) -> Callable[[_F], _F]: ...
+def _is_iterable(obj) -> bool: ...
+def _is_path_like(path) -> bool: ...
+def _stringify_path(path: str | PathLike) -> str: ...
+def product(seq: Sequence[_N]) -> _N: ...
+
+
+def get_contiguous_span(
+ shape: tuple[int, ...], strides: tuple[int, ...], itemsize: int
+) -> tuple[int, int]: ...
+def find_free_port() -> int: ...
+def guid() -> str: ...
+def _download_urllib(url, out_path) -> None: ...
+def _download_requests(url, out_path) -> None: ...
+def download_tzdata_on_windows() -> None: ...
+def _deprecate_api(old_name, new_name, api, next_version, type=...): ...
+def _deprecate_class(old_name, new_class, next_version, instancecheck=True): ...
+def _break_traceback_cycle_from_frame(frame) -> None: ...
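
The Callable[[_F], _F] shape used for doc() above is the usual way to type a decorator factory so the wrapped function keeps its exact signature for checkers. A small illustrative sketch of the same pattern (not pyarrow code):

from collections.abc import Callable
from typing import Any, TypeVar

_F = TypeVar("_F", bound=Callable[..., Any])


def doc_like(**params: str) -> Callable[[_F], _F]:
    """Substitute placeholders in the decorated function's docstring."""
    def decorator(func: _F) -> _F:
        # Returning the same _F preserves the original signature for type checkers.
        func.__doc__ = (func.__doc__ or "").format(**params)
        return func
    return decorator


@doc_like(kind="Parquet file")
def read(path: str) -> str:
    """Read a {kind} from the given path."""
    return path
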
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index da2fe966475..d6836c14bd6 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -29,19 +29,17 @@
For more information see the official page at https://arrow.apache.org
"""
-import gc as _gc
import importlib as _importlib
import os as _os
import platform as _platform
import sys as _sys
-import warnings as _warnings
try:
- from ._generated_version import version as __version__
+ from ._generated_version import version as __version__ # type: ignore[import-untyped, import-not-found] # noqa: E501
except ImportError:
# Package is not installed, parse git tag at runtime
try:
- import setuptools_scm
+ import setuptools_scm # type: ignore[import-not-found, import-untyped]
# Code duplicated from setup.py to avoid a dependency on each other
def parse_git(root, **kwargs):
@@ -49,14 +47,14 @@ def parse_git(root, **kwargs):
Parse function for setuptools_scm that ignores tags for non-C++
subprojects, e.g. apache-arrow-js-XXX tags.
"""
- from setuptools_scm.git import parse
+ from setuptools_scm.git import parse # type: ignore[import-not-found, import-untyped] # noqa: E501
kwargs['describe_command'] = \
"git describe --dirty --tags --long --match 'apache-arrow-[0-9]*.*'"
return parse(root, **kwargs)
__version__ = setuptools_scm.get_version('../',
parse=parse_git)
except ImportError:
- __version__ = None
+ __version__ = None # type: ignore[assignment]
import pyarrow.lib as _lib
from pyarrow.lib import (BuildInfo, CppBuildInfo, RuntimeInfo, set_timezone_db_path,
@@ -153,6 +151,8 @@ def print_entry(label, value):
print(f" {codec: <20}: {status: <8}")
+from pyarrow.lib import (
+ DataType, Array, MemoryPool) # type: ignore[reportAttributeAccessIssue]
from pyarrow.lib import (null, bool_,
int8, int16, int32, int64,
uint8, uint16, uint32, uint64,
@@ -170,7 +170,7 @@ def print_entry(label, value):
bool8, fixed_shape_tensor, json_, opaque, uuid,
field,
type_for_alias,
- DataType, DictionaryType, StructType,
+ DictionaryType, StructType,
ListType, LargeListType, FixedSizeListType,
ListViewType, LargeListViewType,
MapType, UnionType, SparseUnionType, DenseUnionType,
@@ -187,8 +187,7 @@ def print_entry(label, value):
Field,
Schema,
schema,
- unify_schemas,
- Array, Tensor,
+ unify_schemas, Tensor,
array, chunked_array, record_batch, nulls, repeat,
SparseCOOTensor, SparseCSRMatrix, SparseCSCMatrix,
SparseCSFTensor,
@@ -243,7 +242,7 @@ def print_entry(label, value):
from pyarrow.lib import (Buffer, ResizableBuffer, foreign_buffer, py_buffer,
Codec, compress, decompress, allocate_buffer)
-from pyarrow.lib import (MemoryPool, LoggingMemoryPool, ProxyMemoryPool,
+from pyarrow.lib import (LoggingMemoryPool, ProxyMemoryPool,
total_allocated_bytes, set_memory_pool,
default_memory_pool, system_memory_pool,
jemalloc_memory_pool, mimalloc_memory_pool,
@@ -365,7 +364,7 @@ def create_library_symlinks():
if _sys.platform == 'linux':
bundled_libs = glob.glob(_os.path.join(package_cwd, '*.so.*'))
- def get_symlink_path(hard_path):
+ def get_symlink_path(hard_path): # type: ignore[reportRedeclaration]
return hard_path.rsplit('.', 1)[0]
else:
bundled_libs = glob.glob(_os.path.join(package_cwd, '*.*.dylib'))
diff --git a/python/pyarrow/acero.py b/python/pyarrow/acero.py
index e475e8db5c2..cd99a1bbc53 100644
--- a/python/pyarrow/acero.py
+++ b/python/pyarrow/acero.py
@@ -22,7 +22,7 @@
# distutils: language = c++
# cython: language_level = 3
-from pyarrow.lib import Table, RecordBatch, array
+from pyarrow.lib import Table, RecordBatch, array, Schema
from pyarrow.compute import Expression, field
try:
@@ -49,11 +49,14 @@
except ImportError:
class DatasetModuleStub:
class Dataset:
- pass
+ @property
+ def schema(self):
+ return Schema()
class InMemoryDataset:
- pass
- ds = DatasetModuleStub
+ def __init__(self, source):
+ pass
+ ds = DatasetModuleStub # type: ignore[assignment]
def _dataset_to_decl(dataset, use_threads=True, implicit_ordering=False):
@@ -306,7 +309,7 @@ def _perform_join_asof(left_operand, left_on, left_by,
# AsofJoin does not return on or by columns for right_operand.
right_columns = [
col for col in right_operand.schema.names
- if col not in [right_on] + right_by
+ if col not in [right_on] + right_by # type: ignore[reportOperatorIssue]
]
columns_collisions = set(left_operand.schema.names) & set(right_columns)
if columns_collisions:
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 575b628db3a..915a715f8ec 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -3648,7 +3648,7 @@ cdef class FixedSizeListArray(BaseListArray):
Or create from a values array, list size and matching type:
>>> typ = pa.list_(pa.field("values", pa.int64()), 2)
- >>> arr = pa.FixedSizeListArray.from_arrays(values,type=typ)
+ >>> arr = pa.FixedSizeListArray.from_arrays(values, type=typ)
>>> arr
[
diff --git a/python/pyarrow/benchmark.py b/python/pyarrow/benchmark.py
index 25ee1141f08..0ee9063a9a7 100644
--- a/python/pyarrow/benchmark.py
+++ b/python/pyarrow/benchmark.py
@@ -18,4 +18,4 @@
# flake8: noqa
-from pyarrow.lib import benchmark_PandasObjectIsNull
+from pyarrow.lib import benchmark_PandasObjectIsNull # type: ignore[attr-defined]
diff --git a/python/pyarrow/cffi.py b/python/pyarrow/cffi.py
index 1da1a916914..e5a1c9c1d07 100644
--- a/python/pyarrow/cffi.py
+++ b/python/pyarrow/cffi.py
@@ -16,8 +16,15 @@
# under the License.
from __future__ import absolute_import
+from typing import TYPE_CHECKING
-import cffi
+if TYPE_CHECKING:
+ import cffi
+else:
+ try:
+ import cffi
+ except ImportError:
+ pass
c_source = """
struct ArrowSchema {
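
The TYPE_CHECKING split above is the recurring pattern in this patch for optional dependencies: type checkers always see the real module, while a missing wheel at runtime is tolerated. A self-contained sketch of how an annotation can still refer to the optional module (cffi is only needed when the function is actually called):

from __future__ import annotations  # keep annotations lazy at runtime

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    import cffi  # always resolvable for mypy/pyright


def make_ffi() -> cffi.FFI:
    # Local import: the dependency is only required on first call.
    import cffi
    return cffi.FFI()
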
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index fe0afdb0a87..259dd5eb94d 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -106,7 +106,7 @@
import warnings
import pyarrow as pa
-from pyarrow import _compute_docstrings
+from pyarrow import _compute_docstrings # type: ignore[reportAttributeAccessIssue]
from pyarrow.vendored import docscrape
@@ -241,7 +241,7 @@ def _handle_options(name, options_class, options, args, kwargs):
def _make_generic_wrapper(func_name, func, options_class, arity):
if options_class is None:
- def wrapper(*args, memory_pool=None):
+ def wrapper(*args, memory_pool=None): # type: ignore[misc]
if arity is not Ellipsis and len(args) != arity:
raise TypeError(
f"{func_name} takes {arity} positional argument(s), "
@@ -251,7 +251,8 @@ def wrapper(*args, memory_pool=None):
return Expression._call(func_name, list(args))
return func.call(args, None, memory_pool)
else:
- def wrapper(*args, memory_pool=None, options=None, **kwargs):
+ def wrapper( # type: ignore[misc]
+ *args, memory_pool=None, options=None, **kwargs):
if arity is not Ellipsis:
if len(args) < arity:
raise TypeError(
@@ -608,7 +609,7 @@ def top_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
sort_keys.append(("dummy", "descending"))
else:
sort_keys = map(lambda key_name: (key_name, "descending"), sort_keys)
- options = SelectKOptions(k, sort_keys)
+ options = SelectKOptions(k, sort_keys) # type: ignore[reportArgumentType]
return call_function("select_k_unstable", [values], options, memory_pool)
@@ -655,7 +656,7 @@ def bottom_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
sort_keys.append(("dummy", "ascending"))
else:
sort_keys = map(lambda key_name: (key_name, "ascending"), sort_keys)
- options = SelectKOptions(k, sort_keys)
+ options = SelectKOptions(k, sort_keys) # type: ignore[reportArgumentType]
return call_function("select_k_unstable", [values], options, memory_pool)
@@ -681,7 +682,8 @@ def random(n, *, initializer='system', options=None, memory_pool=None):
memory_pool : pyarrow.MemoryPool, optional
If not passed, will allocate memory from the default memory pool.
"""
- options = RandomOptions(initializer=initializer)
+ options = RandomOptions(
+ initializer=initializer) # type: ignore[reportArgumentType]
return call_function("random", [], options, memory_pool, length=n)
@@ -723,7 +725,7 @@ def field(*name_or_index):
if isinstance(name_or_index[0], (str, int)):
return Expression._field(name_or_index[0])
elif isinstance(name_or_index[0], tuple):
- return Expression._nested_field(name_or_index[0])
+ return Expression._nested_field(name_or_index[0]) # type: ignore
else:
raise TypeError(
"field reference should be str, multiple str, tuple or "
@@ -731,7 +733,7 @@ def field(*name_or_index):
)
# In case of multiple strings not supplied in a tuple
else:
- return Expression._nested_field(name_or_index)
+ return Expression._nested_field(name_or_index) # type: ignore
def scalar(value):
diff --git a/python/pyarrow/conftest.py b/python/pyarrow/conftest.py
index 41beaa14041..0e8ef66485e 100644
--- a/python/pyarrow/conftest.py
+++ b/python/pyarrow/conftest.py
@@ -114,13 +114,13 @@
defaults['timezone_data'] = os.path.exists("/usr/share/zoneinfo")
try:
- import cython # noqa
+ import cython # type: ignore[import-untyped, import-not-found] # noqa
defaults['cython'] = True
except ImportError:
pass
try:
- import fastparquet # noqa
+ import fastparquet # type: ignore[import-untyped, import-not-found] # noqa
defaults['fastparquet'] = True
except ImportError:
pass
@@ -347,7 +347,7 @@ def func(ctx, x):
pc.register_aggregate_function(func,
func_name,
- func_doc,
+ func_doc, # type: ignore
{
"x": pa.float64(),
},
diff --git a/python/pyarrow/cuda.py b/python/pyarrow/cuda.py
index 18c530d4afe..eeb637f0ab4 100644
--- a/python/pyarrow/cuda.py
+++ b/python/pyarrow/cuda.py
@@ -18,7 +18,7 @@
# flake8: noqa
-from pyarrow._cuda import (Context, IpcMemHandle, CudaBuffer,
+from pyarrow._cuda import (Context, IpcMemHandle, CudaBuffer, # type: ignore[reportMissingModuleSource]
HostBuffer, BufferReader, BufferWriter,
new_host_buffer,
serialize_record_batch, read_message,
diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py
index 039da8c0d56..967c4b475dd 100644
--- a/python/pyarrow/dataset.py
+++ b/python/pyarrow/dataset.py
@@ -54,6 +54,9 @@
get_partition_keys as _get_partition_keys, # keep for backwards compatibility
_filesystemdataset_write,
)
+ from pyarrow.fs import FileInfo
+
+
except ImportError as exc:
raise ImportError(
f"The pyarrow installation is not built with support for 'dataset' ({str(exc)})"
@@ -70,7 +73,8 @@
)
try:
- from pyarrow._dataset_orc import OrcFileFormat
+ from pyarrow._dataset_orc import ( # type: ignore[import-not-found]
+ OrcFileFormat)
_orc_available = True
except ImportError:
pass
@@ -371,6 +375,7 @@ def _ensure_multiple_sources(paths, filesystem=None):
# possible improvement is to group the file_infos by type and raise for
# multiple paths per error category
if is_local:
+ # type: ignore[reportGeneralTypeIssues]
for info in filesystem.get_file_info(paths):
file_type = info.type
if file_type == FileType.File:
@@ -422,16 +427,18 @@ def _ensure_single_source(path, filesystem=None):
filesystem, path = _resolve_filesystem_and_path(path, filesystem)
# ensure that the path is normalized before passing to dataset discovery
+ assert isinstance(path, str)
path = filesystem.normalize_path(path)
# retrieve the file descriptor
file_info = filesystem.get_file_info(path)
+ assert isinstance(file_info, FileInfo)
# depending on the path type either return with a recursive
# directory selector or as a list containing a single file
- if file_info.type == FileType.Directory:
+ if file_info.type == FileType.Directory: # type: ignore[reportAttributeAccessIssue]
paths_or_selector = FileSelector(path, recursive=True)
- elif file_info.type == FileType.File:
+ elif file_info.type == FileType.File: # type: ignore[reportAttributeAccessIssue]
paths_or_selector = [path]
else:
raise FileNotFoundError(path)
@@ -1035,6 +1042,7 @@ def file_visitor(written_file):
_filesystemdataset_write(
scanner, base_dir, basename_template, filesystem, partitioning,
preserve_order, file_options, max_partitions, file_visitor,
- existing_data_behavior, max_open_files, max_rows_per_file,
- min_rows_per_group, max_rows_per_group, create_dir
+ existing_data_behavior, # type: ignore[reportArgumentType]
+ max_open_files, max_rows_per_file, min_rows_per_group,
+ max_rows_per_group, create_dir
)
diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
index 241c27706a6..4b0ecb9f18e 100644
--- a/python/pyarrow/feather.py
+++ b/python/pyarrow/feather.py
@@ -183,6 +183,7 @@ def write_feather(df, dest, compression=None, compression_level=None,
f'one of {_FEATHER_SUPPORTED_CODECS}')
try:
+ assert version in (1, 2)
_feather.write_feather(table, dest, compression=compression,
compression_level=compression_level,
chunksize=chunksize, version=version)
@@ -269,7 +270,7 @@ def read_table(source, columns=None, memory_map=False, use_threads=True):
f"Got columns {columns} of types {column_type_names}")
# Feather v1 already respects the column selection
- if reader.version < 3:
+ if int(reader.version) < 3:
return table
# Feather v2 reads with sorted / deduplicated selection
elif sorted(set(columns)) == columns:
diff --git a/python/pyarrow/flight.py b/python/pyarrow/flight.py
index b1836907c67..ba5008c9ecf 100644
--- a/python/pyarrow/flight.py
+++ b/python/pyarrow/flight.py
@@ -16,7 +16,7 @@
# under the License.
try:
- from pyarrow._flight import ( # noqa:F401
+ from pyarrow._flight import ( # noqa:F401 # type: ignore[import-not-found]
connect,
Action,
ActionType,
diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py
index 670ccaaf245..e1aa9090d2d 100644
--- a/python/pyarrow/fs.py
+++ b/python/pyarrow/fs.py
@@ -40,7 +40,7 @@
_not_imported = []
try:
- from pyarrow._azurefs import AzureFileSystem # noqa
+ from pyarrow._azurefs import AzureFileSystem # noqa # type: ignore[reportMissingModuleSource]
except ImportError:
_not_imported.append("AzureFileSystem")
@@ -50,12 +50,12 @@
_not_imported.append("HadoopFileSystem")
try:
- from pyarrow._gcsfs import GcsFileSystem # noqa
+ from pyarrow._gcsfs import GcsFileSystem # noqa # type: ignore[reportMissingModuleSource]
except ImportError:
_not_imported.append("GcsFileSystem")
try:
- from pyarrow._s3fs import ( # noqa
+ from pyarrow._s3fs import ( # noqa # type: ignore[reportMissingModuleSource]
AwsDefaultS3RetryStrategy, AwsStandardS3RetryStrategy,
S3FileSystem, S3LogLevel, S3RetryStrategy, ensure_s3_initialized,
finalize_s3, ensure_s3_finalized, initialize_s3, resolve_s3_region)
@@ -111,7 +111,7 @@ def _ensure_filesystem(filesystem, *, use_mmap=False):
else:
# handle fsspec-compatible filesystems
try:
- import fsspec
+ import fsspec # type: ignore[import-untyped]
except ImportError:
pass
else:
@@ -165,6 +165,7 @@ def _resolve_filesystem_and_path(path, filesystem=None, *, memory_map=False):
file_info = None
exists_locally = False
else:
+ assert isinstance(file_info, FileInfo)
exists_locally = (file_info.type != FileType.NotFound)
# if the file or directory doesn't exists locally, then assume that
@@ -250,7 +251,9 @@ def copy_files(source, destination,
destination, destination_filesystem
)
+ assert isinstance(source_fs, FileSystem)
file_info = source_fs.get_file_info(source_path)
+ assert isinstance(file_info, FileInfo)
if file_info.type == FileType.Directory:
source_sel = FileSelector(source_path, recursive=True)
_copy_files_selector(source_fs, source_sel,
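
The isinstance asserts added here (and in dataset.py and parquet/core.py below) narrow get_file_info's FileInfo-or-list return before attribute access, since the stubs type it as a union that depends on the argument. A minimal sketch of the same idiom (the path is illustrative):

from typing import Optional

from pyarrow import fs


def file_size(path: str) -> Optional[int]:
    local = fs.LocalFileSystem()
    info = local.get_file_info(path)      # checker sees FileInfo | list[FileInfo]
    assert isinstance(info, fs.FileInfo)  # narrows the union to a single FileInfo
    return info.size                      # None for directories or missing paths
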
diff --git a/python/pyarrow/orc.py b/python/pyarrow/orc.py
index 4e0d66ec665..222c289c879 100644
--- a/python/pyarrow/orc.py
+++ b/python/pyarrow/orc.py
@@ -20,7 +20,7 @@
import warnings
from pyarrow.lib import Table
-import pyarrow._orc as _orc
+import pyarrow._orc as _orc # type: ignore[reportMissingModuleSource]
from pyarrow.fs import _resolve_filesystem_and_path
@@ -255,9 +255,11 @@ def __init__(self, where, *,
file_version=file_version,
batch_size=batch_size,
stripe_size=stripe_size,
- compression=compression,
+ compression=compression, # type: ignore[reportArgumentType]
compression_block_size=compression_block_size,
- compression_strategy=compression_strategy,
+ compression_strategy=(
+ compression_strategy # type: ignore[reportArgumentType]
+ ),
row_index_stride=row_index_stride,
padding_tolerance=padding_tolerance,
dictionary_key_size_threshold=dictionary_key_size_threshold,
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index dfed76d3711..e1fb05d1317 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -33,18 +33,18 @@
try:
import numpy as np
except ImportError:
- np = None
+ pass
import pyarrow as pa
from pyarrow.lib import _pandas_api, frombytes, is_threading_enabled # noqa
-_logical_type_map = {}
-_numpy_logical_type_map = {}
-_pandas_logical_type_map = {}
+_logical_type_map: dict[int, str] = {}
+_numpy_logical_type_map: dict[int, str] = {}
+_pandas_logical_type_map: dict[int, str] = {}
def get_logical_type_map():
- global _logical_type_map
+ global _logical_type_map # noqa: F824
if not _logical_type_map:
_logical_type_map.update({
@@ -90,9 +90,9 @@ def get_logical_type(arrow_type):
def get_numpy_logical_type_map():
- global _numpy_logical_type_map
+ global _numpy_logical_type_map # noqa: F824
if not _numpy_logical_type_map:
- _numpy_logical_type_map.update({
+ _numpy_logical_type_map.update({ # type: ignore[reportCallIssue]
np.bool_: 'bool',
np.int8: 'int8',
np.int16: 'int16',
@@ -704,7 +704,7 @@ def get_datetimetz_type(values, dtype, type_):
# If no user type passed, construct a tz-aware timestamp type
tz = dtype.tz
unit = dtype.unit
- type_ = pa.timestamp(unit, tz)
+ type_ = pa.timestamp(unit, tz) # type: ignore[reportArgumentType]
elif type_ is None:
# Trust the NumPy dtype
type_ = pa.from_numpy_dtype(values.dtype)
@@ -743,7 +743,7 @@ def _reconstruct_block(item, columns=None, extension_columns=None, return_block=
pandas Block
"""
- import pandas.core.internals as _int
+ import pandas.core.internals as _int # type: ignore[import-not-found]
block_arr = item.get('block', None)
placement = item['placement']
@@ -769,6 +769,8 @@ def _reconstruct_block(item, columns=None, extension_columns=None, return_block=
# create ExtensionBlock
arr = item['py_array']
assert len(placement) == 1
+ assert isinstance(columns, list)
+ assert isinstance(extension_columns, dict)
name = columns[placement[0]]
pandas_dtype = extension_columns[name]
if not hasattr(pandas_dtype, '__from_arrow__'):
@@ -788,7 +790,7 @@ def make_datetimetz(unit, tz):
if _pandas_api.is_v1():
unit = 'ns' # ARROW-3789: Coerce date/timestamp types to datetime64[ns]
tz = pa.lib.string_to_tzinfo(tz)
- return _pandas_api.datetimetz_type(unit, tz=tz)
+ return _pandas_api.datetimetz_type(unit, tz=tz) # type: ignore[reportArgumentType]
def table_to_dataframe(
@@ -822,7 +824,8 @@ def table_to_dataframe(
result = pa.lib.table_to_blocks(options, table, categories,
list(ext_columns_dtypes.keys()))
if _pandas_api.is_ge_v3():
- from pandas.api.internals import create_dataframe_from_blocks
+ from pandas.api.internals import ( # type: ignore[import-not-found]
+ create_dataframe_from_blocks)
blocks = [
_reconstruct_block(
@@ -834,7 +837,8 @@ def table_to_dataframe(
return df
else:
- from pandas.core.internals import BlockManager
+ from pandas.core.internals import ( # type: ignore[reportMissingImports]
+ BlockManager)
from pandas import DataFrame
blocks = [
@@ -844,7 +848,8 @@ def table_to_dataframe(
axes = [columns, index]
mgr = BlockManager(blocks, axes)
if _pandas_api.is_ge_v21():
- df = DataFrame._from_mgr(mgr, mgr.axes)
+ df = DataFrame._from_mgr( # type: ignore[reportAttributeAccessIssue]
+ mgr, mgr.axes)
else:
df = DataFrame(mgr)
@@ -1092,10 +1097,10 @@ def _is_generated_index_name(name):
def get_pandas_logical_type_map():
- global _pandas_logical_type_map
+ global _pandas_logical_type_map # noqa: F824
if not _pandas_logical_type_map:
- _pandas_logical_type_map.update({
+ _pandas_logical_type_map.update({ # type: ignore[reportCallIssue]
'date': 'datetime64[D]',
'datetime': 'datetime64[ns]',
'datetimetz': 'datetime64[ns]',
@@ -1162,12 +1167,14 @@ def _reconstruct_columns_from_metadata(columns, column_indexes):
labels = getattr(columns, 'codes', None) or [None]
# Convert each level to the dtype provided in the metadata
- levels_dtypes = [
- (level, col_index.get('pandas_type', str(level.dtype)),
- col_index.get('numpy_type', None))
+ levels_dtypes = [(level, col_index.get(
+ 'pandas_type',
+ str(level.dtype) # type: ignore[reportAttributeAccessIssue]
+ ),
+ col_index.get('numpy_type', None))
for level, col_index in zip_longest(
levels, column_indexes, fillvalue={}
- )
+ )
]
new_levels = []
@@ -1179,7 +1186,7 @@ def _reconstruct_columns_from_metadata(columns, column_indexes):
# bytes into unicode strings when json.loads-ing them. We need to
# convert them back to bytes to preserve metadata.
if dtype == np.bytes_:
- level = level.map(encoder)
+ level = level.map(encoder) # type: ignore[reportAttributeAccessIssue]
# ARROW-13756: if index is timezone aware DataTimeIndex
elif pandas_dtype == "datetimetz":
tz = pa.lib.string_to_tzinfo(
@@ -1188,12 +1195,14 @@ def _reconstruct_columns_from_metadata(columns, column_indexes):
if _pandas_api.is_ge_v3():
# with pandas 3+, to_datetime returns a unit depending on the string
# data, so we restore it to the original unit from the metadata
- level = level.as_unit(np.datetime_data(dtype)[0])
+ level = level.as_unit(np.datetime_data(
+ dtype)[0]) # type: ignore[reportArgumentType]
# GH-41503: if the column index was decimal, restore to decimal
elif pandas_dtype == "decimal":
level = _pandas_api.pd.Index([decimal.Decimal(i) for i in level])
elif (
- level.dtype == "str" and numpy_dtype == "object"
+ level.dtype == "str" # type: ignore[reportAttributeAccessIssue]
+ and numpy_dtype == "object"
and ("mixed" in pandas_dtype or pandas_dtype in ["unicode", "string"])
):
# the metadata indicate that the original dataframe used object dtype,
@@ -1206,11 +1215,12 @@ def _reconstruct_columns_from_metadata(columns, column_indexes):
# for pandas >= 3 we want to use the default string dtype for .columns
new_levels.append(level)
continue
- elif level.dtype != dtype:
- level = level.astype(dtype)
+ elif level.dtype != dtype: # type: ignore[reportAttributeAccessIssue]
+ level = level.astype(dtype) # type: ignore[reportAttributeAccessIssue]
# ARROW-9096: if original DataFrame was upcast we keep that
if level.dtype != numpy_dtype and pandas_dtype != "datetimetz":
- level = level.astype(numpy_dtype)
+ level = level.astype( # type: ignore[reportAttributeAccessIssue]
+ numpy_dtype)
new_levels.append(level)
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index 676bc445238..a9e7a1984ae 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -45,7 +45,7 @@
FileDecryptionProperties,
SortingColumn)
from pyarrow.fs import (LocalFileSystem, FileType, _resolve_filesystem_and_path,
- _ensure_filesystem)
+ _ensure_filesystem, FileInfo)
from pyarrow.util import guid, _is_path_like, _stringify_path, _deprecate_api
@@ -1413,12 +1413,15 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None,
path_or_paths, filesystem, memory_map=memory_map
)
finfo = filesystem.get_file_info(path_or_paths)
+ assert isinstance(finfo, FileInfo)
if finfo.type == FileType.Directory:
self._base_dir = path_or_paths
else:
single_file = path_or_paths
- parquet_format = ds.ParquetFileFormat(**read_options)
+ parquet_format = ds.ParquetFileFormat(
+ **read_options # type: ignore[invalid-argument-type]
+ )
if single_file is not None:
fragment = parquet_format.make_fragment(single_file, filesystem)
@@ -1573,6 +1576,7 @@ def _get_common_pandas_metadata(self):
for name in ["_common_metadata", "_metadata"]:
metadata_path = os.path.join(str(self._base_dir), name)
finfo = self.filesystem.get_file_info(metadata_path)
+ assert isinstance(finfo, FileInfo)
if finfo.is_file:
pq_meta = read_metadata(
metadata_path, filesystem=self.filesystem)
@@ -1671,6 +1675,7 @@ def files(self):
>>> dataset.files
['dataset_v2_files/year=2019/...-0.parquet', ...
"""
+ assert isinstance(self._dataset, pa.dataset.FileSystemDataset)
return self._dataset.files
@property
@@ -1678,6 +1683,7 @@ def filesystem(self):
"""
The filesystem type of the Dataset source.
"""
+ assert isinstance(self._dataset, pa.dataset.FileSystemDataset)
return self._dataset.filesystem
@property
@@ -1685,6 +1691,7 @@ def partitioning(self):
"""
The partitioning of the Dataset source, if discovered.
"""
+ assert isinstance(self._dataset, pa.dataset.FileSystemDataset)
return self._dataset.partitioning
@@ -1901,14 +1908,16 @@ def read_table(source, *, columns=None, use_threads=True,
filesystem, path = _resolve_filesystem_and_path(source, filesystem)
if filesystem is not None:
- if not filesystem.get_file_info(path).is_file:
+ file_info = filesystem.get_file_info(path)
+ assert isinstance(file_info, FileInfo)
+ if not file_info.is_file:
raise ValueError(
"the 'source' argument should be "
"an existing parquet file and not a directory "
"when the pyarrow.dataset module is not available"
)
- source = filesystem.open_input_file(path)
+ source = filesystem.open_input_file(path) # type: ignore
dataset = ParquetFile(
source, read_dictionary=read_dictionary,
@@ -2081,7 +2090,8 @@ def write_table(table, where, row_group_size=None, version='2.6',
def write_to_dataset(table, root_path, partition_cols=None,
filesystem=None, schema=None, partitioning=None,
basename_template=None, use_threads=None,
- file_visitor=None, existing_data_behavior=None,
+ file_visitor=None, # type: ignore[reportRedeclaration]
+ existing_data_behavior=None,
**kwargs):
"""Wrapper around dataset.write_dataset for writing a Table to
Parquet format by partitions.
@@ -2310,7 +2320,7 @@ def write_metadata(schema, where, metadata_collector=None, filesystem=None,
filesystem, where = _resolve_filesystem_and_path(where, filesystem)
if hasattr(where, "seek"): # file-like
- cursor_position = where.tell()
+ cursor_position = where.tell() # type: ignore[reportAttributeAccessIssue]
writer = ParquetWriter(where, schema, filesystem, **kwargs)
writer.close()
@@ -2319,8 +2329,8 @@ def write_metadata(schema, where, metadata_collector=None, filesystem=None,
# ParquetWriter doesn't expose the metadata until it's written. Write
# it and read it again.
metadata = read_metadata(where, filesystem=filesystem)
- if hasattr(where, "seek"):
- where.seek(cursor_position) # file-like, set cursor back.
+ if hasattr(where, "seek"): # file-like, set cursor back.
+ where.seek(cursor_position) # type: ignore[reportAttributeAccessIssue]
for m in metadata_collector:
metadata.append_row_groups(m)
diff --git a/python/pyarrow/parquet/encryption.py b/python/pyarrow/parquet/encryption.py
index df6eed913fa..1c6835d6acf 100644
--- a/python/pyarrow/parquet/encryption.py
+++ b/python/pyarrow/parquet/encryption.py
@@ -20,4 +20,5 @@
EncryptionConfiguration,
DecryptionConfiguration,
KmsConnectionConfig,
- KmsClient)
+ KmsClient,
+ FileSystemKeyMaterialStore)
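
Re-exporting FileSystemKeyMaterialStore makes external key material reachable through the public pyarrow.parquet.encryption namespace instead of pa._parquet_encryption, which is what the encryption test later in this patch switches to. A hedged usage sketch; the file name is illustrative and must refer to a Parquet file written with external key material:

import pyarrow.parquet.encryption as pe

# Assumes "encrypted.parquet" was written with internal_key_material=False.
store = pe.FileSystemKeyMaterialStore.for_file("encrypted.parquet")
print(store.get_key_id_set())
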
diff --git a/python/pyarrow/py.typed b/python/pyarrow/py.typed
new file mode 100644
index 00000000000..13a83393a91
--- /dev/null
+++ b/python/pyarrow/py.typed
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
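
py.typed is the PEP 561 marker: once it ships inside the installed pyarrow package, mypy and pyright consume the package's stubs and inline annotations instead of treating pyarrow as untyped. A quick way to observe the effect (illustrative script; typing.reveal_type needs Python 3.11+):

# check_typed.py -- run `mypy check_typed.py` or `pyright check_typed.py`
from typing import reveal_type

import pyarrow as pa

tbl = pa.table({"x": [1, 2, 3]})
reveal_type(tbl)  # reported as pyarrow.Table once py.typed and the stubs are present
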
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 83cabcf447d..16fed344e4d 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -1036,7 +1036,7 @@ cdef class StructScalar(Scalar, Mapping):
Parameters
----------
- index : Union[int, str]
+ key : Union[int, str]
Index / position or name of the field.
Returns
diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index 575444c1cfc..3f227d3101c 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -64,7 +64,8 @@
if os.environ.get('TZDIR', None) is None:
from importlib import resources
try:
- os.environ['TZDIR'] = os.path.join(resources.files('tzdata'), 'zoneinfo')
+ tzdata_path = resources.files('tzdata')
+ os.environ['TZDIR'] = os.path.join(str(tzdata_path), 'zoneinfo')
except ModuleNotFoundError:
print(
'Package "tzdata" not found. Not setting TZDIR environment variable.'
@@ -191,6 +192,7 @@ def decorate(func):
def wrapper(*args, **kwargs):
remaining_attempts = attempts
curr_delay = delay
+ last_exception = None
while remaining_attempts > 0:
try:
return func(*args, **kwargs)
@@ -201,6 +203,9 @@ def wrapper(*args, **kwargs):
if max_delay:
curr_delay = min(curr_delay, max_delay)
time.sleep(curr_delay)
+ # At this point, we've exhausted all attempts and last_exception must be set
+ # (since we must have caught at least one exception to exit the loop)
+ assert last_exception is not None, "No attempts were made"
raise last_exception
return wrapper
return decorate
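
Initialising last_exception to None and asserting before the re-raise makes the retry helper's control flow explicit for type checkers: the name is always bound, and it can only still be None if no attempt ran at all. A condensed, self-contained sketch of that shape (not the conftest code itself):

import time


def retry(attempts: int = 3, delay: float = 0.1):
    def decorate(func):
        def wrapper(*args, **kwargs):
            last_exception = None
            remaining = attempts
            while remaining > 0:
                try:
                    return func(*args, **kwargs)
                except Exception as err:
                    last_exception = err
                    remaining -= 1
                    time.sleep(delay)
            # Reachable only after every attempt failed (or attempts <= 0).
            assert last_exception is not None, "No attempts were made"
            raise last_exception
        return wrapper
    return decorate
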
diff --git a/python/pyarrow/tests/interchange/test_conversion.py b/python/pyarrow/tests/interchange/test_conversion.py
index 50da6693aff..62da25f0af3 100644
--- a/python/pyarrow/tests/interchange/test_conversion.py
+++ b/python/pyarrow/tests/interchange/test_conversion.py
@@ -23,7 +23,7 @@
try:
import numpy as np
except ImportError:
- np = None
+ pass
import pyarrow.interchange as pi
from pyarrow.interchange.column import (
@@ -163,8 +163,8 @@ def test_pandas_roundtrip_string():
result = pi.from_dataframe(pandas_df)
assert result["a"].to_pylist() == table["a"].to_pylist()
- assert pa.types.is_string(table["a"].type)
- assert pa.types.is_large_string(result["a"].type)
+ assert pa.types.is_string(table.column("a").type)
+ assert pa.types.is_large_string(result.column("a").type)
table_protocol = table.__dataframe__()
result_protocol = result.__dataframe__()
@@ -193,8 +193,8 @@ def test_pandas_roundtrip_large_string():
result = pi.from_dataframe(pandas_df)
assert result["a_large"].to_pylist() == table["a_large"].to_pylist()
- assert pa.types.is_large_string(table["a_large"].type)
- assert pa.types.is_large_string(result["a_large"].type)
+ assert pa.types.is_large_string(table.column("a_large").type)
+ assert pa.types.is_large_string(result.column("a_large").type)
table_protocol = table.__dataframe__()
result_protocol = result.__dataframe__()
@@ -231,12 +231,12 @@ def test_pandas_roundtrip_string_with_missing():
result = pi.from_dataframe(pandas_df)
assert result["a"].to_pylist() == table["a"].to_pylist()
- assert pa.types.is_string(table["a"].type)
- assert pa.types.is_large_string(result["a"].type)
+ assert pa.types.is_string(table.column("a").type)
+ assert pa.types.is_large_string(result.column("a").type)
assert result["a_large"].to_pylist() == table["a_large"].to_pylist()
- assert pa.types.is_large_string(table["a_large"].type)
- assert pa.types.is_large_string(result["a_large"].type)
+ assert pa.types.is_large_string(table.column("a_large").type)
+ assert pa.types.is_large_string(result.column("a_large").type)
else:
# older versions of pandas do not have bitmask support
# https://github.com/pandas-dev/pandas/issues/49888
@@ -261,12 +261,16 @@ def test_pandas_roundtrip_categorical():
result = pi.from_dataframe(pandas_df)
assert result["weekday"].to_pylist() == table["weekday"].to_pylist()
- assert pa.types.is_dictionary(table["weekday"].type)
- assert pa.types.is_dictionary(result["weekday"].type)
- assert pa.types.is_string(table["weekday"].chunk(0).dictionary.type)
- assert pa.types.is_large_string(result["weekday"].chunk(0).dictionary.type)
- assert pa.types.is_int32(table["weekday"].chunk(0).indices.type)
- assert pa.types.is_int8(result["weekday"].chunk(0).indices.type)
+ assert pa.types.is_dictionary(table.column("weekday").type)
+ assert pa.types.is_dictionary(result.column("weekday").type)
+ table_chunk_0 = table.column("weekday").chunk(0)
+ result_chunk_0 = result.column("weekday").chunk(0)
+ assert isinstance(table_chunk_0, pa.DictionaryArray)
+ assert isinstance(result_chunk_0, pa.DictionaryArray)
+ assert pa.types.is_string(table_chunk_0.dictionary.type)
+ assert pa.types.is_large_string(result_chunk_0.dictionary.type)
+ assert pa.types.is_int32(table_chunk_0.indices.type)
+ assert pa.types.is_int8(result_chunk_0.indices.type)
table_protocol = table.__dataframe__()
result_protocol = result.__dataframe__()
@@ -289,6 +293,7 @@ def test_pandas_roundtrip_categorical():
assert desc_cat_table["is_ordered"] == desc_cat_result["is_ordered"]
assert desc_cat_table["is_dictionary"] == desc_cat_result["is_dictionary"]
+ assert desc_cat_result["categories"] is not None
assert isinstance(desc_cat_result["categories"]._col, pa.Array)
@@ -450,6 +455,7 @@ def test_pyarrow_roundtrip_categorical(offset, length):
assert desc_cat_table["is_ordered"] == desc_cat_result["is_ordered"]
assert desc_cat_table["is_dictionary"] == desc_cat_result["is_dictionary"]
+ assert desc_cat_result["categories"] is not None
assert isinstance(desc_cat_result["categories"]._col, pa.Array)
@@ -464,8 +470,8 @@ def test_pyarrow_roundtrip_large_string():
col = result.__dataframe__().get_column(0)
assert col.size() == 3*1024**2
- assert pa.types.is_large_string(table[0].type)
- assert pa.types.is_large_string(result[0].type)
+ assert pa.types.is_large_string(table.column(0).type)
+ assert pa.types.is_large_string(result.column(0).type)
assert table.equals(result)
diff --git a/python/pyarrow/tests/interchange/test_interchange_spec.py b/python/pyarrow/tests/interchange/test_interchange_spec.py
index cea694d1c1e..3208b56c42d 100644
--- a/python/pyarrow/tests/interchange/test_interchange_spec.py
+++ b/python/pyarrow/tests/interchange/test_interchange_spec.py
@@ -23,7 +23,7 @@
try:
import numpy as np
except ImportError:
- np = None
+ pass
import pyarrow as pa
import pyarrow.tests.strategies as past
diff --git a/python/pyarrow/tests/parquet/common.py b/python/pyarrow/tests/parquet/common.py
index 5390a24b90d..3cbf5801dfc 100644
--- a/python/pyarrow/tests/parquet/common.py
+++ b/python/pyarrow/tests/parquet/common.py
@@ -16,11 +16,12 @@
# under the License.
import io
+from typing import cast
try:
import numpy as np
except ImportError:
- np = None
+ pass
import pyarrow as pa
from pyarrow.tests import util
@@ -137,7 +138,7 @@ def make_sample_file(table_or_df):
else:
a_table = pa.Table.from_pandas(table_or_df)
- buf = io.BytesIO()
+ buf = io.BytesIO() # type: ignore[attr-defined]
_write_table(a_table, buf, compression='SNAPPY', version='2.6')
buf.seek(0)
@@ -161,12 +162,9 @@ def alltypes_sample(size=10000, seed=0, categorical=False):
'float32': np.arange(size, dtype=np.float32),
'float64': np.arange(size, dtype=np.float64),
'bool': np.random.randn(size) > 0,
- 'datetime_ms': np.arange("2016-01-01T00:00:00.001", size,
- dtype='datetime64[ms]'),
- 'datetime_us': np.arange("2016-01-01T00:00:00.000001", size,
- dtype='datetime64[us]'),
- 'datetime_ns': np.arange("2016-01-01T00:00:00.000000001", size,
- dtype='datetime64[ns]'),
+ 'datetime_ms': pd.date_range("2016-01-01T00:00:00.001", periods=size, freq='ms').values,
+ 'datetime_us': pd.date_range("2016-01-01T00:00:00.000001", periods=size, freq='us').values,
+ 'datetime_ns': pd.date_range("2016-01-01T00:00:00.000000001", periods=size, freq='ns').values,
'timedelta': np.arange(0, size, dtype="timedelta64[s]"),
'str': pd.Series([str(x) for x in range(size)]),
'empty_str': [''] * size,
@@ -175,5 +173,6 @@ def alltypes_sample(size=10000, seed=0, categorical=False):
'null_list': [None] * 2 + [[None] * (x % 4) for x in range(size - 2)],
}
if categorical:
- arrays['str_category'] = arrays['str'].astype('category')
+ import pandas as pd
+ arrays['str_category'] = cast(pd.Series, arrays['str']).astype('category')
return pd.DataFrame(arrays)
diff --git a/python/pyarrow/tests/parquet/encryption.py b/python/pyarrow/tests/parquet/encryption.py
index efaee1d08a9..7a6ef3de7bc 100644
--- a/python/pyarrow/tests/parquet/encryption.py
+++ b/python/pyarrow/tests/parquet/encryption.py
@@ -30,7 +30,7 @@ def __init__(self, config):
pe.KmsClient.__init__(self)
self.master_keys_map = config.custom_kms_conf
- def wrap_key(self, key_bytes, master_key_identifier):
+ def wrap_key(self, key_bytes, master_key_identifier): # type: ignore[override]
"""Not a secure cipher - the wrapped key
is just the master key concatenated with key bytes"""
master_key_bytes = self.master_keys_map[master_key_identifier].encode(
@@ -39,7 +39,7 @@ def wrap_key(self, key_bytes, master_key_identifier):
result = base64.b64encode(wrapped_key)
return result
- def unwrap_key(self, wrapped_key, master_key_identifier):
+ def unwrap_key(self, wrapped_key, master_key_identifier): # type: ignore[override]
"""Not a secure cipher - just extract the key from
the wrapped key"""
if master_key_identifier not in self.master_keys_map:
diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py
index 94868741f39..4c0e6d1429b 100644
--- a/python/pyarrow/tests/parquet/test_basic.py
+++ b/python/pyarrow/tests/parquet/test_basic.py
@@ -35,7 +35,7 @@
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import _read_table, _write_table
except ImportError:
- pq = None
+ pass
try:
@@ -45,12 +45,12 @@
from pyarrow.tests.pandas_examples import dataframe_with_lists
from pyarrow.tests.parquet.common import alltypes_sample
except ImportError:
- pd = tm = None
+ pass
try:
import numpy as np
except ImportError:
- np = None
+ pass
# Marks all of the tests in this module
# Ignore these with pytest ... -m 'not parquet'
@@ -162,10 +162,10 @@ def test_invalid_source():
# Test that we provide an helpful error message pointing out
# that None wasn't expected when trying to open a Parquet None file.
with pytest.raises(TypeError, match="None"):
- pq.read_table(None)
+ pq.read_table(None) # type: ignore[arg-type]
with pytest.raises(TypeError, match="None"):
- pq.ParquetFile(None)
+ pq.ParquetFile(None) # type: ignore[arg-type]
def test_read_table_without_dataset(tempdir):
@@ -747,7 +747,7 @@ def test_fastparquet_cross_compatibility(tempdir):
# Arrow -> fastparquet
file_arrow = str(tempdir / "cross_compat_arrow.parquet")
- pq.write_table(table, file_arrow, compression=None)
+ pq.write_table(table, file_arrow, compression=None) # type: ignore[arg-type]
fp_file = fp.ParquetFile(file_arrow)
df_fp = fp_file.to_pandas()
@@ -788,7 +788,7 @@ def test_buffer_contents(
for col in table.columns:
[chunk] = col.chunks
buf = chunk.buffers()[1]
- assert buf.to_pybytes() == buf.size * b"\0"
+ assert buf.to_pybytes() == buf.size * b"\0" # type: ignore[union-attr]
def test_parquet_compression_roundtrip(tempdir):
@@ -798,7 +798,7 @@ def test_parquet_compression_roundtrip(tempdir):
# the stream due to auto-detecting the extension in the filename
table = pa.table([pa.array(range(4))], names=["ints"])
path = tempdir / "arrow-10480.pyarrow.gz"
- pq.write_table(table, path, compression="GZIP")
+ pq.write_table(table, path, compression="GZIP") # type: ignore[arg-type]
result = pq.read_table(path)
assert result.equals(table)
@@ -823,7 +823,7 @@ def test_empty_row_groups(tempdir):
def test_reads_over_batch(tempdir):
data = [None] * (1 << 20)
- data.append([1])
+ data.append([1]) # type: ignore[reportArgumentType]
# Large list with mostly nones and one final
# value. This should force batched reads when
# reading back.
diff --git a/python/pyarrow/tests/parquet/test_compliant_nested_type.py b/python/pyarrow/tests/parquet/test_compliant_nested_type.py
index 2345855a332..af418812be8 100644
--- a/python/pyarrow/tests/parquet/test_compliant_nested_type.py
+++ b/python/pyarrow/tests/parquet/test_compliant_nested_type.py
@@ -24,15 +24,14 @@
from pyarrow.tests.parquet.common import (_read_table,
_check_roundtrip)
except ImportError:
- pq = None
+ pass
try:
import pandas as pd
- import pandas.testing as tm
from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe
except ImportError:
- pd = tm = None
+ pass
# Marks all of the tests in this module
diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py
index c546bc1532a..bd48ffe7155 100644
--- a/python/pyarrow/tests/parquet/test_data_types.py
+++ b/python/pyarrow/tests/parquet/test_data_types.py
@@ -22,7 +22,7 @@
try:
import numpy as np
except ImportError:
- np = None
+ pass
import pytest
import pyarrow as pa
@@ -33,7 +33,7 @@
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import _read_table, _write_table
except ImportError:
- pq = None
+ pass
try:
@@ -44,7 +44,7 @@
dataframe_with_lists)
from pyarrow.tests.parquet.common import alltypes_sample
except ImportError:
- pd = tm = None
+ pass
# Marks all of the tests in this module
@@ -142,7 +142,7 @@ def test_direct_read_dictionary():
read_dictionary=['f0'])
# Compute dictionary-encoded subfield
- expected = pa.table([table[0].dictionary_encode()], names=['f0'])
+ expected = pa.table([table.column(0).dictionary_encode()], names=['f0'])
assert result.equals(expected)
@@ -174,7 +174,7 @@ def test_direct_read_dictionary_subfield():
expected = pa.table([expected_arr], names=['f0'])
assert result.equals(expected)
- assert result[0].num_chunks == 1
+ assert result.column(0).num_chunks == 1
@pytest.mark.numpy
@@ -260,8 +260,8 @@ def test_single_pylist_column_roundtrip(tempdir, dtype,):
_write_table(table, filename)
table_read = _read_table(filename)
for i in range(table.num_columns):
- col_written = table[i]
- col_read = table_read[i]
+ col_written = table.column(i)
+ col_read = table_read.column(i)
assert table.field(i).name == table_read.field(i).name
assert col_read.num_chunks == 1
data_written = col_written.chunk(0)
diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py
index d3e9cda7301..14253ca7d6b 100644
--- a/python/pyarrow/tests/parquet/test_dataset.py
+++ b/python/pyarrow/tests/parquet/test_dataset.py
@@ -20,35 +20,41 @@
import os
import pathlib
import sys
+from typing import TYPE_CHECKING
try:
import numpy as np
except ImportError:
- np = None
+ pass
import pytest
import unittest.mock as mock
import pyarrow as pa
import pyarrow.compute as pc
-from pyarrow.fs import (FileSelector, FileSystem, LocalFileSystem,
+from pyarrow.fs import (FileSelector, FileSystem, LocalFileSystem, FileInfo, FileType,
PyFileSystem, SubTreeFileSystem, FSSpecHandler)
from pyarrow.tests import util
from pyarrow.util import guid
-try:
+if TYPE_CHECKING:
+ import pandas as pd
+ import pandas.testing as tm
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import (
_read_table, _test_dataframe, _test_table, _write_table)
-except ImportError:
- pq = None
+else:
+ try:
+ import pyarrow.parquet as pq
+ from pyarrow.tests.parquet.common import (
+ _read_table, _test_dataframe, _test_table, _write_table)
+ except ImportError:
+ pass
-
-try:
- import pandas as pd
- import pandas.testing as tm
-
-except ImportError:
- pd = tm = None
+ try:
+ import pandas as pd
+ import pandas.testing as tm
+ except ImportError:
+ pass
# Marks all of the tests in this module
@@ -70,8 +76,8 @@ def test_filesystem_uri(tempdir):
assert result.equals(table)
# filesystem URI
- result = pq.read_table(
- "data_dir/data.parquet", filesystem=util._filesystem_uri(tempdir))
+ result = pq.read_table("data_dir/data.parquet",
+ filesystem=util._filesystem_uri(tempdir))
assert result.equals(table)
@@ -553,7 +559,7 @@ def _generate_partition_directories(fs, base_dir, partition_spec, df):
# ['bar', ['a', 'b', 'c']]
# part_table : a pyarrow.Table to write to each partition
if not isinstance(fs, FileSystem):
- fs = PyFileSystem(FSSpecHandler(fs))
+ fs = PyFileSystem(FSSpecHandler(fs)) # type: ignore[abstract]
DEPTH = len(partition_spec)
@@ -572,15 +578,15 @@ def _visit_level(base_dir, level, part_keys):
if level == DEPTH - 1:
# Generate example data
- from pyarrow.fs import FileType
-
file_path = pathsep.join([level_dir, guid()])
filtered_df = _filter_partition(df, this_part_keys)
part_table = pa.Table.from_pandas(filtered_df)
with fs.open_output_stream(file_path) as f:
_write_table(part_table, f)
- assert fs.get_file_info(file_path).type != FileType.NotFound
- assert fs.get_file_info(file_path).type == FileType.File
+ file_info = fs.get_file_info(file_path)
+ assert isinstance(file_info, FileInfo)
+ assert file_info.type != FileType.NotFound
+ assert file_info.type == FileType.File
file_success = pathsep.join([level_dir, '_SUCCESS'])
with fs.open_output_stream(file_success) as f:
@@ -717,8 +723,8 @@ def test_dataset_read_pandas(tempdir):
paths = []
for i in range(nfiles):
df = _test_dataframe(size, seed=i)
- df.index = np.arange(i * size, (i + 1) * size)
- df.index.name = 'index'
+ df.index = np.arange(i * size, (i + 1) * size) # type: ignore[assignment]
+ df.index.name = 'index' # type: ignore[attr-defined]
path = dirpath / f'{i}.parquet'
@@ -931,8 +937,7 @@ def _test_write_to_dataset_with_partitions(base_path,
'group2': list('eefeffgeee'),
'num': list(range(10)),
'nan': [np.nan] * 10,
- 'date': np.arange('2017-01-01', '2017-01-11', dtype='datetime64[D]').astype(
- 'datetime64[ns]')
+ 'date': pd.date_range('2017-01-01', periods=10, freq='D').values.astype('datetime64[ns]')
})
cols = output_df.columns.tolist()
partition_by = ['group1', 'group2']
@@ -965,7 +970,7 @@ def _test_write_to_dataset_with_partitions(base_path,
input_df_cols = input_df.columns.tolist()
assert partition_by == input_df_cols[-1 * len(partition_by):]
- input_df = input_df[cols]
+ input_df = input_df.loc[:, cols]
# Partitioned columns become 'categorical' dtypes
for col in partition_by:
output_df[col] = output_df[col].astype('category')
@@ -974,6 +979,7 @@ def _test_write_to_dataset_with_partitions(base_path,
expected_date_type = schema.field('date').type.to_pandas_dtype()
output_df["date"] = output_df["date"].astype(expected_date_type)
+ assert isinstance(input_df, pd.DataFrame)
tm.assert_frame_equal(output_df, input_df)
@@ -988,8 +994,7 @@ def _test_write_to_dataset_no_partitions(base_path,
'group1': list('aaabbbbccc'),
'group2': list('eefeffgeee'),
'num': list(range(10)),
- 'date': np.arange('2017-01-01', '2017-01-11', dtype='datetime64[D]').astype(
- 'datetime64[ns]')
+ 'date': pd.date_range('2017-01-01', periods=10, freq='D').values.astype('datetime64[ns]')
})
cols = output_df.columns.tolist()
output_table = pa.Table.from_pandas(output_df)
@@ -997,7 +1002,7 @@ def _test_write_to_dataset_no_partitions(base_path,
if filesystem is None:
filesystem = LocalFileSystem()
elif not isinstance(filesystem, FileSystem):
- filesystem = PyFileSystem(FSSpecHandler(filesystem))
+ filesystem = PyFileSystem(FSSpecHandler(filesystem)) # type: ignore[abstract]
# Without partitions, append files to root_path
n = 5
@@ -1009,8 +1014,10 @@ def _test_write_to_dataset_no_partitions(base_path,
recursive=True)
infos = filesystem.get_file_info(selector)
- output_files = [info for info in infos if info.path.endswith(".parquet")]
- assert len(output_files) == n
+    assert isinstance(infos, list)
+    assert all(isinstance(info, FileInfo) for info in infos)
+    output_files = [info for info in infos if info.path.endswith(".parquet")]
+    assert len(output_files) == n
# Deduplicated incoming DataFrame should match
# original outgoing Dataframe
@@ -1020,6 +1027,7 @@ def _test_write_to_dataset_no_partitions(base_path,
input_df = input_table.to_pandas()
input_df = input_df.drop_duplicates()
input_df = input_df[cols]
+ assert isinstance(input_df, pd.DataFrame)
tm.assert_frame_equal(output_df, input_df)
@@ -1168,11 +1176,11 @@ def test_dataset_read_dictionary(tempdir):
path, read_dictionary=['f0']).read()
# The order of the chunks is non-deterministic
- ex_chunks = [t1[0].chunk(0).dictionary_encode(),
- t2[0].chunk(0).dictionary_encode()]
+ ex_chunks = [t1.column(0).chunk(0).dictionary_encode(),
+ t2.column(0).chunk(0).dictionary_encode()]
- assert result[0].num_chunks == 2
- c0, c1 = result[0].chunk(0), result[0].chunk(1)
+ assert result.column(0).num_chunks == 2
+ c0, c1 = result.column(0).chunk(0), result.column(0).chunk(1)
if c0.equals(ex_chunks[0]):
assert c1.equals(ex_chunks[1])
else:
diff --git a/python/pyarrow/tests/parquet/test_datetime.py b/python/pyarrow/tests/parquet/test_datetime.py
index b89fd97cb91..a7652a01e64 100644
--- a/python/pyarrow/tests/parquet/test_datetime.py
+++ b/python/pyarrow/tests/parquet/test_datetime.py
@@ -22,7 +22,7 @@
try:
import numpy as np
except ImportError:
- np = None
+ pass
import pytest
import pyarrow as pa
@@ -32,7 +32,7 @@
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import _read_table, _write_table
except ImportError:
- pq = None
+ pass
try:
@@ -41,7 +41,7 @@
from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe
except ImportError:
- pd = tm = None
+ pass
# Marks all of the tests in this module
@@ -56,7 +56,7 @@ def test_pandas_parquet_datetime_tz():
# coerce to [ns] due to lack of non-[ns] support.
s = pd.Series([datetime.datetime(2017, 9, 6)], dtype='datetime64[us]')
s = s.dt.tz_localize('utc')
- s.index = s
+ s.index = s # type: ignore[assignment]
# Both a column and an index to hit both use cases
df = pd.DataFrame({'tz_aware': s,
@@ -287,7 +287,8 @@ def test_coerce_int96_timestamp_unit(unit):
# For either Parquet version, coercing to nanoseconds is allowed
# if Int96 storage is used
- expected = pa.Table.from_arrays([arrays.get(unit)]*4, names)
+ array_for_unit = arrays.get(unit, a_ns)
+ expected = pa.Table.from_arrays([array_for_unit] * 4, names)
read_table_kwargs = {"coerce_int96_timestamp_unit": unit}
_check_roundtrip(table, expected,
read_table_kwargs=read_table_kwargs,
@@ -323,6 +324,7 @@ def get_table(pq_reader_method, filename, **kwargs):
# with the default resolution of ns, we get wrong values for INT96
# that are out of bounds for nanosecond range
tab_error = get_table(pq_reader_method, filename)
+ assert tab_error is not None
with warnings.catch_warnings():
warnings.filterwarnings("ignore",
"Discarding nonzero nanoseconds in conversion",
@@ -333,6 +335,7 @@ def get_table(pq_reader_method, filename, **kwargs):
tab_correct = get_table(
pq_reader_method, filename, coerce_int96_timestamp_unit="s"
)
+ assert tab_correct is not None
df_correct = tab_correct.to_pandas(timestamp_as_object=True)
df["a"] = df["a"].astype(object)
tm.assert_frame_equal(df, df_correct)
diff --git a/python/pyarrow/tests/parquet/test_encryption.py b/python/pyarrow/tests/parquet/test_encryption.py
index 4e2fb069bd0..82b934edf77 100644
--- a/python/pyarrow/tests/parquet/test_encryption.py
+++ b/python/pyarrow/tests/parquet/test_encryption.py
@@ -21,8 +21,7 @@
import pyarrow.parquet as pq
import pyarrow.parquet.encryption as pe
except ImportError:
- pq = None
- pe = None
+ pass
else:
from pyarrow.tests.parquet.encryption import (InMemoryKmsClient,
MockVersioningKmsClient,
@@ -131,7 +130,7 @@ def test_encrypted_parquet_write_read(tempdir, data_table):
encryption_algorithm="AES_GCM_V1",
cache_lifetime=timedelta(minutes=5.0),
data_key_length_bits=256)
- assert encryption_config.uniform_encryption is False
+ assert encryption_config.uniform_encryption is False # type: ignore[attr-defined]
kms_connection_config, crypto_factory = write_encrypted_file(
path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY,
@@ -154,11 +153,11 @@ def test_uniform_encrypted_parquet_write_read(tempdir, data_table):
# Encrypt the footer and all columns with the footer key,
encryption_config = pe.EncryptionConfiguration(
footer_key=FOOTER_KEY_NAME,
- uniform_encryption=True,
+ uniform_encryption=True, # type: ignore[call-arg]
encryption_algorithm="AES_GCM_V1",
cache_lifetime=timedelta(minutes=5.0),
data_key_length_bits=256)
- assert encryption_config.uniform_encryption is True
+ assert encryption_config.uniform_encryption is True # type: ignore[attr-defined]
kms_connection_config, crypto_factory = write_encrypted_file(
path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, b"",
@@ -303,7 +302,7 @@ def test_encrypted_parquet_write_col_key_and_uniform_encryption(tempdir, data_ta
column_keys={
COL_KEY_NAME: ["a", "b"],
},
- uniform_encryption=True)
+ uniform_encryption=True) # type: ignore[call-arg]
with pytest.raises(OSError,
match=r"Cannot set both column_keys and uniform_encryption"):
@@ -415,7 +414,7 @@ def unwrap_key(self, wrapped_key, master_key_identifier):
def kms_factory(kms_connection_configuration):
return WrongTypeKmsClient(kms_connection_configuration)
- crypto_factory = pe.CryptoFactory(kms_factory)
+ crypto_factory = pe.CryptoFactory(kms_factory) # type: ignore[arg-type]
with pytest.raises(TypeError):
# Write with encryption properties
write_encrypted_parquet(path, data_table, encryption_config,
@@ -554,7 +553,7 @@ def test_encrypted_parquet_write_read_external(tempdir, data_table,
result_table = read_encrypted_parquet(
path, decryption_config, kms_connection_config, crypto_factory,
internal_key_material=False)
- store = pa._parquet_encryption.FileSystemKeyMaterialStore.for_file(path)
+ store = pe.FileSystemKeyMaterialStore.for_file(path)
assert len(key_ids := store.get_key_id_set()) == (
len(external_encryption_config.column_keys[COL_KEY_NAME]) + 1)
diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py
index 148bfebaa67..646873b3d4f 100644
--- a/python/pyarrow/tests/parquet/test_metadata.py
+++ b/python/pyarrow/tests/parquet/test_metadata.py
@@ -19,11 +19,7 @@
import decimal
from collections import OrderedDict
import io
-
-try:
- import numpy as np
-except ImportError:
- np = None
+from typing import TYPE_CHECKING
import pytest
import pyarrow as pa
@@ -31,20 +27,25 @@
from pyarrow.fs import LocalFileSystem
from pyarrow.tests import util
-try:
- import pyarrow.parquet as pq
- from pyarrow.tests.parquet.common import _write_table
-except ImportError:
- pq = None
-
-
-try:
+if TYPE_CHECKING:
+ import numpy as np
import pandas as pd
- import pandas.testing as tm
-
- from pyarrow.tests.parquet.common import alltypes_sample
-except ImportError:
- pd = tm = None
+ import pyarrow.parquet as pq
+ from pyarrow.tests.parquet.common import alltypes_sample, _write_table
+else:
+ try:
+ import pyarrow.parquet as pq
+ from pyarrow.tests.parquet.common import _write_table, alltypes_sample
+ except ImportError:
+ pass
+ try:
+ import pandas as pd
+ except ImportError:
+ pass
+ try:
+ import numpy as np
+ except ImportError:
+ pass
# Marks all of the tests in this module
@@ -56,7 +57,7 @@
def test_parquet_metadata_api():
df = alltypes_sample(size=10000)
df = df.reindex(columns=sorted(df.columns))
- df.index = np.random.randint(0, 1000000, size=len(df))
+ df.index = np.random.randint(0, 1000000, size=len(df)) # type: ignore[assignment]
fileh = make_sample_file(df)
ncols = len(df.columns)
@@ -80,15 +81,15 @@ def test_parquet_metadata_api():
col = schema[0]
repr(col)
- assert col.name == df.columns[0]
- assert col.max_definition_level == 1
- assert col.max_repetition_level == 0
- assert col.max_repetition_level == 0
- assert col.physical_type == 'BOOLEAN'
- assert col.converted_type == 'NONE'
+ assert col.name == df.columns[0] # type: ignore[attr-defined]
+ assert col.max_definition_level == 1 # type: ignore[attr-defined]
+ assert col.max_repetition_level == 0 # type: ignore[attr-defined]
+ assert col.max_repetition_level == 0 # type: ignore[attr-defined]
+ assert col.physical_type == 'BOOLEAN' # type: ignore[attr-defined]
+ assert col.converted_type == 'NONE' # type: ignore[attr-defined]
col_float16 = schema[5]
- assert col_float16.logical_type.type == 'FLOAT16'
+ assert col_float16.logical_type.type == 'FLOAT16' # type: ignore[attr-defined]
with pytest.raises(IndexError):
schema[ncols + 1] # +1 for index
@@ -210,15 +211,16 @@ def test_parquet_column_statistics_api(data, type, physical_type, min_value,
col_meta = rg_meta.column(0)
stat = col_meta.statistics
- assert stat.has_min_max
- assert _close(type, stat.min, min_value)
- assert _close(type, stat.max, max_value)
- assert stat.null_count == null_count
- assert stat.num_values == num_values
+ assert stat is not None
+ assert stat.has_min_max # type: ignore[attr-defined]
+ assert _close(type, stat.min, min_value) # type: ignore[attr-defined]
+ assert _close(type, stat.max, max_value) # type: ignore[attr-defined]
+ assert stat.null_count == null_count # type: ignore[attr-defined]
+ assert stat.num_values == num_values # type: ignore[attr-defined]
# TODO(kszucs) until parquet-cpp API doesn't expose HasDistinctCount
# method, missing distinct_count is represented as zero instead of None
- assert stat.distinct_count == distinct_count
- assert stat.physical_type == physical_type
+ assert stat.distinct_count == distinct_count # type: ignore[attr-defined]
+ assert stat.physical_type == physical_type # type: ignore[attr-defined]
def _close(type, left, right):
@@ -236,8 +238,10 @@ def test_parquet_raise_on_unset_statistics():
df = pd.DataFrame({"t": pd.Series([pd.NaT], dtype="datetime64[ns]")})
meta = make_sample_file(pa.Table.from_pandas(df)).metadata
- assert not meta.row_group(0).column(0).statistics.has_min_max
- assert meta.row_group(0).column(0).statistics.max is None
+ stat = meta.row_group(0).column(0).statistics
+ assert stat is not None
+ assert not stat.has_min_max
+ assert stat.max is None
def test_statistics_convert_logical_types(tempdir):
@@ -271,8 +275,9 @@ def test_statistics_convert_logical_types(tempdir):
pq.write_table(t, path, version='2.6')
pf = pq.ParquetFile(path)
stats = pf.metadata.row_group(0).column(0).statistics
- assert stats.min == min_val
- assert stats.max == max_val
+ assert stats is not None
+ assert stats.min == min_val # type: ignore[attr-defined]
+ assert stats.max == max_val # type: ignore[attr-defined]
def test_parquet_write_disable_statistics(tempdir):
@@ -429,29 +434,36 @@ def test_field_id_metadata():
pf = pq.ParquetFile(pa.BufferReader(contents))
schema = pf.schema_arrow
- assert schema[0].metadata[field_id] == b'1'
- assert schema[0].metadata[b'other'] == b'abc'
+ assert schema[0].metadata is not None
+ assert schema[0].metadata[field_id] == b'1' # type: ignore[index]
+ assert schema[0].metadata[b'other'] == b'abc' # type: ignore[index]
list_field = schema[1]
- assert list_field.metadata[field_id] == b'11'
+ assert list_field.metadata is not None
+ assert list_field.metadata[field_id] == b'11' # type: ignore[index]
list_item_field = list_field.type.value_field
- assert list_item_field.metadata[field_id] == b'10'
+ assert list_item_field.metadata is not None
+ assert list_item_field.metadata[field_id] == b'10' # type: ignore[index]
struct_field = schema[2]
- assert struct_field.metadata[field_id] == b'102'
+ assert struct_field.metadata is not None
+ assert struct_field.metadata[field_id] == b'102' # type: ignore[index]
struct_middle_field = struct_field.type[0]
- assert struct_middle_field.metadata[field_id] == b'101'
+ assert struct_middle_field.metadata is not None
+ assert struct_middle_field.metadata[field_id] == b'101' # type: ignore[index]
struct_inner_field = struct_middle_field.type[0]
- assert struct_inner_field.metadata[field_id] == b'100'
+ assert struct_inner_field.metadata is not None
+ assert struct_inner_field.metadata[field_id] == b'100' # type: ignore[index]
assert schema[3].metadata is None
# Invalid input is passed through (ok) but does not
# have field_id in parquet (not tested)
- assert schema[4].metadata[field_id] == b'xyz'
- assert schema[5].metadata[field_id] == b'-1000'
+ assert schema[4].metadata is not None
+ assert schema[4].metadata[field_id] == b'xyz' # type: ignore[index]
+ assert schema[5].metadata[field_id] == b'-1000' # type: ignore[index]
def test_parquet_file_page_index():
@@ -495,13 +507,14 @@ def test_multi_dataset_metadata(tempdir):
_meta.append_row_groups(meta[0])
# Write merged metadata-only file
+ assert _meta is not None
with open(metapath, "wb") as f:
- _meta.write_metadata_file(f)
+ _meta.write_metadata_file(f) # type: ignore[union-attr]
# Read back the metadata
meta = pq.read_metadata(metapath)
md = meta.to_dict()
- _md = _meta.to_dict()
+ _md = _meta.to_dict() # type: ignore[union-attr]
for key in _md:
if key != 'serialized_size':
assert _md[key] == md[key]
@@ -695,13 +708,14 @@ def test_metadata_schema_filesystem(tempdir):
assert pq.read_metadata(
file_path, filesystem=LocalFileSystem()).equals(metadata)
assert pq.read_metadata(
- fname, filesystem=f'file:///{tempdir}').equals(metadata)
+ fname, filesystem=f'file:///{tempdir}').equals(metadata) # type: ignore[arg-type]
assert pq.read_schema(file_uri).equals(schema)
assert pq.read_schema(
file_path, filesystem=LocalFileSystem()).equals(schema)
assert pq.read_schema(
- fname, filesystem=f'file:///{tempdir}').equals(schema)
+ fname, filesystem=f'file:///{tempdir}').equals(schema) # type: ignore[arg-type]
with util.change_cwd(tempdir):
# Pass `filesystem` arg
@@ -721,7 +735,7 @@ def test_metadata_equals():
original_metadata = pq.read_metadata(pa.BufferReader(buf))
match = "Argument 'other' has incorrect type"
with pytest.raises(TypeError, match=match):
- original_metadata.equals(None)
+ original_metadata.equals(None) # type: ignore[arg-type]
@pytest.mark.parametrize("t1,t2,expected_error", (
@@ -810,7 +824,7 @@ def msg(c):
pq.ColumnChunkMetaData()
with pytest.raises(TypeError, match=msg("RowGroupMetaData")):
- pq.RowGroupMetaData()
+ pq.RowGroupMetaData() # type: ignore[call-arg]
with pytest.raises(TypeError, match=msg("FileMetaData")):
- pq.FileMetaData()
+ pq.FileMetaData() # type: ignore[call-arg]
diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py
index 53864ff15ea..91ae2385734 100644
--- a/python/pyarrow/tests/parquet/test_pandas.py
+++ b/python/pyarrow/tests/parquet/test_pandas.py
@@ -17,11 +17,12 @@
import io
import json
+from typing import TYPE_CHECKING, cast
try:
import numpy as np
except ImportError:
- np = None
+ pass
import pytest
import pyarrow as pa
@@ -29,22 +30,29 @@
from pyarrow.util import guid
from pyarrow.vendored.version import Version
-try:
- import pyarrow.parquet as pq
- from pyarrow.tests.parquet.common import (_read_table, _test_dataframe,
- _write_table)
-except ImportError:
- pq = None
-
-
-try:
+if TYPE_CHECKING:
import pandas as pd
import pandas.testing as tm
+ import pyarrow.parquet as pq
+ from pyarrow.tests.parquet.common import (
+ _read_table, _roundtrip_pandas_dataframe, _test_dataframe,
+ _write_table, alltypes_sample
+ )
+else:
+ try:
+ import pyarrow.parquet as pq
+ from pyarrow.tests.parquet.common import (
+ _read_table, _test_dataframe, _write_table, alltypes_sample,
+ _roundtrip_pandas_dataframe
+ )
- from pyarrow.tests.parquet.common import (_roundtrip_pandas_dataframe,
- alltypes_sample)
-except ImportError:
- pd = tm = None
+ except ImportError:
+ pass
+ try:
+ import pandas as pd
+ import pandas.testing as tm
+ except ImportError:
+ pass
# Marks all of the tests in this module
@@ -58,11 +66,14 @@ def test_pandas_parquet_custom_metadata(tempdir):
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df)
+ assert arrow_table.schema.metadata is not None
assert b'pandas' in arrow_table.schema.metadata
_write_table(arrow_table, filename)
- metadata = pq.read_metadata(filename).metadata
+ file_metadata = pq.read_metadata(filename)
+ metadata = file_metadata.metadata
+ assert metadata is not None
assert b'pandas' in metadata
js = json.loads(metadata[b'pandas'].decode('utf8'))
@@ -117,10 +128,13 @@ def test_attributes_metadata_persistence(tempdir):
}
table = pa.Table.from_pandas(df)
+ assert table.schema.metadata is not None
assert b'attributes' in table.schema.metadata[b'pandas']
_write_table(table, filename)
- metadata = pq.read_metadata(filename).metadata
+ file_metadata = pq.read_metadata(filename)
+ metadata = file_metadata.metadata
+ assert metadata is not None
js = json.loads(metadata[b'pandas'].decode('utf8'))
assert 'attributes' in js
assert js['attributes'] == df.attrs
@@ -297,8 +311,8 @@ def test_pandas_parquet_configuration_options(tempdir):
@pytest.mark.pandas
def test_spark_flavor_preserves_pandas_metadata():
df = _test_dataframe(size=100)
- df.index = np.arange(0, 10 * len(df), 10)
- df.index.name = 'foo'
+ df.index = np.arange(0, 10 * len(df), 10) # type: ignore[assignment]
+ df.index.name = 'foo' # type: ignore[attr-defined]
result = _roundtrip_pandas_dataframe(df, {'flavor': 'spark'})
tm.assert_frame_equal(result, df)
@@ -450,7 +464,9 @@ def test_backwards_compatible_column_metadata_handling(datadir):
table = _read_table(
path, columns=['a'])
result = table.to_pandas()
- tm.assert_frame_equal(result, expected[['a']].reset_index(drop=True))
+ expected_df = expected[['a']].reset_index(drop=True)
+ assert isinstance(expected_df, pd.DataFrame)
+ tm.assert_frame_equal(result, expected_df)
@pytest.mark.pandas
@@ -510,7 +526,7 @@ def test_pandas_categorical_roundtrip():
codes = np.array([2, 0, 0, 2, 0, -1, 2], dtype='int32')
categories = ['foo', 'bar', 'baz']
df = pd.DataFrame({'x': pd.Categorical.from_codes(
- codes, categories=categories)})
+ codes, categories=categories)}) # type: ignore[arg-type]
buf = pa.BufferOutputStream()
pq.write_table(pa.table(df), buf)
@@ -555,15 +571,18 @@ def test_write_to_dataset_pandas_preserve_extensiondtypes(tempdir):
table, str(tempdir / "case1"), partition_cols=['part'],
)
result = pq.read_table(str(tempdir / "case1")).to_pandas()
- tm.assert_frame_equal(result[["col"]], df[["col"]])
+ tm.assert_frame_equal(
+ result[["col"]], df[["col"]])
pq.write_to_dataset(table, str(tempdir / "case2"))
result = pq.read_table(str(tempdir / "case2")).to_pandas()
- tm.assert_frame_equal(result[["col"]], df[["col"]])
+ tm.assert_frame_equal(
+ result[["col"]], df[["col"]])
pq.write_table(table, str(tempdir / "data.parquet"))
result = pq.read_table(str(tempdir / "data.parquet")).to_pandas()
- tm.assert_frame_equal(result[["col"]], df[["col"]])
+ tm.assert_frame_equal(
+ result[["col"]], df[["col"]])
@pytest.mark.pandas
diff --git a/python/pyarrow/tests/parquet/test_parquet_file.py b/python/pyarrow/tests/parquet/test_parquet_file.py
index a62b5c3298c..3c5182dc56e 100644
--- a/python/pyarrow/tests/parquet/test_parquet_file.py
+++ b/python/pyarrow/tests/parquet/test_parquet_file.py
@@ -30,15 +30,14 @@
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import _write_table
except ImportError:
- pq = None
+ pass
try:
- import pandas as pd
import pandas.testing as tm
from pyarrow.tests.parquet.common import alltypes_sample
except ImportError:
- pd = tm = None
+ pass
# Marks all of the tests in this module
@@ -172,7 +171,7 @@ def test_scan_contents():
pf = pq.ParquetFile(buf)
assert pf.scan_contents() == 10000
- assert pf.scan_contents(df.columns[:4]) == 10000
+ assert pf.scan_contents(list(df.columns[:4])) == 10000
def test_parquet_file_pass_directory_instead_of_file(tempdir):
@@ -215,7 +214,7 @@ def test_iter_batches_columns_reader(tempdir, batch_size):
chunk_size=chunk_size)
file_ = pq.ParquetFile(filename)
- for columns in [df.columns[:10], df.columns[10:]]:
+ for columns in [list(df.columns[:10]), list(df.columns[10:])]:
batches = file_.iter_batches(batch_size=batch_size, columns=columns)
batch_starts = range(0, total_size+batch_size, batch_size)
for batch, start in zip(batches, batch_starts):
@@ -263,9 +262,10 @@ def get_all_batches(f):
tm.assert_frame_equal(
batches[batch_no].to_pandas().reset_index(drop=True),
- file_.read_row_groups([i]).to_pandas().iloc[900:].reset_index(
- drop=True
- )
+ file_
+ .read_row_groups([i])
+ .to_pandas().iloc[900:]
+ .reset_index(drop=True) # type: ignore[arg-type]
)
batch_no += 1
@@ -346,6 +346,7 @@ def test_read_statistics():
buf.seek(0)
statistics = pq.ParquetFile(buf).read().columns[0].chunks[0].statistics
+ assert statistics is not None
assert statistics.is_null_count_exact is True
assert statistics.null_count == 1
assert statistics.distinct_count is None
@@ -389,7 +390,8 @@ def test_parquet_file_fsspec_support():
def test_parquet_file_fsspec_support_through_filesystem_argument():
try:
- from fsspec.implementations.memory import MemoryFileSystem
+ from fsspec.implementations.memory import ( # type: ignore[import-untyped]
+ MemoryFileSystem)
except ImportError:
pytest.skip("fsspec is not installed, skipping test")
@@ -412,7 +414,7 @@ def test_parquet_file_hugginface_support():
pytest.skip("fsspec is not installed, skipping Hugging Face test")
fake_hf_module = types.ModuleType("huggingface_hub")
- fake_hf_module.HfFileSystem = MemoryFileSystem
+ fake_hf_module.HfFileSystem = MemoryFileSystem # type: ignore[attr-defined]
with mock.patch.dict("sys.modules", {"huggingface_hub": fake_hf_module}):
uri = "hf://datasets/apache/arrow/test.parquet"
table = pa.table({"a": range(10)})
@@ -424,7 +426,7 @@ def test_parquet_file_hugginface_support():
def test_fsspec_uri_raises_if_fsspec_is_not_available():
# sadly cannot patch sys.modules because cython will still be able to import fsspec
try:
- import fsspec # noqa: F401
+ import fsspec # type: ignore[import-untyped] # noqa: F401
except ImportError:
pass
else:
diff --git a/python/pyarrow/tests/parquet/test_parquet_writer.py b/python/pyarrow/tests/parquet/test_parquet_writer.py
index a49441f09f4..87787a0f3f0 100644
--- a/python/pyarrow/tests/parquet/test_parquet_writer.py
+++ b/python/pyarrow/tests/parquet/test_parquet_writer.py
@@ -23,9 +23,10 @@
try:
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import (_read_table, _test_dataframe,
+ # type: ignore[attr-defined]
_test_table, _range_integers)
except ImportError:
- pq = None
+ pass
try:
@@ -33,7 +34,7 @@
import pandas.testing as tm
except ImportError:
- pd = tm = None
+ pass
# Marks all of the tests in this module
@@ -94,10 +95,10 @@ def test_parquet_invalid_writer(tempdir):
# avoid segfaults with invalid construction
with pytest.raises(TypeError):
some_schema = pa.schema([pa.field("x", pa.int32())])
- pq.ParquetWriter(None, some_schema)
+ pq.ParquetWriter(None, some_schema) # type: ignore[arg-type]
with pytest.raises(TypeError):
- pq.ParquetWriter(tempdir / "some_path", None)
+ pq.ParquetWriter(tempdir / "some_path", None) # type: ignore[arg-type]
@pytest.mark.pandas
@@ -335,6 +336,7 @@ def test_parquet_writer_store_schema(tempdir):
writer.write_table(table)
meta = pq.read_metadata(path1)
+ assert meta.metadata is not None
assert b'ARROW:schema' in meta.metadata
assert meta.metadata[b'ARROW:schema']
@@ -357,6 +359,7 @@ def test_parquet_writer_append_key_value_metadata(tempdir):
writer.add_key_value_metadata({'key2': '2', 'key3': '3'})
reader = pq.ParquetFile(path)
metadata = reader.metadata.metadata
+ assert metadata is not None
assert metadata[b'key1'] == b'1'
assert metadata[b'key2'] == b'2'
assert metadata[b'key3'] == b'3'
diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py
index 8319c9ce3e4..434d5efc7d4 100644
--- a/python/pyarrow/tests/strategies.py
+++ b/python/pyarrow/tests/strategies.py
@@ -17,31 +17,32 @@
import datetime
import sys
+from typing import Any
-import pytest
-import hypothesis as h
-import hypothesis.strategies as st
+import pytest # type: ignore[import-not-found]
+import hypothesis as h # type: ignore[import-not-found]
+import hypothesis.strategies as st # type: ignore[import-not-found]
try:
- import hypothesis.extra.numpy as npst
+ import hypothesis.extra.numpy as npst # type: ignore[import-not-found]
except ImportError:
- npst = None
+ npst = None # type: ignore[assignment]
try:
- import hypothesis.extra.pytz as tzst
+ import hypothesis.extra.pytz as tzst # type: ignore[import-not-found]
except ImportError:
- tzst = None
+ tzst = None # type: ignore[assignment]
try:
import zoneinfo
except ImportError:
- zoneinfo = None
+ zoneinfo = None # type: ignore[assignment]
if sys.platform == 'win32':
try:
- import tzdata # noqa:F401
+ import tzdata # type: ignore[import-not-found, import-untyped] # noqa:F401
except ImportError:
- zoneinfo = None
+ zoneinfo = None # type: ignore[assignment]
try:
import numpy as np
except ImportError:
- np = None
+ np = None # type: ignore[assignment]
import pyarrow as pa
@@ -134,12 +135,12 @@
timezones = st.one_of(st.none(), st.timezones())
else:
timezones = st.none()
-timestamp_types = st.builds(
+timestamp_types: Any = st.builds(
pa.timestamp,
unit=st.sampled_from(['s', 'ms', 'us', 'ns']),
tz=timezones
)
-duration_types = st.builds(
+duration_types: Any = st.builds(
pa.duration,
st.sampled_from(['s', 'ms', 'us', 'ns'])
)
@@ -234,13 +235,13 @@ def schemas(type_strategy=primitive_types, max_fields=None):
all_types = st.deferred(
lambda: (
- primitive_types |
- list_types() |
- struct_types() |
- dictionary_types() |
- map_types() |
- list_types(all_types) |
- struct_types(all_types)
+ primitive_types
+ | list_types()
+ | struct_types()
+ | dictionary_types()
+ | map_types()
+ | list_types(all_types) # type: ignore[has-type]
+ | struct_types(all_types) # type: ignore[has-type]
)
)
all_fields = fields(all_types)
@@ -280,6 +281,7 @@ def arrays(draw, type, size=None, nullable=True):
elif not isinstance(size, int):
raise TypeError('Size must be an integer')
+ assert npst is not None
if pa.types.is_null(ty):
h.assume(nullable)
value = st.none()
@@ -292,6 +294,7 @@ def arrays(draw, type, size=None, nullable=True):
values = draw(npst.arrays(ty.to_pandas_dtype(), shape=(size,)))
# Workaround ARROW-4952: no easy way to assert array equality
# in a NaN-tolerant way.
+ assert np is not None
values[np.isnan(values)] = -42.0
return pa.array(values, type=ty)
elif pa.types.is_decimal(ty):
@@ -317,9 +320,11 @@ def arrays(draw, type, size=None, nullable=True):
offset = ty.tz.split(":")
offset_hours = int(offset[0])
offset_min = int(offset[1])
- tz = datetime.timedelta(hours=offset_hours, minutes=offset_min)
+ tz = datetime.timezone(
+ datetime.timedelta(hours=offset_hours, minutes=offset_min)
+ )
except ValueError:
- tz = zoneinfo.ZoneInfo(ty.tz)
+ tz = zoneinfo.ZoneInfo(str(ty.tz))
value = st.datetimes(timezones=st.just(tz), min_value=min_datetime,
max_value=max_datetime)
elif pa.types.is_duration(ty):
@@ -478,7 +483,9 @@ def pandas_compatible_list_types(
dictionary_types(
value_strategy=pandas_compatible_dictionary_value_types
),
- pandas_compatible_list_types(pandas_compatible_types),
- struct_types(pandas_compatible_types)
+ pandas_compatible_list_types(
+ pandas_compatible_types # type: ignore[has-type]
+ ),
+ struct_types(pandas_compatible_types) # type: ignore[has-type]
)
)
diff --git a/python/pyarrow/tests/test_acero.py b/python/pyarrow/tests/test_acero.py
index cb97e3849fd..1285534d08a 100644
--- a/python/pyarrow/tests/test_acero.py
+++ b/python/pyarrow/tests/test_acero.py
@@ -16,6 +16,7 @@
# under the License.
import pytest
+from typing import Literal, cast
import pyarrow as pa
import pyarrow.compute as pc
@@ -37,9 +38,10 @@
try:
import pyarrow.dataset as ds
- from pyarrow.acero import ScanNodeOptions
+ from pyarrow._dataset import ScanNodeOptions
except ImportError:
- ds = None
+ ds = None # type: ignore[assignment]
+ ScanNodeOptions = None # type: ignore[assignment, misc]
pytestmark = pytest.mark.acero
@@ -53,7 +55,6 @@ def table_source():
def test_declaration():
-
table = pa.table({'a': [1, 2, 3], 'b': [4, 5, 6]})
table_opts = TableSourceNodeOptions(table)
filter_opts = FilterNodeOptions(field('a') > 1)
@@ -89,7 +90,8 @@ def test_declaration_to_reader(table_source):
def test_table_source():
with pytest.raises(TypeError):
- TableSourceNodeOptions(pa.record_batch([pa.array([1, 2, 3])], ["a"]))
+ TableSourceNodeOptions(pa.record_batch(
+ [pa.array([1, 2, 3])], ["a"]))
table_source = TableSourceNodeOptions(None)
decl = Declaration("table_source", table_source)
@@ -110,9 +112,9 @@ def test_filter(table_source):
# requires a pyarrow Expression
with pytest.raises(TypeError):
- FilterNodeOptions(pa.array([True, False, True]))
+ FilterNodeOptions(pa.array([True, False, True])) # type: ignore[arg-type]
with pytest.raises(TypeError):
- FilterNodeOptions(None)
+ FilterNodeOptions(None) # type: ignore[arg-type]
@pytest.mark.parametrize('source', [
@@ -267,19 +269,23 @@ def test_order_by():
table = pa.table({'a': [1, 2, 3, 4], 'b': [1, 3, None, 2]})
table_source = Declaration("table_source", TableSourceNodeOptions(table))
- ord_opts = OrderByNodeOptions([("b", "ascending")])
+ sort_keys = [("b", "ascending")]
+ sort_keys = cast(list[tuple[str, Literal["ascending", "descending"]]], sort_keys)
+ ord_opts = OrderByNodeOptions(sort_keys)
decl = Declaration.from_sequence([table_source, Declaration("order_by", ord_opts)])
result = decl.to_table()
expected = pa.table({"a": [1, 4, 2, 3], "b": [1, 2, 3, None]})
assert result.equals(expected)
- ord_opts = OrderByNodeOptions([(field("b"), "descending")])
+ ord_opts = OrderByNodeOptions(
+ [(field("b"), "descending")]) # type: ignore[arg-type]
decl = Declaration.from_sequence([table_source, Declaration("order_by", ord_opts)])
result = decl.to_table()
expected = pa.table({"a": [2, 4, 1, 3], "b": [3, 2, 1, None]})
assert result.equals(expected)
- ord_opts = OrderByNodeOptions([(1, "descending")], null_placement="at_start")
+ ord_opts = OrderByNodeOptions(
+ [(1, "descending")], null_placement="at_start") # type: ignore[arg-type]
decl = Declaration.from_sequence([table_source, Declaration("order_by", ord_opts)])
result = decl.to_table()
expected = pa.table({"a": [3, 2, 4, 1], "b": [None, 3, 2, 1]})
@@ -294,10 +300,12 @@ def test_order_by():
_ = decl.to_table()
with pytest.raises(ValueError, match="\"decreasing\" is not a valid sort order"):
- _ = OrderByNodeOptions([("b", "decreasing")])
+ _ = OrderByNodeOptions([("b", "decreasing")]) # type: ignore[arg-type]
with pytest.raises(ValueError, match="\"start\" is not a valid null placement"):
- _ = OrderByNodeOptions([("b", "ascending")], null_placement="start")
+ _ = OrderByNodeOptions(
+ [("b", "ascending")], null_placement="start" # type: ignore[arg-type]
+ )
def test_hash_join():
@@ -382,7 +390,9 @@ def test_hash_join_with_residual_filter():
# test filter expression referencing columns from both side
join_opts = HashJoinNodeOptions(
"left outer", left_keys="key", right_keys="key",
- filter_expression=pc.equal(pc.field("a"), 5) | pc.equal(pc.field("b"), 10)
+ filter_expression=(
+ pc.equal(pc.field("a"), 5)
+ | pc.equal(pc.field("b"), 10)) # type: ignore[reportOperatorIssue]
)
joined = Declaration(
"hashjoin", options=join_opts, inputs=[left_source, right_source])
@@ -462,6 +472,8 @@ def test_asof_join():
@pytest.mark.dataset
def test_scan(tempdir):
+ assert ds is not None
+ assert ScanNodeOptions is not None
table = pa.table({'a': [1, 2, 3], 'b': [4, 5, 6]})
ds.write_dataset(table, tempdir / "dataset", format="parquet")
dataset = ds.dataset(tempdir / "dataset", format="parquet")
@@ -486,11 +498,10 @@ def test_scan(tempdir):
assert decl.to_table().num_rows == 0
# projection scan option
-
scan_opts = ScanNodeOptions(dataset, columns={"a2": pc.multiply(field("a"), 2)})
decl = Declaration("scan", scan_opts)
result = decl.to_table()
# "a" is included in the result (needed later on for the actual projection)
assert result["a"].to_pylist() == [1, 2, 3]
# "b" is still included, but without data as it will be removed by the projection
- assert pc.all(result["b"].is_null()).as_py()
+ assert pc.all(result.column("b").is_null()).as_py()
diff --git a/python/pyarrow/tests/test_adhoc_memory_leak.py b/python/pyarrow/tests/test_adhoc_memory_leak.py
index 76a766984da..9f61bc7ddfe 100644
--- a/python/pyarrow/tests/test_adhoc_memory_leak.py
+++ b/python/pyarrow/tests/test_adhoc_memory_leak.py
@@ -20,7 +20,7 @@
try:
import numpy as np
except ImportError:
- np = None
+ pass
import pyarrow as pa
import pyarrow.tests.util as test_util
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index ec361159c5f..969adcb87b0 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -18,19 +18,23 @@
from collections.abc import Iterable
import datetime
import decimal
-import hypothesis as h
-import hypothesis.strategies as st
+import hypothesis as h # type: ignore[import-not-found]
+import hypothesis.strategies as st # type: ignore[import-not-found]
import itertools
-import pytest
+import pytest # type: ignore[import-not-found]
import struct
import subprocess
import sys
import weakref
+from typing import TYPE_CHECKING
-try:
+if TYPE_CHECKING:
import numpy as np
-except ImportError:
- np = None
+else:
+ try:
+ import numpy as np
+ except ImportError:
+ np = None
import pyarrow as pa
import pyarrow.tests.strategies as past
@@ -71,7 +75,7 @@ def test_constructor_raises():
# This could happen by wrong capitalization.
# ARROW-2638: prevent calling extension class constructors directly
with pytest.raises(TypeError):
- pa.Array([1, 2])
+ pa.Array([1, 2]) # type: ignore[reportCallIssue]
def test_list_format():
@@ -321,11 +325,11 @@ def test_asarray():
arr = pa.array(range(4))
- # The iterator interface gives back an array of Int64Value's
+ # The iterator interface gives back an array of Int64Scalar's
np_arr = np.asarray([_ for _ in arr])
assert np_arr.tolist() == [0, 1, 2, 3]
assert np_arr.dtype == np.dtype('O')
- assert isinstance(np_arr[0], pa.lib.Int64Value)
+ assert isinstance(np_arr[0], pa.lib.Int64Scalar)
# Calling with the arrow array gives back an array with 'int64' dtype
np_arr = np.asarray(arr)
@@ -649,8 +653,8 @@ def test_array_eq():
@pytest.mark.numpy
def test_array_from_buffers():
- values_buf = pa.py_buffer(np.int16([4, 5, 6, 7]))
- nulls_buf = pa.py_buffer(np.uint8([0b00001101]))
+ values_buf = pa.py_buffer(np.array([4, 5, 6, 7], dtype=np.int16))
+ nulls_buf = pa.py_buffer(np.array([0b00001101], dtype=np.uint8))
arr = pa.Array.from_buffers(pa.int16(), 4, [nulls_buf, values_buf])
assert arr.type == pa.int16()
assert arr.to_pylist() == [4, None, 6, 7]
@@ -665,7 +669,9 @@ def test_array_from_buffers():
assert arr.to_pylist() == [None, 6, 7]
with pytest.raises(TypeError):
- pa.Array.from_buffers(pa.int16(), 3, ['', ''], offset=1)
+ pa.Array.from_buffers(
+ pa.int16(), 3, ['', ''], offset=1 # type: ignore[reportArgumentType]
+ )
def test_string_binary_from_buffers():
@@ -859,7 +865,8 @@ def test_struct_array_from_chunked():
chunked_arr = pa.chunked_array([[1, 2, 3], [4, 5, 6]])
with pytest.raises(TypeError, match="Expected Array"):
- pa.StructArray.from_arrays([chunked_arr], ["foo"])
+ pa.StructArray.from_arrays(
+ [chunked_arr], ["foo"]) # type: ignore[reportArgumentType]
@pytest.mark.parametrize("offset", (0, 1))
@@ -1179,24 +1186,24 @@ def test_map_from_arrays():
keys = pa.array(pykeys, type='binary')
items = pa.array(pyitems, type='i4')
- result = pa.MapArray.from_arrays(offsets, keys, items)
+ result = pa.MapArray.from_arrays(offsets, keys, items) # type: ignore[arg-type]
expected = pa.array(pyentries, type=pa.map_(pa.binary(), pa.int32()))
assert result.equals(expected)
# pass in the type explicitly
- result = pa.MapArray.from_arrays(offsets, keys, items, pa.map_(
- keys.type,
- items.type
- ))
+ result = pa.MapArray.from_arrays(offsets, keys, items, # type: ignore[arg-type]
+ pa.map_(keys.type, items.type))
assert result.equals(expected)
# pass in invalid types
with pytest.raises(pa.ArrowTypeError, match='Expected map type, got string'):
- pa.MapArray.from_arrays(offsets, keys, items, pa.string())
+ pa.MapArray.from_arrays(
+ offsets, keys, items, pa.string() # type: ignore[arg-type]
+ )
with pytest.raises(pa.ArrowTypeError, match='Mismatching map items type'):
- pa.MapArray.from_arrays(offsets, keys, items, pa.map_(
+ pa.MapArray.from_arrays(offsets, keys, items, pa.map_( # type: ignore[arg-type]
keys.type,
# Larger than the original i4
pa.int64()
@@ -1234,7 +1241,7 @@ def test_map_from_arrays():
# error if null bitmap and offsets with nulls passed
msg1 = 'Ambiguous to specify both validity map and offsets with nulls'
with pytest.raises(pa.ArrowInvalid, match=msg1):
- pa.MapArray.from_arrays(offsets, keys, items, pa.map_(
+ pa.MapArray.from_arrays(offsets, keys, items, pa.map_( # type: ignore[arg-type]
keys.type,
items.type),
mask=pa.array([False, True, False], type=pa.bool_())
@@ -2642,7 +2649,7 @@ def test_interval_array_from_relativedelta():
assert arr.type == pa.month_day_nano_interval()
expected_list = [
None,
- pa.MonthDayNano([13, 8,
+ pa.MonthDayNano([13, 8, # type: ignore[arg-type]
(datetime.timedelta(seconds=1, microseconds=1,
minutes=1, hours=1) //
datetime.timedelta(microseconds=1)) * 1000])]
@@ -2675,7 +2682,7 @@ def test_interval_array_from_tuple():
assert arr.type == pa.month_day_nano_interval()
expected_list = [
None,
- pa.MonthDayNano([1, 2, -3])]
+ pa.MonthDayNano([1, 2, -3])] # type: ignore[arg-type]
expected = pa.array(expected_list)
assert arr.equals(expected)
assert arr.to_pylist() == expected_list
@@ -2696,8 +2703,8 @@ def test_interval_array_from_dateoffset():
assert arr.type == pa.month_day_nano_interval()
expected_list = [
None,
- pa.MonthDayNano([13, 8, 3661000001001]),
- pa.MonthDayNano([0, 0, 0])]
+ pa.MonthDayNano([13, 8, 3661000001001]), # type: ignore[arg-type]
+ pa.MonthDayNano([0, 0, 0])] # type: ignore[arg-type]
expected = pa.array(expected_list)
assert arr.equals(expected)
expected_from_pandas = [
@@ -2861,7 +2868,7 @@ def test_buffers_primitive():
# Slicing does not affect the buffers but the offset
a_sliced = a[1:]
buffers = a_sliced.buffers()
- a_sliced.offset == 1
+ assert a_sliced.offset == 1
assert len(buffers) == 2
null_bitmap = buffers[0].to_pybytes()
assert 1 <= len(null_bitmap) <= 64 # XXX this is varying
@@ -2869,7 +2876,7 @@ def test_buffers_primitive():
assert struct.unpack('hhxxh', buffers[1].to_pybytes()) == (1, 2, 4)
- a = pa.array(np.int8([4, 5, 6]))
+ a = pa.array(np.array([4, 5, 6], dtype=np.int8))
buffers = a.buffers()
assert len(buffers) == 2
# No null bitmap from Numpy int array
@@ -2955,7 +2962,7 @@ def test_nbytes_size():
def test_invalid_tensor_constructor_repr():
# ARROW-2638: prevent calling extension class constructors directly
with pytest.raises(TypeError):
- repr(pa.Tensor([1]))
+ repr(pa.Tensor([1])) # type: ignore[reportCallIssue]
def test_invalid_tensor_construction():
@@ -3473,7 +3480,7 @@ def test_array_supported_masks():
with pytest.raises(pa.ArrowTypeError):
arr = pa.array([4, None, 4, 3],
- mask=[1.0, 2.0, 3.0, 4.0])
+ mask=[1.0, 2.0, 3.0, 4.0]) # type: ignore[reportArgumentType]
with pytest.raises(pa.ArrowTypeError):
arr = pa.array([4, None, 4, 3],
@@ -3760,11 +3767,11 @@ def test_concat_array_invalid_type():
# ARROW-9920 - do not segfault on non-array input
with pytest.raises(TypeError, match="should contain Array objects"):
- pa.concat_arrays([None])
+ pa.concat_arrays([None]) # type: ignore[reportArgumentType]
arr = pa.chunked_array([[0, 1], [3, 4]])
with pytest.raises(TypeError, match="should contain Array objects"):
- pa.concat_arrays(arr)
+ pa.concat_arrays(arr) # type: ignore[reportArgumentType]
@pytest.mark.pandas
@@ -4293,7 +4300,7 @@ def test_non_cpu_array():
with pytest.raises(NotImplementedError):
[i for i in iter(arr)]
with pytest.raises(NotImplementedError):
- arr == arr2
+ _ = arr == arr2
with pytest.raises(NotImplementedError):
arr.is_null()
with pytest.raises(NotImplementedError):
diff --git a/python/pyarrow/tests/test_cffi.py b/python/pyarrow/tests/test_cffi.py
index 481c387d533..f8abec90269 100644
--- a/python/pyarrow/tests/test_cffi.py
+++ b/python/pyarrow/tests/test_cffi.py
@@ -24,7 +24,7 @@
try:
from pyarrow.cffi import ffi
except ImportError:
- ffi = None
+ pass
import pytest
@@ -32,7 +32,7 @@
import pandas as pd
import pandas.testing as tm
except ImportError:
- pd = tm = None
+ pd = None # type: ignore[assignment]
needs_cffi = pytest.mark.skipif(ffi is None,
@@ -148,7 +148,7 @@ def test_export_import_type():
# Invalid format string
pa.int32()._export_to_c(ptr_schema)
bad_format = ffi.new("char[]", b"zzz")
- c_schema.format = bad_format
+ c_schema.format = bad_format # type: ignore[attr-defined]
with pytest.raises(ValueError,
match="Invalid or unsupported format string"):
pa.DataType._import_from_c(ptr_schema)
@@ -248,9 +248,9 @@ def test_export_import_device_array():
arr = pa.array([[1], [2, 42]], type=pa.list_(pa.int32()))
arr._export_to_c_device(ptr_array)
- assert c_array.device_type == 1 # ARROW_DEVICE_CPU 1
- assert c_array.device_id == -1
- assert c_array.array.length == 2
+ assert c_array.device_type == 1 # type: ignore[attr-defined] # ARROW_DEVICE_CPU 1
+ assert c_array.device_id == -1 # type: ignore[attr-defined]
+ assert c_array.array.length == 2 # type: ignore[attr-defined]
def check_export_import_schema(schema_factory, expected_schema_factory=None):
@@ -310,9 +310,10 @@ def test_export_import_schema_float_pointer():
match = "Passing a pointer value as a float is unsafe"
with pytest.warns(UserWarning, match=match):
- make_schema()._export_to_c(float(ptr_schema))
+ make_schema()._export_to_c(float(ptr_schema)) # type: ignore[arg-type]
with pytest.warns(UserWarning, match=match):
- schema_new = pa.Schema._import_from_c(float(ptr_schema))
+ schema_new = pa.Schema._import_from_c(
+ float(ptr_schema)) # type: ignore[arg-type]
assert schema_new == make_schema()
@@ -405,9 +406,9 @@ def test_export_import_device_batch():
ptr_array = int(ffi.cast("uintptr_t", c_array))
batch = make_batch()
batch._export_to_c_device(ptr_array)
- assert c_array.device_type == 1 # ARROW_DEVICE_CPU 1
- assert c_array.device_id == -1
- assert c_array.array.length == 2
+ assert c_array.device_type == 1 # type: ignore[attr-defined] # ARROW_DEVICE_CPU 1
+ assert c_array.device_id == -1 # type: ignore[attr-defined]
+ assert c_array.array.length == 2 # type: ignore[attr-defined]
def _export_import_batch_reader(ptr_stream, reader_factory):
@@ -764,7 +765,7 @@ def test_import_device_no_cuda():
# patch the device type of the struct, this results in an invalid ArrowDeviceArray
# but this is just to test we raise am error before actually importing buffers
- c_array.device_type = 2 # ARROW_DEVICE_CUDA
+ c_array.device_type = 2 # type: ignore[attr-defined] # ARROW_DEVICE_CUDA
with pytest.raises(ImportError, match="Trying to import data on a CUDA device"):
pa.Array._import_from_c_device(ptr_array, arr.type)
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index fe810a6dc90..0ea2590a9f5 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -31,12 +31,12 @@
try:
import numpy as np
except ImportError:
- np = None
+ pass
try:
import pandas as pd
except ImportError:
- pd = None
+ pass
import pyarrow as pa
import pyarrow.compute as pc
@@ -45,7 +45,7 @@
try:
import pyarrow.substrait as pas
except ImportError:
- pas = None
+ pas = None # type: ignore[assignment]
exported_functions = [
func for (name, func) in sorted(pc.__dict__.items())
@@ -329,9 +329,11 @@ def test_function_attributes():
def test_input_type_conversion():
# Automatic array conversion from Python
arr = pc.add([1, 2], [4, None])
+ assert isinstance(arr, pa.Array)
assert arr.to_pylist() == [5, None]
# Automatic scalar conversion from Python
arr = pc.add([1, 2], 4)
+ assert isinstance(arr, pa.Array)
assert arr.to_pylist() == [5, 6]
# Other scalar type
assert pc.equal(["foo", "bar", None],
@@ -779,9 +781,11 @@ def test_min_max():
assert s.as_py() == {'min': 1, 'max': 6}
s = pc.min_max(data, options=pc.ScalarAggregateOptions())
assert s.as_py() == {'min': 1, 'max': 6}
- s = pc.min_max(data, options=pc.ScalarAggregateOptions(skip_nulls=True))
+ s = pc.min_max(data, options=pc.ScalarAggregateOptions(
+ skip_nulls=True))
assert s.as_py() == {'min': 1, 'max': 6}
- s = pc.min_max(data, options=pc.ScalarAggregateOptions(skip_nulls=False))
+ s = pc.min_max(data, options=pc.ScalarAggregateOptions(
+ skip_nulls=False))
assert s.as_py() == {'min': None, 'max': None}
# Options as dict of kwargs
@@ -799,11 +803,11 @@ def test_min_max():
# Wrong options type
options = pc.TakeOptions()
with pytest.raises(TypeError):
- s = pc.min_max(data, options=options)
+ s = pc.min_max(data, options=options) # type: ignore[arg-type]
# Missing argument
with pytest.raises(TypeError, match="min_max takes 1 positional"):
- s = pc.min_max()
+ s = pc.min_max() # type: ignore[call-arg]
def test_any():
@@ -844,12 +848,12 @@ def test_all():
assert pc.all(a, options=options).as_py() is None
a = pa.chunked_array([[True], [True, None]])
- assert pc.all(a).as_py() is True
- assert pc.all(a, options=options).as_py() is None
+ assert pc.all(a).as_py() is True # type: ignore[arg-type]
+ assert pc.all(a, options=options).as_py() is None # type: ignore[arg-type]
a = pa.chunked_array([[True], [False]])
- assert pc.all(a).as_py() is False
- assert pc.all(a, options=options).as_py() is False
+ assert pc.all(a).as_py() is False # type: ignore[arg-type]
+ assert pc.all(a, options=options).as_py() is False # type: ignore[arg-type]
def test_is_valid():
@@ -858,7 +862,7 @@ def test_is_valid():
assert pc.is_valid(data).to_pylist() == [True, True, False]
with pytest.raises(TypeError):
- pc.is_valid(data, options=None)
+ pc.is_valid(data, options=None) # type: ignore[call-arg]
def test_generated_docstrings():
@@ -1037,21 +1041,6 @@ def find_new_unicode_codepoints():
0x2097, 0x2098, 0x2099, 0x209a, 0x209b, 0x209c,
0x2c7c, 0x2c7d, 0xa69c, 0xa69d, 0xa770, 0xa7f8,
0xa7f9, 0xab5c, 0xab5d, 0xab5e, 0xab5f, }
-# utf8proc does not store if a codepoint is numeric
-numeric_info_missing = {
- 0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03,
- 0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96,
- 0x4ebf, 0x4ec0, 0x4edf, 0x4ee8, 0x4f0d, 0x4f70,
- 0x5104, 0x5146, 0x5169, 0x516b, 0x516d, 0x5341,
- 0x5343, 0x5344, 0x5345, 0x534c, 0x53c1, 0x53c2,
- 0x53c3, 0x53c4, 0x56db, 0x58f1, 0x58f9, 0x5e7a,
- 0x5efe, 0x5eff, 0x5f0c, 0x5f0d, 0x5f0e, 0x5f10,
- 0x62fe, 0x634c, 0x67d2, 0x6f06, 0x7396, 0x767e,
- 0x8086, 0x842c, 0x8cae, 0x8cb3, 0x8d30, 0x9621,
- 0x9646, 0x964c, 0x9678, 0x96f6, 0xf96b, 0xf973,
- 0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, 0x10fc5,
- 0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 0x10fca,
- 0x10fcb, }
# utf8proc has no no digit/numeric information
digit_info_missing = {
0xb2, 0xb3, 0xb9, 0x1369, 0x136a, 0x136b, 0x136c,
@@ -1070,6 +1059,7 @@ def find_new_unicode_codepoints():
0x278f, 0x2790, 0x2791, 0x2792, 0x10a40, 0x10a41,
0x10a42, 0x10a43, 0x10e60, 0x10e61, 0x10e62, 0x10e63,
0x10e64, 0x10e65, 0x10e66, 0x10e67, 0x10e68, }
+# utf8proc does not store if a codepoint is numeric
numeric_info_missing = {
0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03,
0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96,
@@ -1104,7 +1094,7 @@ def test_string_py_compat_boolean(function_name, variant):
py_name = function_name.replace('_', '')
ignore = codepoints_ignore.get(function_name, set()) | \
find_new_unicode_codepoints()
- for i in range(128 if ascii else 0x11000):
+ for i in range(128 if ascii else 0x11000): # type: ignore[truthy-function]
if i in range(0xD800, 0xE000):
continue # bug? pyarrow doesn't allow utf16 surrogates
# the issues we know of, we skip
@@ -1593,10 +1583,10 @@ def test_filter_null_type():
@pytest.mark.parametrize("typ", ["array", "chunked_array"])
def test_compare_array(typ):
if typ == "array":
- def con(values):
+ def con(values): # type: ignore[no-redef]
return pa.array(values)
else:
- def con(values):
+ def con(values): # type: ignore[no-redef]
return pa.chunked_array([values])
arr1 = con([1, 2, 3, 4, None])
@@ -1624,10 +1614,10 @@ def con(values):
@pytest.mark.parametrize("typ", ["array", "chunked_array"])
def test_compare_string_scalar(typ):
if typ == "array":
- def con(values):
+ def con(values): # type: ignore[no-redef]
return pa.array(values)
else:
- def con(values):
+ def con(values): # type: ignore[no-redef]
return pa.chunked_array([values])
arr = con(['a', 'b', 'c', None])
@@ -1661,10 +1651,10 @@ def con(values):
@pytest.mark.parametrize("typ", ["array", "chunked_array"])
def test_compare_scalar(typ):
if typ == "array":
- def con(values):
+ def con(values): # type: ignore[no-redef]
return pa.array(values)
else:
- def con(values):
+ def con(values): # type: ignore[no-redef]
return pa.chunked_array([values])
arr = con([1, 2, 3, None])
@@ -1757,8 +1747,9 @@ def test_round_to_integer(ty):
"half_to_odd": [3, 3, 4, 5, -3, -3, -4, None],
}
for round_mode, expected in rmode_and_expected.items():
- options = RoundOptions(round_mode=round_mode)
- result = round(values, options=options)
+ options = RoundOptions(
+ round_mode=round_mode) # type: ignore[arg-type]
+ result = round(values, options=options) # type: ignore[arg-type]
expected_array = pa.array(expected, type=pa.float64())
assert expected_array.equals(result)
@@ -1776,7 +1767,9 @@ def test_round():
for ndigits, expected in ndigits_and_expected.items():
options = pc.RoundOptions(ndigits, "half_towards_infinity")
result = pc.round(values, options=options)
- np.testing.assert_allclose(result, pa.array(expected), equal_nan=True)
+ assert isinstance(result, pa.Array)
+ np.testing.assert_allclose(
+ result, pa.array(expected), equal_nan=True)
assert pc.round(values, ndigits,
round_mode="half_towards_infinity") == result
assert pc.round(values, ndigits, "half_towards_infinity") == result
@@ -1796,6 +1789,7 @@ def test_round_to_multiple():
for multiple, expected in multiple_and_expected.items():
options = pc.RoundToMultipleOptions(multiple, "half_towards_infinity")
result = pc.round_to_multiple(values, options=options)
+ assert isinstance(result, pa.Array)
np.testing.assert_allclose(result, pa.array(expected), equal_nan=True)
assert pc.round_to_multiple(values, multiple,
"half_towards_infinity") == result
@@ -1803,11 +1797,11 @@ def test_round_to_multiple():
for multiple in [0, -2, pa.scalar(-10.4)]:
with pytest.raises(pa.ArrowInvalid,
match="Rounding multiple must be positive"):
- pc.round_to_multiple(values, multiple=multiple)
+ pc.round_to_multiple(values, multiple=multiple) # type: ignore[arg-type]
for multiple in [object, 99999999999999999999999]:
with pytest.raises(TypeError, match="is not a valid multiple type"):
- pc.round_to_multiple(values, multiple=multiple)
+ pc.round_to_multiple(values, multiple=multiple) # type: ignore[arg-type]
def test_round_binary():
@@ -1992,7 +1986,8 @@ def test_logical():
def test_dictionary_decode():
array = pa.array(["a", "a", "b", "c", "b"])
dictionary_array = array.dictionary_encode()
- dictionary_array_decode = pc.dictionary_decode(dictionary_array)
+ dictionary_array_decode = pc.dictionary_decode(
+ dictionary_array)
assert array != dictionary_array
@@ -2172,7 +2167,7 @@ def check_cast_float_to_decimal(float_ty, float_val, decimal_ty, decimal_ctx,
# Round `expected` to `scale` digits after the decimal point
expected = expected.quantize(decimal.Decimal(1).scaleb(-decimal_ty.scale))
s = pa.scalar(float_val, type=float_ty)
- actual = pc.cast(s, decimal_ty).as_py()
+ actual = pc.cast(s, decimal_ty).as_py() # type: ignore[union-attr]
if actual != expected:
# Allow the last digit to vary. The tolerance is higher for
# very high precisions as rounding errors can accumulate in
@@ -2264,8 +2259,9 @@ def test_cast_float_to_decimal_random(float_ty, decimal_traits):
expected = decimal.Decimal(mantissa) / 2**-float_exp
expected_as_int = round(expected.scaleb(scale))
actual = pc.cast(
- pa.scalar(float_val, type=float_ty), decimal_ty).as_py()
- actual_as_int = round(actual.scaleb(scale))
+ pa.scalar(float_val, type=float_ty), decimal_ty
+ ).as_py() # type: ignore[union-attr]
+ actual_as_int = round(actual.scaleb(scale)) # type: ignore[union-attr]
# We allow for a minor rounding error between expected and actual
assert abs(actual_as_int - expected_as_int) <= 1
@@ -2301,7 +2297,7 @@ def test_strptime():
@pytest.mark.pandas
@pytest.mark.timezone_data
def test_strftime():
- times = ["2018-03-10 09:00", "2038-01-31 12:23", None]
+ times: list[str | None] = ["2018-03-10 09:00", "2038-01-31 12:23", None]
timezones = ["CET", "UTC", "Europe/Ljubljana"]
formats = ["%a", "%A", "%w", "%d", "%b", "%B", "%m", "%y", "%Y", "%H", "%I",
@@ -2311,14 +2307,15 @@ def test_strftime():
formats.extend(["%c", "%x", "%X"])
for timezone in timezones:
- ts = pd.to_datetime(times).tz_localize(timezone)
+ ts = pd.to_datetime(times).tz_localize(timezone) # type: ignore[no-matching-overload]
for unit in ["s", "ms", "us", "ns"]:
tsa = pa.array(ts, type=pa.timestamp(unit, timezone))
for fmt in formats:
options = pc.StrftimeOptions(fmt)
result = pc.strftime(tsa, options=options)
+ st = ts.strftime(fmt) # type: ignore[call-non-callable]
# cast to the same type as result to ignore string vs large_string
- expected = pa.array(ts.strftime(fmt)).cast(result.type)
+ expected = pa.array(st).cast(result.type)
assert result.equals(expected)
fmt = "%Y-%m-%dT%H:%M:%S"
@@ -2326,42 +2323,48 @@ def test_strftime():
# Default format
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
result = pc.strftime(tsa, options=pc.StrftimeOptions())
- expected = pa.array(ts.strftime(fmt)).cast(result.type)
+ st = ts.strftime(fmt) # type: ignore[call-non-callable]
+ expected = pa.array(st).cast(result.type)
assert result.equals(expected)
# Default format plus timezone
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z"))
- expected = pa.array(ts.strftime(fmt + "%Z")).cast(result.type)
+ st = ts.strftime(fmt + "%Z") # type: ignore[call-non-callable]
+ expected = pa.array(st).cast(result.type)
assert result.equals(expected)
# Pandas %S is equivalent to %S in arrow for unit="s"
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
options = pc.StrftimeOptions("%S")
result = pc.strftime(tsa, options=options)
- expected = pa.array(ts.strftime("%S")).cast(result.type)
+ st = ts.strftime("%S") # type: ignore[call-non-callable]
+ expected = pa.array(st).cast(result.type)
assert result.equals(expected)
# Pandas %S.%f is equivalent to %S in arrow for unit="us"
tsa = pa.array(ts, type=pa.timestamp("us", timezone))
options = pc.StrftimeOptions("%S")
result = pc.strftime(tsa, options=options)
- expected = pa.array(ts.strftime("%S.%f")).cast(result.type)
+ st = ts.strftime("%S.%f") # type: ignore[call-non-callable]
+ expected = pa.array(st).cast(result.type)
assert result.equals(expected)
# Test setting locale
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
options = pc.StrftimeOptions(fmt, locale="C")
result = pc.strftime(tsa, options=options)
- expected = pa.array(ts.strftime(fmt)).cast(result.type)
+ st = ts.strftime(fmt) # type: ignore[call-non-callable]
+ expected = pa.array(st).cast(result.type)
assert result.equals(expected)
# Test timestamps without timezone
fmt = "%Y-%m-%dT%H:%M:%S"
- ts = pd.to_datetime(times)
+ ts = pd.to_datetime(times) # type: ignore[no-matching-overload]
tsa = pa.array(ts, type=pa.timestamp("s"))
result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt))
- expected = pa.array(ts.strftime(fmt)).cast(result.type)
+ st = ts.strftime(fmt) # type: ignore[call-non-callable]
+ expected = pa.array(st).cast(result.type)
# Positional format
assert pc.strftime(tsa, fmt) == result
@@ -2490,10 +2493,11 @@ def test_extract_datetime_components(request):
def test_offset_timezone():
- arr = pc.strptime(["2012-12-12T12:12:12"], format="%Y-%m-%dT%H:%M:%S", unit="s")
+ arr = pc.strptime(pa.array(["2012-12-12T12:12:12"]),
+ format="%Y-%m-%dT%H:%M:%S", unit="s")
zoned_arr = arr.cast(pa.timestamp("s", tz="+05:30"))
- assert pc.hour(zoned_arr)[0].as_py() == 17
- assert pc.minute(zoned_arr)[0].as_py() == 42
+ assert pc.hour(zoned_arr)[0].as_py() == 17 # type: ignore[index,arg-type]
+ assert pc.minute(zoned_arr)[0].as_py() == 42 # type: ignore[index,arg-type]
@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"])
@@ -2590,12 +2594,14 @@ def test_assume_timezone():
f"timezone '{timezone}'"):
pc.assume_timezone(ambiguous_array, options=options_ambiguous_raise)
- expected = ambiguous.tz_localize(timezone, ambiguous=[True, True, True])
+ expected = ambiguous.tz_localize(
+ timezone, ambiguous=np.array([True, True, True]))
result = pc.assume_timezone(
ambiguous_array, options=options_ambiguous_earliest)
result.equals(pa.array(expected))
- expected = ambiguous.tz_localize(timezone, ambiguous=[False, False, False])
+ expected = ambiguous.tz_localize(
+ timezone, ambiguous=np.array([False, False, False]))
result = pc.assume_timezone(
ambiguous_array, options=options_ambiguous_latest)
result.equals(pa.array(expected))
@@ -2684,7 +2690,9 @@ def _check_temporal_rounding(ts, values, unit):
expected = np.where(
expected == ts,
- expected + pd.Timedelta(value, unit_shorthand[unit]),
+ expected + pd.Timedelta(
+ value, unit_shorthand[unit] # type: ignore[arg-type]
+ ),
expected)
np.testing.assert_array_equal(result, expected)
@@ -2746,7 +2754,7 @@ def test_count():
with pytest.raises(ValueError,
match='"something else" is not a valid count mode'):
- pc.count(arr, 'something else')
+ pc.count(arr, 'something else') # type: ignore[arg-type]
def test_index():
@@ -2796,7 +2804,7 @@ def test_partition_nth():
with pytest.raises(
ValueError,
match="'partition_nth_indices' cannot be called without options"):
- pc.partition_nth_indices(data)
+ pc.partition_nth_indices(data) # type: ignore[call-arg]
def test_partition_nth_null_placement():
@@ -2918,7 +2926,7 @@ def test_array_sort_indices():
assert result.to_pylist() == [2, 1, 0, 3]
with pytest.raises(ValueError, match="not a valid sort order"):
- pc.array_sort_indices(arr, order="nonscending")
+ pc.array_sort_indices(arr, order="nonscending") # type: ignore[arg-type]
def test_sort_indices_array():
@@ -2981,23 +2989,29 @@ def test_sort_indices_table():
pc.sort_indices(table, sort_keys=[("unknown", "ascending")])
with pytest.raises(ValueError, match="not a valid sort order"):
- pc.sort_indices(table, sort_keys=[("a", "nonscending")])
+ pc.sort_indices(
+ table, sort_keys=[("a", "nonscending")] # type: ignore[list-item]
+ )
def test_is_in():
arr = pa.array([1, 2, None, 1, 2, 3])
result = pc.is_in(arr, value_set=pa.array([1, 3, None]))
- assert result.to_pylist() == [True, False, True, True, False, True]
+ assert result.to_pylist() == [True, False, True, True,
+ False, True]
result = pc.is_in(arr, value_set=pa.array([1, 3, None]), skip_nulls=True)
- assert result.to_pylist() == [True, False, False, True, False, True]
+ assert result.to_pylist() == [True, False, False, True,
+ False, True]
result = pc.is_in(arr, value_set=pa.array([1, 3]))
- assert result.to_pylist() == [True, False, False, True, False, True]
+ assert result.to_pylist() == [True, False, False, True,
+ False, True]
result = pc.is_in(arr, value_set=pa.array([1, 3]), skip_nulls=True)
- assert result.to_pylist() == [True, False, False, True, False, True]
+ assert result.to_pylist() == [True, False, False, True,
+ False, True]
def test_index_in():
@@ -3061,7 +3075,7 @@ def test_quantile():
with pytest.raises(ValueError, match="Quantile must be between 0 and 1"):
pc.quantile(arr, q=1.1)
with pytest.raises(ValueError, match="not a valid quantile interpolation"):
- pc.quantile(arr, interpolation='zzz')
+ pc.quantile(arr, interpolation='zzz') # type: ignore[arg-type]
def test_tdigest():
@@ -3170,12 +3184,13 @@ def test_cumulative_sum(start, skip_nulls):
# Add `start` offset to expected array before comparing
expected = pc.add(expected_arrays[i], strt if strt is not None
else 0)
+ assert isinstance(expected, pa.Array)
np.testing.assert_array_almost_equal(result.to_numpy(
zero_copy_only=False), expected.to_numpy(zero_copy_only=False))
for strt in ['a', pa.scalar('arrow'), 1.1]:
with pytest.raises(pa.ArrowInvalid):
- pc.cumulative_sum([1, 2, 3], start=strt)
+ pc.cumulative_sum([1, 2, 3], start=strt) # type: ignore[arg-type]
@pytest.mark.numpy
@@ -3225,6 +3240,7 @@ def test_cumulative_prod(start, skip_nulls):
# Multiply `start` offset to expected array before comparing
expected = pc.multiply(expected_arrays[i], strt if strt is not None
else 1)
+ assert isinstance(expected, pa.Array)
np.testing.assert_array_almost_equal(result.to_numpy(
zero_copy_only=False), expected.to_numpy(zero_copy_only=False))
@@ -3283,8 +3299,10 @@ def test_cumulative_max(start, skip_nulls):
expected = pc.max_element_wise(
expected_arrays[i], strt if strt is not None else -1e9,
skip_nulls=False)
- np.testing.assert_array_almost_equal(result.to_numpy(
- zero_copy_only=False), expected.to_numpy(zero_copy_only=False))
+ np.testing.assert_array_almost_equal(
+ result.to_numpy(zero_copy_only=False),
+ expected.to_numpy(zero_copy_only=False)
+ )
for strt in ['a', pa.scalar('arrow'), 1.1]:
with pytest.raises(pa.ArrowInvalid):
@@ -3341,8 +3359,10 @@ def test_cumulative_min(start, skip_nulls):
expected = pc.min_element_wise(
expected_arrays[i], strt if strt is not None else 1e9,
skip_nulls=False)
- np.testing.assert_array_almost_equal(result.to_numpy(
- zero_copy_only=False), expected.to_numpy(zero_copy_only=False))
+ np.testing.assert_array_almost_equal(
+ result.to_numpy(zero_copy_only=False),
+ expected.to_numpy(zero_copy_only=False)
+ )
for strt in ['a', pa.scalar('arrow'), 1.1]:
with pytest.raises(pa.ArrowInvalid):
@@ -3420,7 +3440,7 @@ def test_struct_fields_options():
pc.struct_field(arr, '.a.foo')
with pytest.raises(pa.ArrowInvalid, match="cannot be called without options"):
- pc.struct_field(arr)
+ pc.struct_field(arr) # type: ignore[call-arg]
def test_case_when():
@@ -3472,7 +3492,7 @@ def test_utf8_normalize():
with pytest.raises(
ValueError,
match='"NFZ" is not a valid Unicode normalization form'):
- pc.utf8_normalize(arr, form="NFZ")
+ pc.utf8_normalize(arr, form="NFZ") # type: ignore[arg-type]
def test_random():
@@ -3499,7 +3519,7 @@ def test_random():
with pytest.raises(TypeError,
match=r"initializer should be 'system', an integer, "
r"or a hashable object; got \[\]"):
- pc.random(100, initializer=[])
+ pc.random(100, initializer=[]) # type: ignore[arg-type]
@pytest.mark.parametrize(
@@ -3549,7 +3569,7 @@ def test_rank_options():
match=r'"NonExisting" is not a valid tiebreaker'):
pc.RankOptions(sort_keys="descending",
null_placement="at_end",
- tiebreaker="NonExisting")
+ tiebreaker="NonExisting") # type: ignore[arg-type]
def test_rank_quantile_options():
@@ -3579,7 +3599,7 @@ def test_rank_quantile_options():
assert result.equals(expected_descending)
with pytest.raises(ValueError, match="not a valid sort order"):
- pc.rank_quantile(arr, sort_keys="XXX")
+ pc.rank_quantile(arr, sort_keys="XXX") # type: ignore[arg-type]
def test_rank_normal_options():
@@ -3765,21 +3785,21 @@ def test_expression_construction():
nested_field = pc.field(("nested", "field"))
nested_field2 = pc.field("nested", "field")
- zero | one == string
- ~true == false
+ _ = zero | one == string
+ _ = ~true == false
for typ in ("bool", pa.bool_()):
- field.cast(typ) == true
+ _ = field.cast(typ) == true
- field.isin([1, 2])
- nested_mixed_types.isin(["foo", "bar"])
+ _ = field.isin([1, 2])
+ _ = nested_mixed_types.isin(["foo", "bar"])
nested_field.isin(["foo", "bar"])
nested_field2.isin(["foo", "bar"])
with pytest.raises(TypeError):
- field.isin(1)
+ field.isin(1) # type: ignore[arg-type]
with pytest.raises(pa.ArrowInvalid):
- field != object()
+ _ = field != object()
def test_expression_boolean_operators():
@@ -3788,16 +3808,16 @@ def test_expression_boolean_operators():
false = pc.scalar(False)
with pytest.raises(ValueError, match="cannot be evaluated to python True"):
- true and false
+ _ = true and false
with pytest.raises(ValueError, match="cannot be evaluated to python True"):
- true or false
+ _ = true or false
with pytest.raises(ValueError, match="cannot be evaluated to python True"):
bool(true)
with pytest.raises(ValueError, match="cannot be evaluated to python True"):
- not true
+ _ = not true
def test_expression_call_function():
@@ -3826,7 +3846,7 @@ def test_cast_table_raises():
table = pa.table({'a': [1, 2]})
with pytest.raises(pa.lib.ArrowTypeError):
- pc.cast(table, pa.int64())
+ pc.cast(table, pa.int64()) # type: ignore[arg-type]
@pytest.mark.parametrize("start,stop,expected", (
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index 07286125c4c..b5a472e3225 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -21,13 +21,18 @@
import itertools
import math
import re
+from typing import TYPE_CHECKING, cast
import hypothesis as h
import pytest
-try:
+
+if TYPE_CHECKING:
import numpy as np
-except ImportError:
- np = None
+else:
+ try:
+ import numpy as np
+ except ImportError:
+ np = None
from pyarrow.pandas_compat import _pandas_api # noqa
import pyarrow as pa
@@ -66,7 +71,7 @@ def __int__(self):
class MyBrokenInt:
def __int__(self):
- 1/0 # MARKER
+ _ = 1/0 # MARKER
def check_struct_type(ty, expected):
@@ -145,7 +150,7 @@ def test_object_with_getitem():
# https://github.com/apache/arrow/issues/34944
# considered as sequence because of __getitem__, but has no length
with pytest.raises(TypeError, match="has no len()"):
- pa.array(ObjectWithOnlyGetitem())
+ pa.array(ObjectWithOnlyGetitem()) # type: ignore[arg-type]
def _as_list(xs):
@@ -853,7 +858,7 @@ def test_large_binary_value(ty):
assert isinstance(arr, pa.Array)
assert arr.type == ty
assert len(arr) == 4
- buf = arr[1].as_buffer()
+ buf = cast(pa.FixedSizeBinaryScalar, arr[1]).as_buffer()
assert len(buf) == len(s) * nrepeats
@@ -1099,11 +1104,11 @@ def expected_datetime_value(dt):
),
]
utcdata = [
- pytz.utc.localize(data[0]),
+ pytz.utc.localize(cast(datetime.datetime, data[0])),
data[1],
None,
- data[3].astimezone(pytz.utc),
- data[4].astimezone(pytz.utc),
+ cast(datetime.datetime, data[3]).astimezone(pytz.utc),
+ cast(datetime.datetime, data[4]).astimezone(pytz.utc),
]
ty = pa.timestamp(unit, tz=timezone)
@@ -1231,9 +1236,9 @@ def test_sequence_timestamp_from_mixed_builtin_and_pandas_datetimes():
None,
]
utcdata = [
- data[0].astimezone(pytz.utc),
- pytz.utc.localize(data[1]),
- data[2].astimezone(pytz.utc),
+ cast(datetime.datetime, data[0]).astimezone(pytz.utc),
+ pytz.utc.localize(cast(datetime.datetime, data[1])),
+ cast(datetime.datetime, data[2]).astimezone(pytz.utc),
None,
]
@@ -2062,8 +2067,8 @@ def test_map_from_dicts():
assert arr.to_pylist() == expected
# With omitted values
- data[1] = None
- expected[1] = None
+ data[1] = None # type: ignore[call-overload]
+ expected[1] = None # type: ignore[call-overload]
arr = pa.array(expected, type=pa.map_(pa.binary(), pa.int32()))
@@ -2388,6 +2393,7 @@ def test_nested_auto_chunking(ty, char):
}
+@pytest.mark.numpy
@pytest.mark.large_memory
def test_array_from_pylist_data_overflow():
# Regression test for ARROW-12983
@@ -2410,6 +2416,7 @@ def test_array_from_pylist_data_overflow():
assert len(arr.chunks) > 1
+@pytest.mark.numpy
@pytest.mark.slow
@pytest.mark.large_memory
def test_array_from_pylist_offset_overflow():
@@ -2434,6 +2441,7 @@ def test_array_from_pylist_offset_overflow():
assert len(arr.chunks) > 1
+@pytest.mark.numpy
@parametrize_with_collections_types
@pytest.mark.parametrize(('data', 'scalar_data', 'value_type'), [
([True, False, None], [pa.scalar(True), pa.scalar(False), None], pa.bool_()),
@@ -2471,8 +2479,10 @@ def test_array_from_pylist_offset_overflow():
pa.timestamp('us')
),
(
- [pa.MonthDayNano([1, -1, -10100])],
- [pa.scalar(pa.MonthDayNano([1, -1, -10100]))],
+ [pa.MonthDayNano([1, -1, -10100])], # type: ignore[call-arg, arg-type]
+ [pa.scalar(
+ pa.MonthDayNano([1, -1, -10100]) # type: ignore[call-arg, arg-type]
+ )],
pa.month_day_nano_interval()
),
(["a", "b"], [pa.scalar("a"), pa.scalar("b")], pa.string()),
diff --git a/python/pyarrow/tests/test_cpp_internals.py b/python/pyarrow/tests/test_cpp_internals.py
index 7508d8f0b98..7d652acf62f 100644
--- a/python/pyarrow/tests/test_cpp_internals.py
+++ b/python/pyarrow/tests/test_cpp_internals.py
@@ -20,7 +20,8 @@
import pytest
-from pyarrow._pyarrow_cpp_tests import get_cpp_tests
+from pyarrow._pyarrow_cpp_tests import ( # type: ignore[import-not-found, import-untyped] # noqa: E501
+ get_cpp_tests)
def inject_cpp_tests(ns):
diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
index f510c6dbe23..530332b2124 100644
--- a/python/pyarrow/tests/test_csv.py
+++ b/python/pyarrow/tests/test_csv.py
@@ -178,6 +178,7 @@ def test_read_options(pickle_module):
encoding='utf16',
skip_rows_after_names=27)
+ assert opts.block_size is not None
assert opts.block_size > 0
opts.block_size = 12345
assert opts.block_size == 12345
@@ -302,6 +303,7 @@ def test_convert_options(pickle_module):
with pytest.raises(ValueError):
opts.decimal_point = '..'
+ assert opts.auto_dict_max_cardinality is not None
assert opts.auto_dict_max_cardinality > 0
opts.auto_dict_max_cardinality = 99999
assert opts.auto_dict_max_cardinality == 99999
@@ -323,7 +325,7 @@ def test_convert_options(pickle_module):
with pytest.raises(TypeError, match='DataType expected'):
opts.column_types = {'a': None}
with pytest.raises(TypeError):
- opts.column_types = 0
+ opts.column_types = 0 # type: ignore[reportAttributeAccessIssue]
assert isinstance(opts.null_values, list)
assert '' in opts.null_values
@@ -1158,10 +1160,14 @@ def test_auto_dict_encode(self):
table = self.read_bytes(rows, convert_options=opts,
validate_full=False)
assert table.schema == schema
- dict_values = table['a'].chunk(0).dictionary
+ column_chunk = table.column('a').chunk(0)
+ assert isinstance(column_chunk, pa.DictionaryArray)
+ dict_values = column_chunk.dictionary
assert len(dict_values) == 2
assert dict_values[0].as_py() == "ab"
- assert dict_values[1].as_buffer() == b"cd\xff"
+ dict_value = dict_values[1]
+ assert isinstance(dict_value, pa.StringScalar)
+ assert dict_value.as_buffer() == b"cd\xff"
# With invalid UTF8, checked
opts.check_utf8 = True
@@ -1502,7 +1508,7 @@ def signal_from_thread():
# Interruption should have arrived timely
assert last_duration <= 2.0
- e = exc_info.__context__
+ e = exc_info.__context__ # type: ignore[possibly-missing-attribute, misc]
assert isinstance(e, pa.ArrowCancelled)
assert e.signum == signal.SIGINT
@@ -1866,6 +1872,9 @@ def use_threads(self):
class BaseTestCompressedCSVRead:
+ def write_file(self, path, contents):
+ pass
+ csv_filename = ""
def setUp(self):
self.tmpdir = tempfile.mkdtemp(prefix='arrow-csv-test-')
@@ -1997,7 +2006,7 @@ def test_write_quoting_style():
except Exception as e:
# This will trigger when we try to write a comma (,)
# without quotes, which is invalid
- assert isinstance(e, res)
+ assert isinstance(e, res) # type: ignore[invalid-argument-type]
break
assert buf.getvalue() == res
buf.seek(0)
diff --git a/python/pyarrow/tests/test_cuda.py b/python/pyarrow/tests/test_cuda.py
index e06f479987c..9d03a3bbff2 100644
--- a/python/pyarrow/tests/test_cuda.py
+++ b/python/pyarrow/tests/test_cuda.py
@@ -103,6 +103,7 @@ def make_random_buffer(size, target='host'):
assert size >= 0
buf = pa.allocate_buffer(size)
assert buf.size == size
+ assert isinstance(buf, pa.Buffer)
arr = np.frombuffer(buf, dtype=np.uint8)
assert arr.size == size
arr[:] = np.random.randint(low=1, high=255, size=size, dtype=np.uint8)
@@ -194,12 +195,14 @@ def test_context_device_buffer(size):
np.testing.assert_equal(arr[soffset:soffset + ssize], arr2)
# Creating a device buffer from a slice of an array
- cudabuf = global_context.buffer_from_data(arr, offset=soffset, size=ssize)
+ cudabuf = global_context.buffer_from_data(
+ arr, offset=soffset, size=ssize)
assert cudabuf.size == ssize
arr2 = np.frombuffer(cudabuf.copy_to_host(), dtype=np.uint8)
np.testing.assert_equal(arr[soffset:soffset + ssize], arr2)
- cudabuf = global_context.buffer_from_data(arr[soffset:soffset+ssize])
+ cudabuf = global_context.buffer_from_data(
+ arr[soffset:soffset+ssize])
assert cudabuf.size == ssize
arr2 = np.frombuffer(cudabuf.copy_to_host(), dtype=np.uint8)
np.testing.assert_equal(arr[soffset:soffset + ssize], arr2)
@@ -235,7 +238,8 @@ def test_context_device_buffer(size):
# Creating device buffer from HostBuffer slice
- cudabuf = global_context.buffer_from_data(buf, offset=soffset, size=ssize)
+ cudabuf = global_context.buffer_from_data(
+ buf, offset=soffset, size=ssize)
assert cudabuf.size == ssize
arr2 = np.frombuffer(cudabuf.copy_to_host(), dtype=np.uint8)
np.testing.assert_equal(arr[soffset:soffset+ssize], arr2)
@@ -384,7 +388,8 @@ def test_copy_from_to_host(size):
device_buffer.copy_from_host(buf, position=0, nbytes=nbytes)
# Copy back to host and compare contents
- buf2 = device_buffer.copy_to_host(position=0, nbytes=nbytes)
+ buf2 = device_buffer.copy_to_host(
+ position=0, nbytes=nbytes)
arr2 = np.frombuffer(buf2, dtype=dt)
np.testing.assert_equal(arr, arr2)
@@ -395,7 +400,8 @@ def test_copy_to_host(size):
buf = dbuf.copy_to_host()
assert buf.is_cpu
- np.testing.assert_equal(arr, np.frombuffer(buf, dtype=np.uint8))
+ np.testing.assert_equal(arr, np.frombuffer(
+ buf, dtype=np.uint8))
buf = dbuf.copy_to_host(position=size//4)
assert buf.is_cpu
@@ -437,11 +443,13 @@ def test_copy_to_host(size):
np.frombuffer(buf, dtype=np.uint8))
dbuf.copy_to_host(buf=buf, nbytes=12)
- np.testing.assert_equal(arr[:12], np.frombuffer(buf, dtype=np.uint8)[:12])
+ np.testing.assert_equal(arr[:12], np.frombuffer(
+ buf, dtype=np.uint8)[:12])
dbuf.copy_to_host(buf=buf, nbytes=12, position=6)
- np.testing.assert_equal(arr[6:6+12],
- np.frombuffer(buf, dtype=np.uint8)[:12])
+ np.testing.assert_equal(
+ arr[6:6+12], np.frombuffer(buf, dtype=np.uint8)[:12]
+ )
for (position, nbytes) in [
(0, size+10), (10, size-5),
@@ -450,7 +458,8 @@ def test_copy_to_host(size):
with pytest.raises(ValueError,
match=('requested copy does not '
'fit into host buffer')):
- dbuf.copy_to_host(buf=buf, position=position, nbytes=nbytes)
+ dbuf.copy_to_host(
+ buf=buf, position=position, nbytes=nbytes)
@pytest.mark.parametrize("dest_ctx", ['same', 'another'])
@@ -460,7 +469,9 @@ def test_copy_from_device(dest_ctx, size):
lst = arr.tolist()
if dest_ctx == 'another':
dest_ctx = global_context1
- if buf.context.device_number == dest_ctx.device_number:
+ if (
+ buf.context.device_number == dest_ctx.device_number
+ ):
pytest.skip("not a multi-GPU system")
else:
dest_ctx = buf.context
@@ -563,7 +574,10 @@ def test_buffer_device():
_, buf = make_random_buffer(size=10, target='device')
assert buf.device_type == pa.DeviceAllocationType.CUDA
assert isinstance(buf.device, pa.Device)
- assert buf.device == global_context.memory_manager.device
+ assert (
+ buf.device ==
+ global_context.memory_manager.device
+ )
assert isinstance(buf.memory_manager, pa.MemoryManager)
assert not buf.is_cpu
assert not buf.device.is_cpu
@@ -807,8 +821,9 @@ def test_create_table_with_device_buffers():
def other_process_for_test_IPC(handle_buffer, expected_arr):
- other_context = pa.cuda.Context(0)
- ipc_handle = pa.cuda.IpcMemHandle.from_buffer(handle_buffer)
+ other_context = cuda.Context(0)
+ ipc_handle = cuda.IpcMemHandle.from_buffer(
+ handle_buffer)
ipc_buf = other_context.open_ipc_buffer(ipc_handle)
ipc_buf.context.synchronize()
buf = ipc_buf.copy_to_host()
@@ -848,7 +863,8 @@ def test_copy_to():
batch = pa.record_batch({"col": arr})
batch_cuda = batch.copy_to(dest)
- buf_cuda = batch_cuda["col"].buffers()[1]
+ buf_cuda = batch_cuda.column("col").buffers()[1]
+ assert buf_cuda is not None
assert not buf_cuda.is_cpu
assert buf_cuda.device_type == pa.DeviceAllocationType.CUDA
assert buf_cuda.device == mm_cuda.device
@@ -949,7 +965,8 @@ def test_device_interface_batch_array():
cbatch._export_to_c_device(ptr_array, ptr_schema)
# Delete and recreate C++ objects from exported pointers
del cbatch
- cbatch_new = pa.RecordBatch._import_from_c_device(ptr_array, ptr_schema)
+ cbatch_new = pa.RecordBatch._import_from_c_device(
+ ptr_array, ptr_schema)
assert cbatch_new.schema == schema
batch_new = cbatch_new.copy_to(pa.default_cpu_memory_manager())
assert batch_new.equals(batch)
@@ -957,13 +974,15 @@ def test_device_interface_batch_array():
del cbatch_new
# Now released
with pytest.raises(ValueError, match="Cannot import released ArrowSchema"):
- pa.RecordBatch._import_from_c_device(ptr_array, ptr_schema)
+ pa.RecordBatch._import_from_c_device(
+ ptr_array, ptr_schema)
# Not a struct type
pa.int32()._export_to_c(ptr_schema)
with pytest.raises(ValueError,
match="ArrowSchema describes non-struct type"):
- pa.RecordBatch._import_from_c_device(ptr_array, ptr_schema)
+ pa.RecordBatch._import_from_c_device(
+ ptr_array, ptr_schema)
def test_print_array():
diff --git a/python/pyarrow/tests/test_cuda_numba_interop.py b/python/pyarrow/tests/test_cuda_numba_interop.py
index 876f3c7f761..4a5bc797533 100644
--- a/python/pyarrow/tests/test_cuda_numba_interop.py
+++ b/python/pyarrow/tests/test_cuda_numba_interop.py
@@ -28,7 +28,6 @@
from numba.cuda.cudadrv.devicearray import DeviceNDArray # noqa: E402
-
context_choices = None
context_choice_ids = ['pyarrow.cuda', 'numba.cuda']
@@ -62,17 +61,19 @@ def test_context(c):
def make_random_buffer(size, target='host', dtype='uint8', ctx=None):
"""Return a host or device buffer with random data.
"""
- dtype = np.dtype(dtype)
+ assert np is not None
+ dtype_obj = np.dtype(dtype)
if target == 'host':
assert size >= 0
- buf = pa.allocate_buffer(size*dtype.itemsize)
- arr = np.frombuffer(buf, dtype=dtype)
+ buf = pa.allocate_buffer(size*dtype_obj.itemsize)
+ arr = np.frombuffer(buf, dtype=dtype_obj)
arr[:] = np.random.randint(low=0, high=255, size=size,
dtype=np.uint8)
return arr, buf
elif target == 'device':
arr, buf = make_random_buffer(size, target='host', dtype=dtype)
- dbuf = ctx.new_buffer(size * dtype.itemsize)
+ assert ctx is not None
+ dbuf = ctx.new_buffer(size * dtype_obj.itemsize)
dbuf.copy_from_host(buf, position=0, nbytes=buf.size)
return arr, dbuf
raise ValueError('invalid target value')
@@ -161,8 +162,8 @@ def __cuda_array_interface__(self):
ids=context_choice_ids)
@pytest.mark.parametrize("dtype", dtypes, ids=dtypes)
def test_numba_memalloc(c, dtype):
+ assert np is not None
ctx, nb_ctx = context_choices[c]
- dtype = np.dtype(dtype)
# Allocate memory using numba context
# Warning: this will not be reflected in pyarrow context manager
# (e.g bytes_allocated does not change)
@@ -198,6 +199,7 @@ def test_pyarrow_memalloc(c, dtype):
ids=context_choice_ids)
@pytest.mark.parametrize("dtype", dtypes, ids=dtypes)
def test_numba_context(c, dtype):
+ assert np is not None
ctx, nb_ctx = context_choices[c]
size = 10
with nb_cuda.gpus[0]:
@@ -209,7 +211,10 @@ def test_numba_context(c, dtype):
np.testing.assert_equal(darr.copy_to_host(), arr)
darr[0] = 99
cbuf.context.synchronize()
- arr2 = np.frombuffer(cbuf.copy_to_host(), dtype=dtype)
+ arr2 = np.frombuffer(
+ cbuf.copy_to_host(),
+ dtype=np.dtype(dtype)
+ )
assert arr2[0] == 99
@@ -217,6 +222,7 @@ def test_numba_context(c, dtype):
ids=context_choice_ids)
@pytest.mark.parametrize("dtype", dtypes, ids=dtypes)
def test_pyarrow_jit(c, dtype):
+ assert np is not None
ctx, nb_ctx = context_choices[c]
@nb_cuda.jit
@@ -234,5 +240,8 @@ def increment_by_one(an_array):
darr = DeviceNDArray(arr.shape, arr.strides, arr.dtype, gpu_data=mem)
increment_by_one[blockspergrid, threadsperblock](darr)
cbuf.context.synchronize()
- arr1 = np.frombuffer(cbuf.copy_to_host(), dtype=arr.dtype)
+ arr1 = np.frombuffer(
+ cbuf.copy_to_host(),
+ dtype=arr.dtype
+ )
np.testing.assert_equal(arr1, arr + 1)
diff --git a/python/pyarrow/tests/test_cython.py b/python/pyarrow/tests/test_cython.py
index a142e66db56..11ef01412a6 100644
--- a/python/pyarrow/tests/test_cython.py
+++ b/python/pyarrow/tests/test_cython.py
@@ -89,7 +89,7 @@ def test_cython_api(tmpdir):
Basic test for the Cython API.
"""
# Fail early if cython is not found
- import cython # noqa
+ import cython # type: ignore[import-untyped, import-not-found] # noqa
with tmpdir.as_cwd():
# Set up temporary workspace
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index d00c0c4b3eb..ce913612bad 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -32,7 +32,7 @@
try:
import numpy as np
except ImportError:
- np = None
+ pass
import pytest
import pyarrow as pa
@@ -40,6 +40,7 @@
import pyarrow.csv
import pyarrow.feather
import pyarrow.fs as fs
+from pyarrow.fs import FileInfo
import pyarrow.json
from pyarrow.lib import is_threading_enabled
from pyarrow.tests.util import (FSProtocolClass, ProxyHandler,
@@ -49,17 +50,17 @@
try:
import pandas as pd
except ImportError:
- pd = None
+ pass
try:
import pyarrow.dataset as ds
except ImportError:
- ds = None
+ pass
try:
import pyarrow.parquet as pq
except ImportError:
- pq = None
+ pass
# Marks all of the tests in this module
# Ignore these with pytest ... -m 'not dataset'
@@ -395,14 +396,16 @@ def test_filesystem_dataset(mockfs):
# validation of required arguments
with pytest.raises(TypeError, match="incorrect type"):
- ds.FileSystemDataset(fragments, file_format, schema)
+ ds.FileSystemDataset(fragments, file_format, schema) # type: ignore[arg-type]
# validation of root_partition
with pytest.raises(TypeError, match="incorrect type"):
- ds.FileSystemDataset(fragments, schema=schema,
- format=file_format, root_partition=1)
+ ds.FileSystemDataset(
+ fragments, schema=schema, format=file_format,
+ root_partition=1) # type: ignore[arg-type]
# missing required argument in from_paths
with pytest.raises(TypeError, match="incorrect type"):
- ds.FileSystemDataset.from_paths(fragments, format=file_format)
+ ds.FileSystemDataset.from_paths(
+ fragments, format=file_format) # type: ignore[arg-type]
def test_filesystem_dataset_no_filesystem_interaction(dataset_reader):
@@ -827,7 +830,8 @@ def test_partitioning():
load_back = None
with pytest.raises(ValueError,
match="Expected Partitioning or PartitioningFactory"):
- load_back = ds.dataset(tempdir, format='ipc', partitioning=int(0))
+ load_back = ds.dataset(
+ tempdir, format='ipc', partitioning=int(0)) # type: ignore[arg-type]
assert load_back is None
@@ -859,8 +863,8 @@ def test_partitioning_pickling(pickle_module):
)
def test_dataset_partitioning_format(
flavor: str,
- expected_defined_partition: tuple,
- expected_undefined_partition: tuple,
+ expected_defined_partition: tuple[str],
+ expected_undefined_partition: tuple[str],
):
partitioning_schema = pa.schema([("foo", pa.string()), ("bar", pa.string())])
@@ -1215,6 +1219,7 @@ def test_make_fragment(multisourcefs):
parquet_format = ds.ParquetFileFormat()
dataset = ds.dataset('/plain', filesystem=multisourcefs,
format=parquet_format)
+ assert isinstance(dataset, ds.FileSystemDataset)
for path in dataset.files:
fragment = parquet_format.make_fragment(path, multisourcefs)
@@ -1252,7 +1257,9 @@ def test_make_fragment_with_size(s3_example_simple):
assert tbl.equals(table)
# true sizes -> works
- sizes_true = [dataset.filesystem.get_file_info(x).size for x in dataset.files]
+ dataset_file_info = [dataset.filesystem.get_file_info(x) for x in dataset.files]
+ sizes_true = [x.size if isinstance(
+ x, FileInfo) else None for x in dataset_file_info]
fragments_with_size = [file_format.make_fragment(path, fs, file_size=size)
for path, size in zip(paths, sizes_true)]
dataset_with_size = ds.FileSystemDataset(
@@ -1943,6 +1950,7 @@ def test_fragments_repr(tempdir, dataset):
# single-file parquet dataset (no partition information in repr)
table, path = _create_single_file(tempdir)
dataset = ds.dataset(path, format="parquet")
+ assert isinstance(dataset, ds.FileSystemDataset)
fragment = list(dataset.get_fragments())[0]
assert (
repr(fragment) ==
@@ -1954,6 +1962,7 @@ def test_fragments_repr(tempdir, dataset):
path = tempdir / "data.feather"
pa.feather.write_feather(table, path)
dataset = ds.dataset(path, format="feather")
+ assert isinstance(dataset, ds.FileSystemDataset)
fragment = list(dataset.get_fragments())[0]
assert (
repr(fragment) ==
@@ -2065,7 +2074,7 @@ def test_partitioning_factory_segment_encoding(pickled, pickle_module):
actual = factory.finish().to_table(columns={
"date_int": ds.field("date").cast(pa.int64()),
})
- assert actual[0][0].as_py() == 1620086400
+ assert actual.column(0).chunk(0)[0].as_py() == 1620086400
partitioning_factory = ds.DirectoryPartitioning.discover(
["date", "string"], segment_encoding="none")
@@ -2105,7 +2114,7 @@ def test_partitioning_factory_segment_encoding(pickled, pickle_module):
actual = factory.finish().to_table(columns={
"date_int": ds.field("date").cast(pa.int64()),
})
- assert actual[0][0].as_py() == 1620086400
+ assert actual.column(0).chunk(0)[0].as_py() == 1620086400
partitioning_factory = ds.HivePartitioning.discover(
segment_encoding="none")
@@ -2173,7 +2182,7 @@ def test_partitioning_factory_hive_segment_encoding_key_encoded(pickled, pickle_
actual = factory.finish().to_table(columns={
"date_int": ds.field("test'; date").cast(pa.int64()),
})
- assert actual[0][0].as_py() == 1620086400
+ assert actual.column(0).chunk(0)[0].as_py() == 1620086400
partitioning_factory = ds.HivePartitioning.discover(
segment_encoding="uri")
@@ -2231,7 +2240,7 @@ def test_dictionary_partitioning_outer_nulls_raises(tempdir):
def test_positional_keywords_raises(tempdir):
table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']})
with pytest.raises(TypeError):
- ds.write_dataset(table, tempdir, "basename-{i}.arrow")
+ ds.write_dataset(table, tempdir, "basename-{i}.arrow") # type: ignore[arg-type]
@pytest.mark.parquet
@@ -2245,20 +2254,20 @@ def test_read_partition_keys_only(tempdir):
'key': pa.repeat(0, BATCH_SIZE + 1),
'value': np.arange(BATCH_SIZE + 1)})
pq.write_to_dataset(
- table[:BATCH_SIZE],
+ table[:BATCH_SIZE], # type: ignore[arg-type]
tempdir / 'one', partition_cols=['key'])
pq.write_to_dataset(
- table[:BATCH_SIZE + 1],
+ table[:BATCH_SIZE + 1], # type: ignore[arg-type]
tempdir / 'two', partition_cols=['key'])
table = pq.read_table(tempdir / 'one', columns=['key'])
- assert table['key'].num_chunks == 1
+ assert table.column('key').num_chunks == 1
table = pq.read_table(tempdir / 'two', columns=['key', 'value'])
- assert table['key'].num_chunks == 2
+ assert table.column('key').num_chunks == 2
table = pq.read_table(tempdir / 'two', columns=['key'])
- assert table['key'].num_chunks == 2
+ assert table.column('key').num_chunks == 2
def _has_subdirs(basedir):
@@ -2319,9 +2328,9 @@ def test_partitioning_function():
with pytest.raises(ValueError):
ds.partitioning()
with pytest.raises(ValueError, match="Expected list"):
- ds.partitioning(field_names=schema)
+ ds.partitioning(field_names=schema) # type: ignore[arg-type]
with pytest.raises(ValueError, match="Cannot specify both"):
- ds.partitioning(schema, field_names=schema)
+ ds.partitioning(schema, field_names=schema) # type: ignore[call-overload]
# Hive partitioning
part = ds.partitioning(schema, flavor="hive")
@@ -2332,13 +2341,13 @@ def test_partitioning_function():
assert isinstance(part, ds.PartitioningFactory)
# cannot pass list of names
with pytest.raises(ValueError):
- ds.partitioning(names, flavor="hive")
+ ds.partitioning(names, flavor="hive") # type: ignore[arg-type]
with pytest.raises(ValueError, match="Cannot specify 'field_names'"):
ds.partitioning(field_names=names, flavor="hive")
# unsupported flavor
with pytest.raises(ValueError):
- ds.partitioning(schema, flavor="unsupported")
+ ds.partitioning(schema, flavor="unsupported") # type: ignore[arg-type]
@pytest.mark.parquet
@@ -2353,6 +2362,8 @@ def test_directory_partitioning_dictionary_key(mockfs):
dataset = ds.dataset(
"subdir", format="parquet", filesystem=mockfs, partitioning=part
)
+ assert isinstance(dataset, ds.FileSystemDataset)
+ assert dataset.partitioning is not None
assert dataset.partitioning.schema == schema
table = dataset.to_table()
@@ -2373,6 +2384,8 @@ def test_hive_partitioning_dictionary_key(multisourcefs):
dataset = ds.dataset(
"hive", format="parquet", filesystem=multisourcefs, partitioning=part
)
+ assert isinstance(dataset, ds.FileSystemDataset)
+ assert dataset.partitioning is not None
assert dataset.partitioning.schema == schema
table = dataset.to_table()
@@ -2380,11 +2393,13 @@ def test_hive_partitioning_dictionary_key(multisourcefs):
month_dictionary = list(range(1, 13))
assert table.column('year').type.equals(schema.types[0])
for chunk in table.column('year').chunks:
+ assert isinstance(chunk, pa.DictionaryArray)
actual = chunk.dictionary.to_pylist()
actual.sort()
assert actual == year_dictionary
assert table.column('month').type.equals(schema.types[1])
for chunk in table.column('month').chunks:
+ assert isinstance(chunk, pa.DictionaryArray)
actual = chunk.dictionary.to_pylist()
actual.sort()
assert actual == month_dictionary
@@ -2574,6 +2589,8 @@ def test_construct_from_mixed_child_datasets(mockfs):
'subdir/2/yyy/file1.parquet'], filesystem=mockfs)
b = ds.dataset('subdir', filesystem=mockfs)
+ assert isinstance(a, ds.FileSystemDataset)
+ assert isinstance(b, ds.FileSystemDataset)
dataset = ds.dataset([a, b])
assert isinstance(dataset, ds.UnionDataset)
@@ -2585,8 +2602,8 @@ def test_construct_from_mixed_child_datasets(mockfs):
assert len(dataset.children) == 2
for child in dataset.children:
- assert child.files == ['subdir/1/xxx/file0.parquet',
- 'subdir/2/yyy/file1.parquet']
+ assert child.files == [ # type: ignore[attr-defined]
+ 'subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
def test_construct_empty_dataset():
@@ -2620,7 +2637,7 @@ def test_construct_from_invalid_sources_raise(multisourcefs):
batch2 = pa.RecordBatch.from_arrays([pa.array(range(10))], names=["b"])
with pytest.raises(TypeError, match='Expected.*FileSystemDatasetFactory'):
- ds.dataset([child1, child2])
+ ds.dataset([child1, child2]) # type: ignore[arg-type]
expected = (
"Expected a list of path-like or dataset objects, or a list "
@@ -2628,14 +2645,14 @@ def test_construct_from_invalid_sources_raise(multisourcefs):
"types: int"
)
with pytest.raises(TypeError, match=expected):
- ds.dataset([1, 2, 3])
+ ds.dataset([1, 2, 3]) # type: ignore[arg-type]
expected = (
"Expected a path-like, list of path-likes or a list of Datasets "
"instead of the given type: NoneType"
)
with pytest.raises(TypeError, match=expected):
- ds.dataset(None)
+ ds.dataset(None) # type: ignore[arg-type]
expected = (
"Expected a path-like, list of path-likes or a list of Datasets "
@@ -2662,7 +2679,7 @@ def test_construct_from_invalid_sources_raise(multisourcefs):
"batches or tables. The given list contains the following types:"
)
with pytest.raises(TypeError, match=expected):
- ds.dataset([batch1, 0])
+ ds.dataset([batch1, 0]) # type: ignore[arg-type]
expected = (
"Expected a list of tables or batches. The given list contains a int"
@@ -2752,7 +2769,7 @@ def test_open_dataset_partitioned_directory(tempdir, dataset_reader, pickle_modu
dataset = ds.dataset(
str(path),
partitioning=ds.partitioning(
- pa.schema([("part", pa.int8())]), flavor="hive"))
+ schema=pa.schema([("part", pa.int8())]), flavor="hive"))
expected_schema = table.schema.append(pa.field("part", pa.int8()))
assert dataset.schema.equals(expected_schema)
@@ -2797,7 +2814,7 @@ def test_open_union_dataset(tempdir, dataset_reader, pickle_module):
_, path = _create_single_file(tempdir)
dataset = ds.dataset(path)
- union = ds.dataset([dataset, dataset])
+ union = ds.dataset([dataset, dataset]) # type: ignore[arg-type]
assert isinstance(union, ds.UnionDataset)
pickled = pickle_module.loads(pickle_module.dumps(union))
@@ -2807,7 +2824,7 @@ def test_open_union_dataset(tempdir, dataset_reader, pickle_module):
def test_open_union_dataset_with_additional_kwargs(multisourcefs):
child = ds.dataset('/plain', filesystem=multisourcefs, format='parquet')
with pytest.raises(ValueError, match="cannot pass any additional"):
- ds.dataset([child], format="parquet")
+ ds.dataset([child], format="parquet") # type: ignore[arg-type]
def test_open_dataset_non_existing_file():
@@ -2894,7 +2911,7 @@ def expected_type(key):
def test_dataset_partitioned_dictionary_type_reconstruct(tempdir, pickle_module):
# https://issues.apache.org/jira/browse/ARROW-11400
table = pa.table({'part': np.repeat(['A', 'B'], 5), 'col': range(10)})
- part = ds.partitioning(table.select(['part']).schema, flavor="hive")
+ part = ds.partitioning(schema=table.select(['part']).schema, flavor="hive")
ds.write_dataset(table, tempdir, partitioning=part, format="feather")
dataset = ds.dataset(
@@ -2902,7 +2919,7 @@ def test_dataset_partitioned_dictionary_type_reconstruct(tempdir, pickle_module)
partitioning=ds.HivePartitioning.discover(infer_dictionary=True)
)
expected = pa.table(
- {'col': table['col'], 'part': table['part'].dictionary_encode()}
+ {'col': table.column('col'), 'part': table.column('part').dictionary_encode()}
)
assert dataset.to_table().equals(expected)
fragment = list(dataset.get_fragments())[0]
@@ -2987,7 +3004,7 @@ def test_open_dataset_from_uri_s3_fsspec(s3_example_simple):
assert dataset.to_table().equals(table)
# directly passing the fsspec-handler
- fs = PyFileSystem(FSSpecHandler(fs))
+ fs = PyFileSystem(FSSpecHandler(fs)) # type: ignore[abstract]
dataset = ds.dataset(path, format="parquet", filesystem=fs)
assert dataset.to_table().equals(table)
@@ -3089,7 +3106,7 @@ def test_file_format_inspect_fsspec(tempdir):
format = ds.ParquetFileFormat()
# manually creating a PyFileSystem instead of using fs._ensure_filesystem
# which would convert an fsspec local filesystem to a native one
- filesystem = fs.PyFileSystem(fs.FSSpecHandler(fsspec_fs))
+ filesystem = fs.PyFileSystem(fs.FSSpecHandler(fsspec_fs)) # type: ignore[abstract]
schema = format.inspect(path, filesystem)
assert schema.equals(table.schema)
@@ -3107,11 +3124,11 @@ def test_filter_timestamp(tempdir, dataset_reader):
"id": range(10)})
# write dataset partitioned on dates (as strings)
- part = ds.partitioning(table.select(['dates']).schema, flavor="hive")
+ part = ds.partitioning(schema=table.select(['dates']).schema, flavor="hive")
ds.write_dataset(table, path, partitioning=part, format="feather")
# read dataset partitioned on dates (as timestamps)
- part = ds.partitioning(pa.schema([("dates", pa.timestamp("s"))]),
+ part = ds.partitioning(schema=pa.schema([("dates", pa.timestamp("s"))]),
flavor="hive")
dataset = ds.dataset(path, format="feather", partitioning=part)
@@ -3162,7 +3179,7 @@ def test_filter_compute_expression(tempdir, dataset_reader):
filter_ = pc.is_in(ds.field('A'), pa.array(["a", "b"]))
assert dataset_reader.to_table(dataset, filter=filter_).num_rows == 3
- filter_ = pc.hour(ds.field('B')) >= 3
+ filter_ = pc.hour(ds.field('B')) >= 3 # type: ignore[operator]
assert dataset_reader.to_table(dataset, filter=filter_).num_rows == 2
days = pc.days_between(ds.field('B'), ds.field("C"))
@@ -3194,12 +3211,12 @@ def test_union_dataset_from_other_datasets(tempdir, multisourcefs):
assert child1.schema != child2.schema != child3.schema
- assembled = ds.dataset([child1, child2, child3])
+ assembled = ds.dataset([child1, child2, child3]) # type: ignore[arg-type]
assert isinstance(assembled, ds.UnionDataset)
msg = 'cannot pass any additional arguments'
with pytest.raises(ValueError, match=msg):
- ds.dataset([child1, child2], filesystem=multisourcefs)
+ ds.dataset([child1, child2], filesystem=multisourcefs) # type: ignore[arg-type]
expected_schema = pa.schema([
('date', pa.date32()),
@@ -3213,7 +3230,7 @@ def test_union_dataset_from_other_datasets(tempdir, multisourcefs):
assert assembled.schema.equals(expected_schema)
assert assembled.to_table().schema.equals(expected_schema)
- assembled = ds.dataset([child1, child3])
+ assembled = ds.dataset([child1, child3]) # type: ignore[arg-type]
expected_schema = pa.schema([
('date', pa.date32()),
('index', pa.int64()),
@@ -3230,6 +3247,7 @@ def test_union_dataset_from_other_datasets(tempdir, multisourcefs):
('color', pa.string()),
('date', pa.date32()),
])
- assembled = ds.dataset([child1, child3], schema=expected_schema)
+ assembled = ds.dataset(
+ [child1, child3], schema=expected_schema) # type: ignore[arg-type]
assert assembled.to_table().schema.equals(expected_schema)
@@ -3238,6 +3256,7 @@ def test_union_dataset_from_other_datasets(tempdir, multisourcefs):
('color', pa.string()),
('unknown', pa.string()) # fill with nulls
])
- assembled = ds.dataset([child1, child3], schema=expected_schema)
+ assembled = ds.dataset(
+ [child1, child3], schema=expected_schema) # type: ignore[arg-type]
assert assembled.to_table().schema.equals(expected_schema)
@@ -3248,7 +3267,7 @@ def test_union_dataset_from_other_datasets(tempdir, multisourcefs):
child4 = ds.dataset(path)
with pytest.raises(pa.ArrowTypeError, match='Unable to merge'):
- ds.dataset([child1, child4])
+ ds.dataset([child1, child4]) # type: ignore[arg-type]
def test_dataset_from_a_list_of_local_directories_raises(multisourcefs):
@@ -3259,7 +3278,7 @@ def test_dataset_from_a_list_of_local_directories_raises(multisourcefs):
def test_union_dataset_filesystem_datasets(multisourcefs):
# without partitioning
- dataset = ds.dataset([
+ dataset = ds.dataset([ # type: ignore[arg-type]
ds.dataset('/plain', filesystem=multisourcefs),
ds.dataset('/schema', filesystem=multisourcefs),
ds.dataset('/hive', filesystem=multisourcefs),
@@ -3273,7 +3292,7 @@ def test_union_dataset_filesystem_datasets(multisourcefs):
assert dataset.schema.equals(expected_schema)
# with hive partitioning for two hive sources
- dataset = ds.dataset([
+ dataset = ds.dataset([ # type: ignore[arg-type]
ds.dataset('/plain', filesystem=multisourcefs),
ds.dataset('/schema', filesystem=multisourcefs),
ds.dataset('/hive', filesystem=multisourcefs, partitioning='hive')
@@ -3333,7 +3352,7 @@ def _check_dataset(schema, expected, expected_schema=None):
# Specifying with differing field types
schema = pa.schema([('a', 'int32'), ('b', 'float64')])
dataset = ds.dataset(str(tempdir / "data.parquet"), schema=schema)
- expected = pa.table([table['a'].cast('int32'),
+ expected = pa.table([table['a'].cast('int32'), # type: ignore[arg-type]
table['b']],
names=['a', 'b'])
_check_dataset(schema, expected)
@@ -3834,7 +3853,7 @@ def test_parquet_dataset_factory_fsspec(tempdir):
fsspec_fs = fsspec.filesystem("file")
# manually creating a PyFileSystem, because passing the local fsspec
# filesystem would internally be converted to native LocalFileSystem
- filesystem = fs.PyFileSystem(fs.FSSpecHandler(fsspec_fs))
+ filesystem = fs.PyFileSystem(fs.FSSpecHandler(fsspec_fs)) # type: ignore[abstract]
dataset = ds.parquet_dataset(metadata_path, filesystem=filesystem)
assert dataset.schema.equals(table.schema)
assert len(dataset.files) == 4
@@ -4042,12 +4061,14 @@ def test_filter_mismatching_schema(tempdir, dataset_reader):
# filtering on a column with such type mismatch should implicitly
# cast the column
filtered = dataset_reader.to_table(dataset, filter=ds.field("col") > 2)
- assert filtered["col"].equals(table["col"].cast('int64').slice(2))
+ assert filtered["col"].equals(table["col"].cast(
+ 'int64').slice(2)) # type: ignore[arg-type]
fragment = list(dataset.get_fragments())[0]
filtered = dataset_reader.to_table(
fragment, filter=ds.field("col") > 2, schema=schema)
- assert filtered["col"].equals(table["col"].cast('int64').slice(2))
+ assert filtered["col"].equals(table["col"].cast(
+ 'int64').slice(2)) # type: ignore[arg-type]
@pytest.mark.parquet
@@ -4112,6 +4133,7 @@ def test_dataset_preserved_partitioning(tempdir):
# through discovery, but without partitioning
_, path = _create_single_file(tempdir)
dataset = ds.dataset(path)
+ assert isinstance(dataset, ds.FileSystemDataset)
assert isinstance(dataset.partitioning, ds.DirectoryPartitioning)
# TODO(GH-34884) partitioning attribute not preserved in pickling
# dataset_ = ds.dataset(path)
@@ -4121,10 +4143,12 @@ def test_dataset_preserved_partitioning(tempdir):
# through discovery, with hive partitioning but not specified
full_table, path = _create_partitioned_dataset(tempdir)
dataset = ds.dataset(path)
+ assert isinstance(dataset, ds.FileSystemDataset)
assert isinstance(dataset.partitioning, ds.DirectoryPartitioning)
# through discovery, with hive partitioning (from a partitioning factory)
dataset = ds.dataset(path, partitioning="hive")
+ assert isinstance(dataset, ds.FileSystemDataset)
part = dataset.partitioning
assert part is not None
assert isinstance(part, ds.HivePartitioning)
@@ -4133,11 +4157,12 @@ def test_dataset_preserved_partitioning(tempdir):
assert part.dictionaries[0] == pa.array([0, 1, 2], pa.int32())
# through discovery, with hive partitioning (from a partitioning object)
- part = ds.partitioning(pa.schema([("part", pa.int32())]), flavor="hive")
+ part = ds.partitioning(schema=pa.schema([("part", pa.int32())]), flavor="hive")
assert isinstance(part, ds.HivePartitioning) # not a factory
assert len(part.dictionaries) == 1
assert all(x is None for x in part.dictionaries)
dataset = ds.dataset(path, partitioning=part)
+ assert isinstance(dataset, ds.FileSystemDataset)
part = dataset.partitioning
assert isinstance(part, ds.HivePartitioning)
assert part.schema == pa.schema([("part", pa.int32())])
@@ -4147,6 +4172,7 @@ def test_dataset_preserved_partitioning(tempdir):
# through manual creation -> not available
dataset = ds.dataset(path, partitioning="hive")
+ assert isinstance(dataset, ds.FileSystemDataset)
dataset2 = ds.FileSystemDataset(
list(dataset.get_fragments()), schema=dataset.schema,
format=dataset.format, filesystem=dataset.filesystem
@@ -4192,7 +4218,7 @@ def _sort_table(tab, sort_col):
import pyarrow.compute as pc
sorted_indices = pc.sort_indices(
tab, options=pc.SortOptions([(sort_col, 'ascending')]))
- return pc.take(tab, sorted_indices)
+ return pc.take(tab, sorted_indices) # type: ignore[arg-type]
def _check_dataset_roundtrip(dataset, base_dir, expected_files, sort_col,
@@ -4265,7 +4291,7 @@ def test_write_dataset_partitioned(tempdir):
target / "part=b", target / "part=b" / "part-0.arrow"
]
partitioning_schema = ds.partitioning(
- pa.schema([("part", pa.string())]), flavor="hive")
+ schema=pa.schema([("part", pa.string())]), flavor="hive")
_check_dataset_roundtrip(
dataset, str(target), expected_paths, 'f1', target,
partitioning=partitioning_schema)
@@ -4277,7 +4303,7 @@ def test_write_dataset_partitioned(tempdir):
target / "b", target / "b" / "part-0.arrow"
]
partitioning_schema = ds.partitioning(
- pa.schema([("part", pa.string())]))
+ schema=pa.schema([("part", pa.string())]))
_check_dataset_roundtrip(
dataset, str(target), expected_paths, 'f1', target,
partitioning=partitioning_schema)
@@ -4290,6 +4316,7 @@ def test_write_dataset_with_field_names(tempdir):
partitioning=["b"])
load_back = ds.dataset(tempdir, format='ipc', partitioning=["b"])
+ assert isinstance(load_back, ds.FileSystemDataset)
files = load_back.files
partitioning_dirs = {
str(pathlib.Path(f).relative_to(tempdir).parent) for f in files
@@ -4307,6 +4334,7 @@ def test_write_dataset_with_field_names_hive(tempdir):
partitioning=["b"], partitioning_flavor="hive")
load_back = ds.dataset(tempdir, format='ipc', partitioning="hive")
+ assert isinstance(load_back, ds.FileSystemDataset)
files = load_back.files
partitioning_dirs = {
str(pathlib.Path(f).relative_to(tempdir).parent) for f in files
@@ -4624,7 +4652,7 @@ def test_write_dataset_max_open_files(tempdir):
record_batch_3, record_batch_4])
partitioning = ds.partitioning(
- pa.schema([(column_names[partition_column_id], pa.string())]),
+ schema=pa.schema([(column_names[partition_column_id], pa.string())]),
flavor="hive")
data_source_1 = directory / "default"
@@ -4638,7 +4666,8 @@ def test_write_dataset_max_open_files(tempdir):
def _get_compare_pair(data_source, record_batch, file_format, col_id):
num_of_files_generated = _get_num_of_files_generated(
base_directory=data_source, file_format=file_format)
- number_of_partitions = len(pa.compute.unique(record_batch[col_id]))
+ unique_vals = pa.compute.unique(record_batch[col_id])
+ number_of_partitions = len(unique_vals) # type: ignore[arg-type]
return num_of_files_generated, number_of_partitions
# CASE 1: when max_open_files=default & max_open_files >= num_of_partitions
@@ -4685,7 +4714,7 @@ def test_write_dataset_partitioned_dict(tempdir):
target / "a", target / "a" / "part-0.arrow",
target / "b", target / "b" / "part-0.arrow"
]
- partitioning = ds.partitioning(pa.schema([
+ partitioning = ds.partitioning(schema=pa.schema([
dataset.schema.field('part')]),
dictionaries={'part': pa.array(['a', 'b'])})
# NB: dictionaries required here since we use partitioning to parse
@@ -4704,7 +4733,7 @@ def test_write_dataset_use_threads(tempdir):
dataset = ds.dataset(directory, partitioning="hive")
partitioning = ds.partitioning(
- pa.schema([("part", pa.string())]), flavor="hive")
+ schema=pa.schema([("part", pa.string())]), flavor="hive")
target1 = tempdir / 'partitioned1'
paths_written = []
@@ -4744,7 +4773,7 @@ def test_write_dataset_use_threads_preserve_order(tempdir):
batches = table.to_batches(max_chunksize=2)
ds.write_dataset(batches, tempdir, format="parquet",
use_threads=True, preserve_order=True)
- seq = ds.dataset(tempdir).to_table(use_threads=False)['a'].to_numpy()
+ seq = ds.dataset(tempdir).to_table(use_threads=False).column('a').to_numpy()
prev = -1
for item in seq:
curr = int(item)
@@ -4784,7 +4813,7 @@ def file_visitor(written_file):
visited_sizes.append(written_file.size)
partitioning = ds.partitioning(
- pa.schema([("part", pa.string())]), flavor="hive")
+ schema=pa.schema([("part", pa.string())]), flavor="hive")
ds.write_dataset(table, base_dir, format="feather",
basename_template='dat_{i}.arrow',
partitioning=partitioning, file_visitor=file_visitor)
@@ -4896,7 +4925,7 @@ def test_write_table_partitioned_dict(tempdir):
pa.array(['a'] * 10 + ['b'] * 10).dictionary_encode(),
], names=['col', 'part'])
- partitioning = ds.partitioning(table.select(["part"]).schema)
+ partitioning = ds.partitioning(schema=table.select(["part"]).schema)
base_dir = tempdir / "dataset"
ds.write_dataset(
@@ -4917,8 +4946,7 @@ def test_write_table_partitioned_dict(tempdir):
def test_write_dataset_parquet(tempdir):
table = pa.table([
pa.array(range(20), type="uint32"),
- pa.array(np.arange("2012-01-01", 20, dtype="datetime64[D]").astype(
- "datetime64[ns]")),
+ pa.array(pd.date_range("2012-01-01", periods=20, freq='D').values.astype(
+ "datetime64[ns]")),
pa.array(np.repeat(['a', 'b'], 10))
], names=["f1", "f2", "part"])
@@ -5014,7 +5042,7 @@ def test_partition_dataset_parquet_file_visitor(tempdir):
root_path = tempdir / 'partitioned'
partitioning = ds.partitioning(
- pa.schema([("part", pa.string())]), flavor="hive")
+ schema=pa.schema([("part", pa.string())]), flavor="hive")
paths_written = []
@@ -5047,11 +5075,11 @@ def test_write_dataset_arrow_schema_metadata(tempdir):
# ensure we serialize ARROW schema in the parquet metadata, to have a
# correct roundtrip (e.g. preserve non-UTC timezone)
table = pa.table({"a": [pd.Timestamp("2012-01-01", tz="Europe/Brussels")]})
- assert table["a"].type.tz == "Europe/Brussels"
+ assert table.column("a").type.tz == "Europe/Brussels"
ds.write_dataset(table, tempdir, format="parquet")
result = pq.read_table(tempdir / "part-0.parquet")
- assert result["a"].type.tz == "Europe/Brussels"
+ assert result.column("a").type.tz == "Europe/Brussels"
def test_write_dataset_schema_metadata(tempdir):
@@ -5092,7 +5120,7 @@ def test_write_dataset_s3(s3_example_simple):
pa.array(['a'] * 10 + ['b'] * 10)],
names=["f1", "f2", "part"]
)
- part = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive")
+ part = ds.partitioning(schema=pa.schema([("part", pa.string())]), flavor="hive")
# writing with filesystem object
ds.write_dataset(
@@ -5171,7 +5199,7 @@ def test_write_dataset_s3_put_only(s3_server):
pa.array(['a']*10 + ['b'] * 10)],
names=["f1", "f2", "part"]
)
- part = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive")
+ part = ds.partitioning(schema=pa.schema([("part", pa.string())]), flavor="hive")
# writing with filesystem object with create_dir flag set to false
ds.write_dataset(
@@ -5549,7 +5577,7 @@ def test_union_dataset_filter(tempdir, dstype):
else:
raise NotImplementedError
- filtered_union_ds = ds.dataset((ds1, ds2)).filter(
+ filtered_union_ds = ds.dataset((ds1, ds2)).filter( # type: ignore[arg-type]
(pc.field("colA") < 3) | (pc.field("colA") == 9)
)
assert filtered_union_ds.to_table() == pa.table({
@@ -5571,7 +5599,7 @@ def test_union_dataset_filter(tempdir, dstype):
filtered_ds2 = ds2.filter(pc.field("colA") < 10)
with pytest.raises(ValueError, match="currently not supported"):
- ds.dataset((filtered_ds1, filtered_ds2))
+ ds.dataset((filtered_ds1, filtered_ds2)) # type: ignore[arg-type]
def test_parquet_dataset_filter(tempdir):
@@ -5672,8 +5700,9 @@ def test_dataset_partition_with_slash(tmpdir):
assert dt_table == read_table.sort_by("exp_id")
exp_meta = dt_table.column(1).to_pylist()
- exp_meta = sorted(set(exp_meta)) # take unique
- encoded_paths = ["exp_meta=" + quote(path, safe='') for path in exp_meta]
+ exp_meta = sorted(set(exp_meta), key=lambda x: (
+ x is None, x)) # take unique, handle None
+ encoded_paths = ["exp_meta=" + quote(str(path), safe='') for path in exp_meta]
file_paths = sorted(os.listdir(path))
assert encoded_paths == file_paths
@@ -5756,6 +5785,7 @@ def test_write_dataset_write_page_index(tempdir):
)
ds1 = ds.dataset(base_dir, format="parquet")
+ assert isinstance(ds1, ds.FileSystemDataset)
for file in ds1.files:
# Can retrieve sorting columns from metadata
metadata = pq.read_metadata(file)
@@ -5898,13 +5928,13 @@ def test_make_write_options_error():
"'pyarrow._dataset_parquet.ParquetFileFormat' objects "
"doesn't apply to a 'int'")
with pytest.raises(TypeError) as excinfo:
- pa.dataset.ParquetFileFormat.make_write_options(43)
+ pa.dataset.ParquetFileFormat.make_write_options(43) # type: ignore
assert msg_1 in str(excinfo.value) or msg_2 in str(excinfo.value)
pformat = pa.dataset.ParquetFileFormat()
msg = "make_write_options\\(\\) takes exactly 0 positional arguments"
with pytest.raises(TypeError, match=msg):
- pformat.make_write_options(43)
+ pformat.make_write_options(43) # type: ignore
def test_scanner_from_substrait(dataset):
diff --git a/python/pyarrow/tests/test_dataset_encryption.py b/python/pyarrow/tests/test_dataset_encryption.py
index 0ef3931a4cf..3d658352372 100644
--- a/python/pyarrow/tests/test_dataset_encryption.py
+++ b/python/pyarrow/tests/test_dataset_encryption.py
@@ -30,8 +30,8 @@
import pyarrow.parquet as pq
import pyarrow.dataset as ds
except ImportError:
- pq = None
- ds = None
+ pq = None # type: ignore[assignment]
+ ds = None # type: ignore[assignment]
try:
from pyarrow.tests.parquet.encryption import InMemoryKmsClient
@@ -85,7 +85,7 @@ def create_encryption_config(footer_key=FOOTER_KEY_NAME, column_keys=COLUMN_KEYS
def create_decryption_config():
- return pe.DecryptionConfiguration(cache_lifetime=300)
+ return pe.DecryptionConfiguration(cache_lifetime=timedelta(seconds=300))
def create_kms_connection_config(keys=KEYS):
@@ -135,6 +135,8 @@ def assert_decrypts(
encrypt_kms_connection_config = create_kms_connection_config(write_keys)
decrypt_kms_connection_config = create_kms_connection_config(read_keys)
+ assert ds is not None
+ assert pe is not None
crypto_factory = pe.CryptoFactory(kms_factory)
parquet_encryption_cfg = ds.ParquetEncryptionConfig(
crypto_factory, encrypt_kms_connection_config, encryption_config
@@ -370,11 +372,12 @@ def test_large_row_encryption_decryption():
"""Test encryption and decryption of a large number of rows."""
class NoOpKmsClient(pe.KmsClient):
- def wrap_key(self, key_bytes: bytes, _: str) -> bytes:
+ def wrap_key(self, key_bytes: bytes, _: str) -> bytes: # type: ignore[override]
b = base64.b64encode(key_bytes)
return b
- def unwrap_key(self, wrapped_key: bytes, _: str) -> bytes:
+ def unwrap_key(self, wrapped_key: bytes, _: str # type: ignore[override]
+ ) -> bytes:
b = base64.b64decode(wrapped_key)
return b
@@ -395,6 +398,9 @@ def unwrap_key(self, wrapped_key: bytes, _: str) -> bytes:
plaintext_footer=False,
data_key_length_bits=128,
)
+ assert ds is not None
+ assert pe is not None
+ assert pq is not None
pqe_config = ds.ParquetEncryptionConfig(
crypto_factory, kms_config, encryption_config
)
@@ -429,6 +435,9 @@ def unwrap_key(self, wrapped_key: bytes, _: str) -> bytes:
encryption_unavailable, reason="Parquet Encryption is not currently enabled"
)
def test_dataset_encryption_with_selected_column_statistics():
+ assert ds is not None
+ assert pq is not None
+
table = create_sample_table()
encryption_config = create_encryption_config()
@@ -472,7 +481,7 @@ def test_dataset_encryption_with_selected_column_statistics():
for fragment in dataset.get_fragments():
decryption_properties = crypto_factory.file_decryption_properties(
- kms_connection_config, decryption_config, fragment.path, mockfs)
+ kms_connection_config, decryption_config,
+ fragment.path, mockfs) # type: ignore[call-arg]
with pq.ParquetFile(
fragment.path,
decryption_properties=decryption_properties,
@@ -481,12 +490,14 @@ def test_dataset_encryption_with_selected_column_statistics():
for rg_idx in range(parquet_file.metadata.num_row_groups):
row_group = parquet_file.metadata.row_group(rg_idx)
- assert row_group.column(0).statistics is not None
- assert row_group.column(0).statistics.min == 2019
- assert row_group.column(0).statistics.max == 2022
+ stats0 = row_group.column(0).statistics
+ assert stats0 is not None
+ assert stats0.min == 2019
+ assert stats0.max == 2022
- assert row_group.column(1).statistics is not None
- assert row_group.column(1).statistics.min == 2
- assert row_group.column(1).statistics.max == 100
+ stats1 = row_group.column(1).statistics
+ assert stats1 is not None
+ assert stats1.min == 2
+ assert stats1.max == 100
assert row_group.column(2).statistics is None
diff --git a/python/pyarrow/tests/test_device.py b/python/pyarrow/tests/test_device.py
index dc1a51e6d00..00f8bbf720d 100644
--- a/python/pyarrow/tests/test_device.py
+++ b/python/pyarrow/tests/test_device.py
@@ -59,11 +59,15 @@ def test_copy_to():
batch_copied = batch.copy_to(dest)
assert batch_copied.equals(batch)
- assert batch_copied["col"].buffers()[1].device == mm.device
- assert batch_copied["col"].buffers()[1].address != arr.buffers()[1].address
+ buffer = batch_copied.column("col").buffers()[1]
+ assert buffer is not None
+ assert buffer.device == mm.device
+ buffer_orig = arr.buffers()[1]
+ assert buffer_orig is not None
+ assert buffer.address != buffer_orig.address
with pytest.raises(TypeError, match="Argument 'destination' has incorrect type"):
- arr.copy_to(mm.device.device_type)
+ arr.copy_to(mm.device.device_type) # type: ignore[arg-type]
with pytest.raises(TypeError, match="Argument 'destination' has incorrect type"):
- batch.copy_to(mm.device.device_type)
+ batch.copy_to(mm.device.device_type) # type: ignore[arg-type]
diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py
index ebac37e862b..941e73c8167 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -22,12 +22,13 @@
import weakref
from uuid import uuid4, UUID
import sys
+from typing import cast
import pytest
try:
import numpy as np
except ImportError:
- np = None
+ pass
import pyarrow as pa
from pyarrow.vendored.version import Version
@@ -79,12 +80,14 @@ def __init__(self):
def __arrow_ext_serialize__(self):
# XXX pa.BaseExtensionType should expose C++ serialization method
+ assert isinstance(self.storage_type, IntegerType)
return self.storage_type.__arrow_ext_serialize__()
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized):
+ assert isinstance(storage_type, IntegerType)
deserialized_storage_type = storage_type.__arrow_ext_deserialize__(
- serialized)
+ storage_type, serialized)
assert deserialized_storage_type == storage_type
return cls()
@@ -160,7 +163,7 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized):
class MyStructType(pa.ExtensionType):
- storage_type = pa.struct([('left', pa.int64()),
+ storage_type = pa.struct([('left', pa.int64()), # type: ignore[assignment]
('right', pa.int64())])
def __init__(self):
@@ -221,7 +224,7 @@ def __arrow_ext_serialize__(self):
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized):
assert serialized == b''
- return cls(storage_type)
+ return cls(storage_type, annotation=None)
def ipc_write_batch(batch):
@@ -432,8 +435,8 @@ def test_ext_array_wrap_array():
arr.validate(full=True)
assert isinstance(arr, pa.ChunkedArray)
assert arr.type == ty
- assert arr.chunk(0).storage == storage.chunk(0)
- assert arr.chunk(1).storage == storage.chunk(1)
+ assert arr.chunk(0).storage == storage.chunk(0) # type: ignore[union-attr]
+ assert arr.chunk(1).storage == storage.chunk(1) # type: ignore[union-attr]
# Wrong storage type
storage = pa.array([b"foo", b"bar", None])
@@ -442,7 +445,7 @@ def test_ext_array_wrap_array():
# Not an array or chunked array
with pytest.raises(TypeError, match="Expected array or chunked array"):
- ty.wrap_array(None)
+ ty.wrap_array(None) # type: ignore[arg-type]
def test_ext_scalar_from_array():
@@ -876,7 +879,7 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized):
def __eq__(self, other):
if isinstance(other, pa.BaseExtensionType):
return (isinstance(self, type(other)) and
- self.freq == other.freq)
+ self.freq == other.freq) # type: ignore[attr-defined]
else:
return NotImplemented
@@ -902,7 +905,7 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized):
storage_type, serialized).freq
return PeriodTypeWithToPandasDtype(freq)
- def to_pandas_dtype(self):
+ def to_pandas_dtype(self): # type: ignore[override]
import pandas as pd
return pd.PeriodDtype(freq=self.freq)
@@ -1033,7 +1036,7 @@ def test_generic_ext_array_pickling(registered_period_type, pickle_module):
def test_generic_ext_type_register(registered_period_type):
# test that trying to register other type does not segfault
with pytest.raises(TypeError):
- pa.register_extension_type(pa.string())
+ pa.register_extension_type(pa.string()) # type: ignore[arg-type]
# register second time raises KeyError
period_type = PeriodType('D')
@@ -1058,11 +1061,13 @@ def test_parquet_period(tmpdir, registered_period_type):
# in the serialized arrow schema
meta = pq.read_metadata(filename)
assert meta.schema.column(0).physical_type == "INT64"
+ assert meta.metadata is not None
assert b"ARROW:schema" in meta.metadata
import base64
decoded_schema = base64.b64decode(meta.metadata[b"ARROW:schema"])
- schema = pa.ipc.read_schema(pa.BufferReader(decoded_schema))
+ schema = pa.ipc.read_schema(pa.BufferReader(
+ decoded_schema))
# Since the type could be reconstructed, the extension type metadata is
# absent.
assert schema.field("ext").metadata == {}
@@ -1434,6 +1439,7 @@ def test_tensor_class_methods(np_type_str):
storage = pa.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
pa.list_(arrow_type, 6))
arr = pa.ExtensionArray.from_storage(tensor_type, storage)
+ arr = cast(pa.FixedShapeTensorArray, arr)
expected = np.array(
[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
dtype=np.dtype(np_type_str)
@@ -1442,7 +1448,7 @@ def test_tensor_class_methods(np_type_str):
np.testing.assert_array_equal(arr.to_numpy_ndarray(), expected)
expected = np.array([[[7, 8, 9], [10, 11, 12]]], dtype=np.dtype(np_type_str))
- result = arr[1:].to_numpy_ndarray()
+ result = arr[1:].to_numpy_ndarray() # type: ignore[union-attr]
np.testing.assert_array_equal(result, expected)
values = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]
@@ -1452,35 +1458,43 @@ def test_tensor_class_methods(np_type_str):
tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0, 1, 2])
result = pa.ExtensionArray.from_storage(tensor_type, storage)
+ result = cast(pa.FixedShapeTensorArray, result)
expected = np.array(
[[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]],
dtype=np.dtype(np_type_str)
)
np.testing.assert_array_equal(result.to_numpy_ndarray(), expected)
- result = flat_arr.reshape(1, 2, 3, 2)
+ result_reshaped = flat_arr.reshape(1, 2, 3, 2)
expected = np.array(
[[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]],
dtype=np.dtype(np_type_str)
)
- np.testing.assert_array_equal(result, expected)
+ np.testing.assert_array_equal(result_reshaped, expected)
tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0, 2, 1])
result = pa.ExtensionArray.from_storage(tensor_type, storage)
+ result = cast(pa.FixedShapeTensorArray, result)
expected = as_strided(flat_arr, shape=(1, 2, 3, 2),
strides=(bw * 12, bw * 6, bw, bw * 3))
np.testing.assert_array_equal(result.to_numpy_ndarray(), expected)
tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[2, 0, 1])
- result = pa.ExtensionArray.from_storage(tensor_type, storage)
+ result = pa.ExtensionArray.from_storage(
+ tensor_type, storage) # type: ignore[assignment]
expected = as_strided(flat_arr, shape=(1, 3, 2, 2),
strides=(bw * 12, bw, bw * 6, bw * 2))
- np.testing.assert_array_equal(result.to_numpy_ndarray(), expected)
-
- assert result.type.permutation == [2, 0, 1]
- assert result.type.shape == [2, 2, 3]
+ np.testing.assert_array_equal(
+ result.to_numpy_ndarray(), expected) # type: ignore[union-attr]
+
+ result_type = result.type
+ assert isinstance(result, pa.FixedShapeTensorArray)
+ assert isinstance(result_type, pa.FixedShapeTensorType)
+ assert result_type.permutation == [2, 0, 1]
+ assert result_type.shape == [2, 2, 3]
assert result.to_tensor().shape == (1, 3, 2, 2)
- assert result.to_tensor().strides == (12 * bw, 1 * bw, 6 * bw, 2 * bw)
+ assert result.to_tensor().strides == (12 * bw, 1 * bw, 6 * bw,
+ 2 * bw)
@pytest.mark.numpy
@@ -1508,17 +1522,23 @@ def test_tensor_array_from_numpy(np_type_str):
arr = flat_arr.reshape(1, 3, 4)
tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
- assert tensor_array_from_numpy.type.shape == [3, 4]
- assert tensor_array_from_numpy.type.permutation == [0, 1]
- assert tensor_array_from_numpy.type.dim_names is None
+ result_type = tensor_array_from_numpy.type
+ assert isinstance(tensor_array_from_numpy, pa.FixedShapeTensorArray)
+ assert isinstance(result_type, pa.FixedShapeTensorType)
+ assert result_type.shape == [3, 4]
+ assert result_type.permutation == [0, 1]
+ assert result_type.dim_names is None
assert tensor_array_from_numpy.to_tensor() == pa.Tensor.from_numpy(arr)
arr = as_strided(flat_arr, shape=(1, 2, 3, 2),
strides=(bw * 12, bw * 6, bw, bw * 3))
tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
- assert tensor_array_from_numpy.type.shape == [2, 2, 3]
- assert tensor_array_from_numpy.type.permutation == [0, 2, 1]
- assert tensor_array_from_numpy.type.dim_names is None
+ result_type = tensor_array_from_numpy.type
+ assert isinstance(tensor_array_from_numpy, pa.FixedShapeTensorArray)
+ assert isinstance(result_type, pa.FixedShapeTensorType)
+ assert result_type.shape == [2, 2, 3]
+ assert result_type.permutation == [0, 2, 1]
+ assert result_type.dim_names is None
assert tensor_array_from_numpy.to_tensor() == pa.Tensor.from_numpy(arr)
arr = flat_arr.reshape(1, 2, 3, 2)
@@ -1532,7 +1552,8 @@ def test_tensor_array_from_numpy(np_type_str):
arr = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
dtype=np.dtype(np_type_str))
expected = arr[1:]
- result = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)[1:].to_numpy_ndarray()
+ result = cast(pa.FixedShapeTensorArray, pa.FixedShapeTensorArray.from_numpy_ndarray(
+ arr)[1:]).to_numpy_ndarray()
np.testing.assert_array_equal(result, expected)
arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=np.dtype(np_type_str))
@@ -1559,22 +1580,27 @@ def test_tensor_array_from_numpy(np_type_str):
dim_names = ["a", "b"]
tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(
arr, dim_names=dim_names)
- assert tensor_array_from_numpy.type.value_type == arrow_type
- assert tensor_array_from_numpy.type.shape == [2, 3]
- assert tensor_array_from_numpy.type.dim_names == dim_names
+ result_type = tensor_array_from_numpy.type
+ assert isinstance(tensor_array_from_numpy, pa.FixedShapeTensorArray)
+ assert isinstance(result_type, pa.FixedShapeTensorType)
+ assert result_type.value_type == arrow_type
+ assert result_type.shape == [2, 3]
+ assert result_type.dim_names == dim_names
with pytest.raises(ValueError, match="The length of dim_names"):
pa.FixedShapeTensorArray.from_numpy_ndarray(arr, dim_names=['only_one'])
with pytest.raises(TypeError, match="dim_names must be a tuple or list"):
- pa.FixedShapeTensorArray.from_numpy_ndarray(arr, dim_names=123)
+ pa.FixedShapeTensorArray.from_numpy_ndarray(
+ arr, dim_names=123) # type: ignore[arg-type]
with pytest.raises(TypeError, match="dim_names must be a tuple or list"):
pa.FixedShapeTensorArray.from_numpy_ndarray(
- arr, dim_names=(x for x in range(2)))
+ arr, dim_names=(x for x in range(2))) # type: ignore[arg-type]
with pytest.raises(TypeError, match="Each element of dim_names must be a string"):
- pa.FixedShapeTensorArray.from_numpy_ndarray(arr, dim_names=[0, 1])
+ pa.FixedShapeTensorArray.from_numpy_ndarray(
+ arr, dim_names=[0, 1]) # type: ignore[arg-type]
@pytest.mark.numpy
@@ -1845,14 +1871,18 @@ def test_bool8_to_numpy_conversion():
assert np.array_equal(arr_to_np, np_arr_no_nulls)
# same underlying buffer
- assert arr_to_np.ctypes.data == arr_no_nulls.buffers()[1].address
+ buffer = arr_no_nulls.buffers()[1]
+ assert buffer is not None
+ assert arr_to_np.ctypes.data == buffer.address
# if the user requests a writable array, a copy should be performed
arr_to_np_writable = arr_no_nulls.to_numpy(zero_copy_only=False, writable=True)
assert np.array_equal(arr_to_np_writable, np_arr_no_nulls)
# different underlying buffer
- assert arr_to_np_writable.ctypes.data != arr_no_nulls.buffers()[1].address
+ buffer = arr_no_nulls.buffers()[1]
+ assert buffer is not None
+ assert arr_to_np_writable.ctypes.data != buffer.address
@pytest.mark.numpy
@@ -1867,7 +1897,9 @@ def test_bool8_from_numpy_conversion():
assert arr_from_np == canonical_bool8_arr_no_nulls
# same underlying buffer
- assert arr_from_np.buffers()[1].address == np_arr_no_nulls.ctypes.data
+ buffer = arr_from_np.buffers()[1]
+ assert buffer is not None
+ assert buffer.address == np_arr_no_nulls.ctypes.data
# conversion only valid for 1-D arrays
with pytest.raises(
@@ -1882,7 +1914,7 @@ def test_bool8_from_numpy_conversion():
ValueError,
match="Cannot convert 0-D array to bool8 array",
):
- pa.Bool8Array.from_numpy(np.bool_())
+ pa.Bool8Array.from_numpy(np.bool_(False)) # type: ignore[arg-type]
# must use compatible storage type
with pytest.raises(
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index 054bf920b26..a84b343b3dd 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -26,7 +26,7 @@
try:
import numpy as np
except ImportError:
- np = None
+ pass
import pyarrow as pa
import pyarrow.tests.strategies as past
@@ -47,7 +47,7 @@ def datadir(base_datadir):
def random_path(prefix='feather_'):
- return tempfile.mktemp(prefix=prefix)
+ return tempfile.mktemp(prefix=prefix) # type: ignore[deprecated]
@pytest.fixture(scope="module", params=[1, 2])
@@ -63,7 +63,7 @@ def compression(request):
yield request.param
-TEST_FILES = None
+TEST_FILES: list[str] | None = None
def setup_module(module):
@@ -72,7 +72,7 @@ def setup_module(module):
def teardown_module(module):
- for path in TEST_FILES:
+ for path in TEST_FILES: # type: ignore[union-attr]
try:
os.remove(path)
except os.error:
@@ -95,6 +95,7 @@ def _check_pandas_roundtrip(df, expected=None, path=None,
if version is None:
version = 2
+ assert TEST_FILES is not None
TEST_FILES.append(path)
write_feather(df, path, compression=compression,
compression_level=compression_level, version=version)
@@ -114,6 +115,7 @@ def _check_arrow_roundtrip(table, path=None, compression=None):
if path is None:
path = random_path()
+ assert TEST_FILES is not None
TEST_FILES.append(path)
write_feather(table, path, compression=compression)
if not os.path.exists(path):
@@ -126,10 +128,12 @@ def _check_arrow_roundtrip(table, path=None, compression=None):
def _assert_error_on_write(df, exc, path=None, version=2):
# check that we are raising the exception
# on writing
+ assert version in (1, 2)
if path is None:
path = random_path()
+ assert TEST_FILES is not None
TEST_FILES.append(path)
def f():
@@ -149,6 +153,7 @@ def test_dataset(version):
}
table = pa.table(data)
+ assert TEST_FILES is not None
TEST_FILES.extend(paths)
for index, path in enumerate(paths):
rows = (
@@ -156,7 +161,8 @@ def test_dataset(version):
(index + 1) * (num_values[0] // num_files),
)
- write_feather(table[rows[0]: rows[1]], path, version=version)
+ write_feather(table[rows[0]: rows[1]], path,
+ version=version) # type: ignore[arg-type]
data = FeatherDataset(paths).read_table()
assert data.equals(table)
@@ -181,6 +187,7 @@ def test_read_table(version):
num_values = (100, 100)
path = random_path()
+ assert TEST_FILES is not None
TEST_FILES.append(path)
values = np.random.randint(0, 100, size=num_values)
@@ -206,6 +213,7 @@ def test_use_threads(version):
num_values = (10, 10)
path = random_path()
+ assert TEST_FILES is not None
TEST_FILES.append(path)
values = np.random.randint(0, 10, size=num_values)
@@ -231,6 +239,7 @@ def test_float_nulls(version):
num_values = 100
path = random_path()
+ assert TEST_FILES is not None
TEST_FILES.append(path)
null_mask = np.random.randint(0, 10, size=num_values) < 3
@@ -292,6 +301,7 @@ def test_platform_numpy_integers(version):
def test_integer_with_nulls(version):
# pandas requires upcast to float dtype
path = random_path()
+ assert TEST_FILES is not None
TEST_FILES.append(path)
int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
@@ -330,6 +340,7 @@ def test_boolean_no_nulls(version):
def test_boolean_nulls(version):
# pandas requires upcast to object dtype
path = random_path()
+ assert TEST_FILES is not None
TEST_FILES.append(path)
num_values = 100
@@ -348,6 +359,7 @@ def test_boolean_nulls(version):
def test_buffer_bounds_error(version):
# ARROW-1676
path = random_path()
+ assert TEST_FILES is not None
TEST_FILES.append(path)
for i in range(16, 256):
@@ -360,6 +372,7 @@ def test_buffer_bounds_error(version):
@pytest.mark.numpy
def test_boolean_object_nulls(version):
+ assert np is not None
repeats = 100
table = pa.Table.from_arrays(
[np.array([False, None, True] * repeats, dtype=object)],
@@ -426,7 +439,8 @@ def test_empty_strings(version):
@pytest.mark.pandas
def test_all_none(version):
df = pd.DataFrame({'all_none': [None] * 10})
- if version == 1 and pa.pandas_compat._pandas_api.uses_string_dtype():
+ if (version == 1 and pa.pandas_compat # type: ignore[attr-defined]
+ ._pandas_api.uses_string_dtype()):
expected = df.astype("str")
else:
expected = df
@@ -552,6 +566,7 @@ def test_read_columns(version):
@pytest.mark.numpy
def test_overwritten_file(version):
path = random_path()
+ assert TEST_FILES is not None
TEST_FILES.append(path)
num_values = 100
@@ -585,12 +600,12 @@ def test_filelike_objects(version):
@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning")
def test_sparse_dataframe(version):
- if not pa.pandas_compat._pandas_api.has_sparse:
+ if not pa.pandas_compat._pandas_api.has_sparse: # type: ignore[attr-defined]
pytest.skip("version of pandas does not support SparseDataFrame")
# GH #221
data = {'A': [0, 1, 2],
'B': [1, 0, 1]}
- df = pd.DataFrame(data).to_sparse(fill_value=1)
+ df = pd.DataFrame(data).to_sparse(fill_value=1) # type: ignore[attr-defined]
expected = df.to_dense()
_check_pandas_roundtrip(df, expected, version=version)
@@ -692,8 +707,9 @@ def test_v2_lz4_default_compression():
if not pa.Codec.is_available('lz4_frame'):
pytest.skip("LZ4 compression support is not built in C++")
+ assert np is not None
# some highly compressible data
- t = pa.table([np.repeat(0, 100000)], names=['f0'])
+ t = pa.table([np.repeat(0, 100000)], names=['f0']) # type: ignore[arg-type]
buf = io.BytesIO()
write_feather(t, buf)
diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py
index 9e7bb312398..1294e681be4 100644
--- a/python/pyarrow/tests/test_flight.py
+++ b/python/pyarrow/tests/test_flight.py
@@ -28,19 +28,21 @@
import traceback
import json
from datetime import datetime
+from typing import Any
try:
import numpy as np
except ImportError:
- np = None
+ pass
import pytest
import pyarrow as pa
from pyarrow.lib import IpcReadOptions, ReadStats, tobytes
from pyarrow.util import find_free_port
from pyarrow.tests import util
+from typing import TYPE_CHECKING
-try:
+if TYPE_CHECKING:
from pyarrow import flight
from pyarrow.flight import (
FlightClient, FlightServerBase,
@@ -49,13 +51,26 @@
ClientMiddleware, ClientMiddlewareFactory,
FlightCallOptions,
)
-except ImportError:
- flight = None
- FlightClient, FlightServerBase = object, object
- ServerAuthHandler, ClientAuthHandler = object, object
- ServerMiddleware, ServerMiddlewareFactory = object, object
- ClientMiddleware, ClientMiddlewareFactory = object, object
- FlightCallOptions = object
+else:
+ try:
+ from pyarrow import flight
+ from pyarrow.flight import (
+ FlightClient, FlightServerBase,
+ ServerAuthHandler, ClientAuthHandler,
+ ServerMiddleware, ServerMiddlewareFactory,
+ ClientMiddleware, ClientMiddlewareFactory,
+ FlightCallOptions,
+ )
+ except ImportError:
+ flight = None # type: ignore[assignment]
+ FlightClient, FlightServerBase = object, object
+ ServerAuthHandler, ClientAuthHandler = ( # type: ignore[misc]
+ object, object) # type: ignore[assignment]
+ ServerMiddleware, ServerMiddlewareFactory = ( # type: ignore[misc]
+ object, object) # type: ignore[assignment]
+ ClientMiddleware, ClientMiddlewareFactory = ( # type: ignore[misc]
+ object, object) # type: ignore[assignment]
+        FlightCallOptions = object  # type: ignore[assignment, misc]
# Marks all of the tests in this module
# Ignore these with pytest ... -m 'not flight'
@@ -196,7 +211,7 @@ def do_put(self, context, descriptor, reader, writer):
assert buf is not None
client_counter, = struct.unpack(' 0
key = 'arrow-datasets/nyc-taxi/year=2019/month=6/part-0.parquet'
with fs.open_input_stream(key) as f:
@@ -1931,6 +1940,8 @@ def test_s3_real_aws_region_selection():
# Taken from a registry of open S3-hosted datasets
# at https://github.com/awslabs/open-data-registry
fs, path = FileSystem.from_uri('s3://mf-nwp-models/README.txt')
+ from pyarrow.fs import S3FileSystem
+ assert isinstance(fs, S3FileSystem)
assert fs.region == 'eu-west-1'
with fs.open_input_stream(path) as f:
assert b"Meteo-France Atmospheric models on AWS" in f.read(50)
@@ -1938,6 +1949,8 @@ def test_s3_real_aws_region_selection():
# Passing an explicit region disables auto-selection
fs, path = FileSystem.from_uri(
's3://mf-nwp-models/README.txt?region=us-east-2')
+ from pyarrow.fs import S3FileSystem
+ assert isinstance(fs, S3FileSystem)
assert fs.region == 'us-east-2'
# Reading from the wrong region may still work for public buckets...
@@ -1948,6 +1961,8 @@ def test_s3_real_aws_region_selection():
with pytest.raises(IOError, match="Bucket '.*' not found"):
FileSystem.from_uri('s3://x-arrow-nonexistent-bucket')
fs, path = FileSystem.from_uri('s3://x-arrow-nonexistent-bucket?region=us-east-3')
+ from pyarrow.fs import S3FileSystem
+ assert isinstance(fs, S3FileSystem)
assert fs.region == 'us-east-3'
# allow_delayed_open has a side-effect of delaying errors until I/O is performed.
@@ -2188,13 +2203,16 @@ def test_uwsgi_integration():
def test_fsspec_filesystem_from_uri():
try:
- from fsspec.implementations.local import LocalFileSystem
- from fsspec.implementations.memory import MemoryFileSystem
+ from fsspec.implementations.local import ( # type: ignore[import-untyped]
+ LocalFileSystem)
+ from fsspec.implementations.memory import ( # type: ignore[import-untyped]
+ MemoryFileSystem)
except ImportError:
pytest.skip("fsspec not installed")
fs, path = FileSystem.from_uri("fsspec+memory://path/to/data.parquet")
- expected_fs = PyFileSystem(FSSpecHandler(MemoryFileSystem()))
+ expected_fs = PyFileSystem(FSSpecHandler(
+ MemoryFileSystem())) # type: ignore[abstract]
assert fs == expected_fs
assert path == "/path/to/data.parquet"
@@ -2202,7 +2220,8 @@ def test_fsspec_filesystem_from_uri():
# arrow local filesystem
uri = "file:///tmp/my.file"
fs, _ = FileSystem.from_uri(f"fsspec+{uri}")
- expected_fs = PyFileSystem(FSSpecHandler(LocalFileSystem()))
+ expected_fs = PyFileSystem(FSSpecHandler(
+ LocalFileSystem())) # type: ignore[abstract]
assert fs == expected_fs
@@ -2212,7 +2231,7 @@ def test_fsspec_delete_root_dir_contents():
except ImportError:
pytest.skip("fsspec not installed")
- fs = FSSpecHandler(MemoryFileSystem())
+ fs = FSSpecHandler(MemoryFileSystem()) # type: ignore[abstract]
# Create some files and directories
fs.create_dir("test_dir", recursive=True)
@@ -2226,7 +2245,7 @@ def test_fsspec_delete_root_dir_contents():
# Verify files exist before deletion
def get_type(path):
- return fs.get_file_info([path])[0].type
+ return cast(list[FileInfo], fs.get_file_info([path]))[0].type
assert get_type("test_file.txt") == FileType.File
assert get_type("test_dir") == FileType.Directory
@@ -2244,13 +2263,13 @@ def get_type(path):
def test_huggingface_filesystem_from_uri():
pytest.importorskip("fsspec")
try:
- from huggingface_hub import HfFileSystem
+ from huggingface_hub import HfFileSystem # type: ignore[import-not-found]
except ImportError:
pytest.skip("huggingface_hub not installed")
fs, path = FileSystem.from_uri(
"hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet"
)
- expected_fs = PyFileSystem(FSSpecHandler(HfFileSystem()))
+ expected_fs = PyFileSystem(FSSpecHandler(HfFileSystem())) # type: ignore[abstract]
assert fs == expected_fs
assert path == "datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet"
diff --git a/python/pyarrow/tests/test_gandiva.py b/python/pyarrow/tests/test_gandiva.py
index 80d119a4853..01fc6f032d5 100644
--- a/python/pyarrow/tests/test_gandiva.py
+++ b/python/pyarrow/tests/test_gandiva.py
@@ -174,9 +174,12 @@ def test_in_expr_todo():
assert result.to_array().equals(pa.array([1, 2], type=pa.uint32()))
# timestamp
- datetime_1 = datetime.datetime.utcfromtimestamp(1542238951.621877)
- datetime_2 = datetime.datetime.utcfromtimestamp(1542238911.621877)
- datetime_3 = datetime.datetime.utcfromtimestamp(1542238051.621877)
+ datetime_1 = datetime.datetime.fromtimestamp(
+ 1542238951.621877, tz=datetime.timezone.utc)
+ datetime_2 = datetime.datetime.fromtimestamp(
+ 1542238911.621877, tz=datetime.timezone.utc)
+ datetime_3 = datetime.datetime.fromtimestamp(
+ 1542238051.621877, tz=datetime.timezone.utc)
arr = pa.array([datetime_1, datetime_2, datetime_3])
table = pa.Table.from_arrays([arr], ["a"])
diff --git a/python/pyarrow/tests/test_gdb.py b/python/pyarrow/tests/test_gdb.py
index 912953ae60d..50d81b686ac 100644
--- a/python/pyarrow/tests/test_gdb.py
+++ b/python/pyarrow/tests/test_gdb.py
@@ -101,6 +101,8 @@ def wait_until_ready(self):
Record output until the gdb prompt displays. Return recorded output.
"""
# TODO: add timeout?
+ assert self.proc is not None
+ assert self.proc.stdout is not None
while (not self.last_stdout_line.startswith(b"(gdb) ") and
self.proc.poll() is None):
block = self.proc.stdout.read(4096)
@@ -125,6 +127,8 @@ def wait_until_ready(self):
return out
def issue_command(self, line):
+ assert self.proc is not None
+ assert self.proc.stdin is not None
line = line.encode('utf-8') + b"\n"
if self.verbose:
sys.stdout.buffer.write(line)
@@ -158,6 +162,7 @@ def select_frame(self, func_name):
m = re.search(pat, out)
if m is None:
pytest.fail(f"Could not select frame for function {func_name}")
+ return # Never reached, but helps type checker
frame_num = int(m[1])
out = self.run_command(f"frame {frame_num}")
@@ -165,6 +170,8 @@ def select_frame(self, func_name):
def join(self):
if self.proc is not None:
+ assert self.proc.stdin is not None
+ assert self.proc.stdout is not None
self.proc.stdin.close()
self.proc.stdout.close() # avoid ResourceWarning
self.proc.kill()
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index a6d3546e57c..3837b553b8b 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -24,16 +24,17 @@
import math
import os
import pathlib
-import pytest
+import pytest # type: ignore[import-not-found]
import random
import sys
import tempfile
+from typing import cast
import weakref
try:
import numpy as np
except ImportError:
- np = None
+ pass
from pyarrow.util import guid
from pyarrow import Codec
@@ -44,7 +45,7 @@ def check_large_seeks(file_factory, expected_error=None):
if sys.platform in ('win32', 'darwin', 'emscripten'):
pytest.skip("need sparse file support")
try:
- filename = tempfile.mktemp(prefix='test_io')
+    fd, filename = tempfile.mkstemp(prefix='test_io')
+    os.close(fd)  # close the descriptor from mkstemp; only the path is needed
with open(filename, 'wb') as f:
f.truncate(2 ** 32 + 10)
f.seek(2 ** 32 + 5)
@@ -234,7 +235,7 @@ def read_buffer(self, nbytes):
return memoryview(dst_buf)[:nbytes]
duck_reader = DuckReader()
- with pa.PythonFile(duck_reader, mode='r') as f:
+ with pa.PythonFile(duck_reader, mode='r') as f: # type: ignore[arg-type]
buf = f.read_buffer(length)
assert len(buf) == length
assert memoryview(buf).tobytes() == dst_buf[:length]
@@ -474,7 +475,7 @@ def test_buffer_to_numpy():
byte_array = bytearray(20)
byte_array[0] = 42
buf = pa.py_buffer(byte_array)
- array = np.frombuffer(buf, dtype="uint8")
+ array = np.frombuffer(buf, dtype="uint8") # type: ignore[arg-type]
assert array[0] == byte_array[0]
byte_array[0] += 1
assert array[0] == byte_array[0]
@@ -557,7 +558,7 @@ def test_buffer_eq_bytes():
assert buf != b'some dat1'
with pytest.raises(TypeError):
- buf == 'some data'
+ _ = buf == 'some data'
def test_buffer_getitem():
@@ -598,22 +599,22 @@ def test_buffer_slicing():
with pytest.raises(IndexError):
buf.slice(len(buf) + 1)
- assert buf[11:].to_pybytes() == b""
+ assert cast(pa.Buffer, buf[11:]).to_pybytes() == b""
# Slice stop exceeds buffer length
with pytest.raises(IndexError):
buf.slice(1, len(buf))
- assert buf[1:11].to_pybytes() == buf.to_pybytes()[1:]
+ assert cast(pa.Buffer, buf[1:11]).to_pybytes() == buf.to_pybytes()[1:]
# Negative length
with pytest.raises(IndexError):
buf.slice(1, -1)
# Test slice notation
- assert buf[2:].equals(buf.slice(2))
- assert buf[2:5].equals(buf.slice(2, 3))
- assert buf[-5:].equals(buf.slice(len(buf) - 5))
- assert buf[-5:-2].equals(buf.slice(len(buf) - 5, 3))
+ assert cast(pa.Buffer, buf[2:]).equals(buf.slice(2))
+ assert cast(pa.Buffer, buf[2:5]).equals(buf.slice(2, 3))
+ assert cast(pa.Buffer, buf[-5:]).equals(buf.slice(len(buf) - 5))
+ assert cast(pa.Buffer, buf[-5:-2]).equals(buf.slice(len(buf) - 5, 3))
with pytest.raises(IndexError):
buf[::-1]
@@ -623,7 +624,8 @@ def test_buffer_slicing():
n = len(buf)
for start in range(-n * 2, n * 2):
for stop in range(-n * 2, n * 2):
- assert buf[start:stop].to_pybytes() == buf.to_pybytes()[start:stop]
+            assert (cast(pa.Buffer, buf[start:stop]).to_pybytes()
+                    == buf.to_pybytes()[start:stop])
def test_buffer_hashing():
@@ -640,7 +642,7 @@ def test_buffer_protocol_respects_immutability():
# immutable
a = b'12345'
arrow_ref = pa.py_buffer(a)
- numpy_ref = np.frombuffer(arrow_ref, dtype=np.uint8)
+ numpy_ref = np.frombuffer(arrow_ref, dtype=np.uint8) # type: ignore[arg-type]
assert not numpy_ref.flags.writeable
@@ -652,7 +654,8 @@ def test_foreign_buffer():
buf = pa.foreign_buffer(addr, size, obj)
wr = weakref.ref(obj)
del obj
- assert np.frombuffer(buf, dtype=np.int32).tolist() == [1, 2]
+ assert (np.frombuffer(buf, dtype=np.int32).tolist() # type: ignore[arg-type]
+ == [1, 2])
assert wr() is not None
del buf
assert wr() is None
@@ -688,6 +691,7 @@ def test_non_cpu_buffer(pickle_module):
cuda_buf = ctx.buffer_from_data(data)
arr = pa.FixedSizeBinaryArray.from_buffers(pa.binary(7), 1, [None, cuda_buf])
buf_on_gpu = arr.buffers()[1]
+ assert buf_on_gpu is not None
assert buf_on_gpu.size == cuda_buf.size
assert buf_on_gpu.address == cuda_buf.address
@@ -708,7 +712,7 @@ def test_non_cpu_buffer(pickle_module):
assert cuda_sliced.to_pybytes() == b'st'
# Sliced buffers with same address
- assert buf_on_gpu_sliced.equals(cuda_buf[2:4])
+ assert cast(pa.Buffer, buf_on_gpu_sliced).equals(cuda_buf[2:4])
# Buffers on different devices
msg_device = "Device on which the data resides differs between buffers"
@@ -720,13 +724,14 @@ def test_non_cpu_buffer(pickle_module):
arr_short = np.array([b'sting'])
cuda_buf_short = ctx.buffer_from_data(arr_short)
with pytest.raises(NotImplementedError, match=msg):
- buf_on_gpu_sliced.equals(cuda_buf_short)
+ cast(pa.Buffer, buf_on_gpu_sliced).equals(cuda_buf_short)
arr_short = pa.FixedSizeBinaryArray.from_buffers(
pa.binary(5), 1, [None, cuda_buf_short]
)
buf_on_gpu_short = arr_short.buffers()[1]
+ assert buf_on_gpu_short is not None
with pytest.raises(NotImplementedError, match=msg):
- buf_on_gpu_sliced.equals(buf_on_gpu_short)
+ cast(pa.Buffer, buf_on_gpu_sliced).equals(buf_on_gpu_short)
with pytest.raises(NotImplementedError, match=msg):
buf_on_gpu.hex()
@@ -811,8 +816,9 @@ def test_cache_options_pickling(pickle_module):
@pytest.mark.numpy
@pytest.mark.parametrize("compression", [
- pytest.param(
- "bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError)
+ pytest.param("bz2", marks=pytest.mark.xfail(
+ raises=pa.lib.ArrowNotImplementedError # type: ignore[attr-defined]
+ )
),
"brotli",
"gzip",
@@ -843,6 +849,7 @@ def test_compress_decompress(compression):
assert isinstance(decompressed_bytes, bytes)
+ assert isinstance(decompressed_buf, pa.Buffer)
assert decompressed_buf.equals(test_buf)
assert decompressed_bytes == test_data
@@ -852,8 +859,9 @@ def test_compress_decompress(compression):
@pytest.mark.numpy
@pytest.mark.parametrize("compression", [
- pytest.param(
- "bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError)
+ pytest.param("bz2", marks=pytest.mark.xfail(
+ raises=pa.lib.ArrowNotImplementedError # type: ignore[attr-defined]
+ )
),
"brotli",
"gzip",
@@ -910,6 +918,7 @@ def test_compression_level(compression):
assert isinstance(decompressed_bytes, bytes)
+ assert isinstance(decompressed_buf, pa.Buffer)
assert decompressed_buf.equals(test_buf)
assert decompressed_bytes == test_data
@@ -951,12 +960,12 @@ def test_buffer_memoryview_is_immutable():
assert result.readonly
with pytest.raises(TypeError) as exc:
- result[0] = b'h'
+ result[0] = b'h' # type: ignore[index]
assert 'cannot modify read-only' in str(exc.value)
b = bytes(buf)
with pytest.raises(TypeError) as exc:
- b[0] = b'h'
+ b[0] = b'h' # type: ignore[index]
assert 'cannot modify read-only' in str(exc.value)
@@ -1748,9 +1757,9 @@ def test_unknown_compression_raises():
"gzip",
"lz4",
"zstd",
- pytest.param(
- "snappy",
- marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError)
+ pytest.param("snappy", marks=pytest.mark.xfail(
+ raises=pa.lib.ArrowNotImplementedError # type: ignore[attr-defined]
+ )
)
])
def test_compressed_roundtrip(compression):
@@ -2021,7 +2030,7 @@ def test_input_stream_native_file():
def test_input_stream_errors(tmpdir):
buf = memoryview(b"")
with pytest.raises(ValueError):
- pa.input_stream(buf, compression="foo")
+ pa.input_stream(buf, compression="foo") # type: ignore[reportArgumentType]
for arg in [bytearray(), StringIO()]:
with pytest.raises(TypeError):
@@ -2198,7 +2207,7 @@ def check_data(data, **kwargs):
def test_output_stream_errors(tmpdir):
buf = memoryview(bytearray())
with pytest.raises(ValueError):
- pa.output_stream(buf, compression="foo")
+ pa.output_stream(buf, compression="foo") # type: ignore[reportArgumentType]
for arg in [bytearray(), StringIO()]:
with pytest.raises(TypeError):
diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py
index b4db9cd0875..0a096041bae 100644
--- a/python/pyarrow/tests/test_ipc.py
+++ b/python/pyarrow/tests/test_ipc.py
@@ -24,23 +24,27 @@
import socket
import threading
import weakref
+from typing import TYPE_CHECKING, cast
-try:
+if TYPE_CHECKING:
import numpy as np
-except ImportError:
- np = None
+ import pandas as pd
+ from pandas.testing import assert_frame_equal
+else:
+ try:
+ import numpy as np
+ except ImportError:
+ pass
+ try:
+ from pandas.testing import assert_frame_equal
+ import pandas as pd
+ except ImportError:
+ pass
import pyarrow as pa
from pyarrow.tests.util import changed_environ, invoke_script
-try:
- from pandas.testing import assert_frame_equal
- import pandas as pd
-except ImportError:
- pass
-
-
class IpcFixture:
write_stats = None
@@ -48,6 +52,9 @@ def __init__(self, sink_factory=lambda: io.BytesIO()):
self._sink_factory = sink_factory
self.sink = self.get_sink()
+ def _get_writer(self, sink, schema):
+ ... # Implemented in subclasses
+
def get_sink(self):
return self._sink_factory()
@@ -59,6 +66,7 @@ def write_batches(self, num_batches=5, as_table=False):
schema = pa.schema([('one', pa.float64()), ('two', pa.utf8())])
writer = self._get_writer(self.sink, schema)
+ assert writer is not None
batches = []
for i in range(num_batches):
@@ -385,7 +393,8 @@ def test_stream_write_table_batches(stream_fixture):
'one': np.random.randn(20),
})
- b1 = pa.RecordBatch.from_pandas(df[:10], preserve_index=False)
+ b1 = pa.RecordBatch.from_pandas(
+ df[:10], preserve_index=False) # type: ignore[arg-type]
b2 = pa.RecordBatch.from_pandas(df, preserve_index=False)
table = pa.Table.from_batches([b1, b2, b1])
@@ -929,7 +938,7 @@ def test_ipc_file_stream_has_eos():
buffer = sink.getvalue()
# skip the file magic
- reader = pa.ipc.open_stream(buffer[8:])
+ reader = pa.ipc.open_stream(cast(pa.Buffer, buffer[8:]))
# will fail if encounters footer data instead of eos
rdf = reader.read_pandas()
@@ -968,7 +977,8 @@ def test_batches_with_custom_metadata_roundtrip(ipc_type):
with file_factory(sink, batch.schema) as writer:
for i in range(batch_count):
- writer.write_batch(batch, custom_metadata={"batch_id": str(i)})
+ writer.write_batch(batch, custom_metadata={ # type: ignore[arg-type]
+ "batch_id": str(i)})
# write a batch without custom metadata
writer.write_batch(batch)
diff --git a/python/pyarrow/tests/test_json.py b/python/pyarrow/tests/test_json.py
index c3f9fe333bd..c0b6b8ecd0d 100644
--- a/python/pyarrow/tests/test_json.py
+++ b/python/pyarrow/tests/test_json.py
@@ -23,11 +23,16 @@
import json
import string
import unittest
+from typing import TYPE_CHECKING
-try:
+if TYPE_CHECKING:
import numpy as np
-except ImportError:
- np = None
+else:
+ try:
+ import numpy as np
+ except ImportError:
+ pass
+
import pytest
import pyarrow as pa
@@ -317,6 +322,9 @@ def test_stress_block_sizes(self):
class BaseTestJSONRead(BaseTestJSON):
+ def read_json(self, *args, **kwargs) -> pa.Table: # type: ignore[empty-body]
+ ... # Implemented in subclasses
+
def read_bytes(self, b, **kwargs):
return self.read_json(pa.py_buffer(b), **kwargs)
@@ -352,6 +360,8 @@ def test_reconcile_across_blocks(self):
class BaseTestStreamingJSONRead(BaseTestJSON):
+ use_threads: bool = False # Set by subclasses
+
def open_json(self, json, *args, **kwargs):
"""
Reads the JSON file into memory using pyarrow's open_json
diff --git a/python/pyarrow/tests/test_jvm.py b/python/pyarrow/tests/test_jvm.py
index d2ba780efc7..b5d4e74f126 100644
--- a/python/pyarrow/tests/test_jvm.py
+++ b/python/pyarrow/tests/test_jvm.py
@@ -38,11 +38,13 @@ def root_allocator():
arrow_dir = os.path.join(os.path.dirname(__file__), '..', '..', '..')
pom_path = os.path.join(arrow_dir, 'java', 'pom.xml')
tree = ET.parse(pom_path)
- version = tree.getroot().find(
+ version_element = tree.getroot().find(
'POM:version',
namespaces={
'POM': 'http://maven.apache.org/POM/4.0.0'
- }).text
+ })
+ assert version_element is not None
+ version = version_element.text
jar_path = os.path.join(
arrow_dir, 'java', 'tools', 'target',
f'arrow-tools-{version}-jar-with-dependencies.jar')
@@ -76,8 +78,8 @@ def test_jvm_buffer(root_allocator):
def test_jvm_buffer_released(root_allocator):
- import jpype.imports # noqa
- from java.lang import IllegalArgumentException
+ import jpype.imports # type: ignore[import-untyped, import-not-found] # noqa
+ from java.lang import IllegalArgumentException # type: ignore[import-not-found]
jvm_buffer = root_allocator.buffer(8)
jvm_buffer.release()
diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py
index 27154a6f34f..d0e61d758cb 100644
--- a/python/pyarrow/tests/test_orc.py
+++ b/python/pyarrow/tests/test_orc.py
@@ -77,7 +77,7 @@ def fix_example_values(actual_cols, expected_cols):
if not pd.isnull(v):
exp = d.as_tuple().exponent
factor = 10 ** -exp
- converted_decimals[i] = (
+ converted_decimals[i] = ( # type: ignore[call-overload,assignment]
decimal.Decimal(round(v * factor)).scaleb(exp))
expected = pd.Series(converted_decimals)
@@ -314,7 +314,7 @@ def test_buffer_readwrite():
# deprecated keyword order
buffer_output_stream = pa.BufferOutputStream()
with pytest.warns(FutureWarning):
- orc.write_table(buffer_output_stream, table)
+ orc.write_table(buffer_output_stream, table) # type: ignore[arg-type]
buffer_reader = pa.BufferReader(buffer_output_stream.getvalue())
orc_file = orc.ORCFile(buffer_reader)
output_table = orc_file.read()
@@ -356,8 +356,8 @@ def test_buffer_readwrite_with_writeoptions():
buffer_output_stream = pa.BufferOutputStream()
with pytest.warns(FutureWarning):
orc.write_table(
- buffer_output_stream,
- table,
+ buffer_output_stream, # type: ignore[reportArgumentType]
+ table, # type: ignore[reportArgumentType]
compression='uncompressed',
file_version='0.11',
row_index_stride=20000,
@@ -444,20 +444,20 @@ def test_buffer_readwrite_with_bad_writeoptions():
orc.write_table(
table,
buffer_output_stream,
- compression=0,
+ compression=0, # type: ignore[reportArgumentType]
)
with pytest.raises(ValueError):
orc.write_table(
table,
buffer_output_stream,
- compression='none',
+ compression='none', # type: ignore[reportArgumentType]
)
with pytest.raises(ValueError):
orc.write_table(
table,
buffer_output_stream,
- compression='zlid',
+ compression='zlid', # type: ignore[reportArgumentType]
)
# compression_block_size must be a positive integer
@@ -487,20 +487,20 @@ def test_buffer_readwrite_with_bad_writeoptions():
orc.write_table(
table,
buffer_output_stream,
- compression_strategy=0,
+ compression_strategy=0, # type: ignore[reportArgumentType]
)
with pytest.raises(ValueError):
orc.write_table(
table,
buffer_output_stream,
- compression_strategy='no',
+ compression_strategy='no', # type: ignore[reportArgumentType]
)
with pytest.raises(ValueError):
orc.write_table(
table,
buffer_output_stream,
- compression_strategy='large',
+ compression_strategy='large', # type: ignore[reportArgumentType]
)
# row_index_stride must be a positive integer
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 7f9b04eaabd..b151ef4a80b 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -28,37 +28,34 @@
import hypothesis as h
import hypothesis.strategies as st
import pytest
-try:
- import numpy as np
- import numpy.testing as npt
- try:
- _np_VisibleDeprecationWarning = np.VisibleDeprecationWarning
- except AttributeError:
- from numpy.exceptions import (
- VisibleDeprecationWarning as _np_VisibleDeprecationWarning
- )
-except ImportError:
- np = None
+import pyarrow as pa
from pyarrow.pandas_compat import get_logical_type, _pandas_api
from pyarrow.tests.util import invoke_script, random_ascii, rands
import pyarrow.tests.strategies as past
import pyarrow.tests.util as test_util
from pyarrow.vendored.version import Version
-import pyarrow as pa
try:
from pyarrow import parquet as pq
except ImportError:
pass
-try:
- import pandas as pd
- import pandas.testing as tm
- from .pandas_examples import dataframe_with_arrays, dataframe_with_lists
-except ImportError:
- pass
+pd = pytest.importorskip("pandas")
+np = pytest.importorskip("numpy")
+
+import numpy.testing as npt # noqa: E402
+import pandas.testing as tm # noqa: E402
+from .pandas_examples import dataframe_with_arrays, dataframe_with_lists # noqa: E402
+try:
+ _np_VisibleDeprecationWarning = (
+ np.VisibleDeprecationWarning # type: ignore[attr-defined]
+ )
+except AttributeError:
+ from numpy.exceptions import (
+ VisibleDeprecationWarning as _np_VisibleDeprecationWarning
+ )
# Marks all of the tests in this module
pytestmark = pytest.mark.pandas
@@ -77,14 +74,10 @@ def _alltypes_example(size=100):
'float32': np.arange(size, dtype=np.float32),
'float64': np.arange(size, dtype=np.float64),
'bool': np.random.randn(size) > 0,
- 'datetime[s]': np.arange("2016-01-01T00:00:00.001", size,
- dtype='datetime64[s]'),
- 'datetime[ms]': np.arange("2016-01-01T00:00:00.001", size,
- dtype='datetime64[ms]'),
- 'datetime[us]': np.arange("2016-01-01T00:00:00.001", size,
- dtype='datetime64[us]'),
- 'datetime[ns]': np.arange("2016-01-01T00:00:00.001", size,
- dtype='datetime64[ns]'),
+        'datetime[s]': pd.date_range(
+            "2016-01-01T00:00:00.001", periods=size, freq='s').values,
+        'datetime[ms]': pd.date_range(
+            "2016-01-01T00:00:00.001", periods=size, freq='ms').values,
+        'datetime[us]': pd.date_range(
+            "2016-01-01T00:00:00.001", periods=size, freq='us').values,
+        'datetime[ns]': pd.date_range(
+            "2016-01-01T00:00:00.001", periods=size, freq='ns').values,
'timedelta64[s]': np.arange(0, size, dtype='timedelta64[s]'),
'timedelta64[ms]': np.arange(0, size, dtype='timedelta64[ms]'),
'timedelta64[us]': np.arange(0, size, dtype='timedelta64[us]'),
@@ -98,7 +91,7 @@ def _alltypes_example(size=100):
def _check_pandas_roundtrip(df, expected=None, use_threads=False,
expected_schema=None,
check_dtype=True, schema=None,
- preserve_index=False,
+ preserve_index: bool | None = False,
as_batch=False):
klass = pa.RecordBatch if as_batch else pa.Table
table = klass.from_pandas(df, schema=schema,
@@ -714,7 +707,7 @@ def test_mismatch_metadata_schema(self):
# OPTION 1: casting after conversion
table = pa.Table.from_pandas(df)
# cast the "datetime" column to be tz-aware
- new_col = table["datetime"].cast(pa.timestamp('ns', tz="UTC"))
+ new_col = table.column(0).cast(pa.timestamp('ns', tz="UTC"))
new_table1 = table.set_column(
0, pa.field("datetime", new_col.type), new_col
)
@@ -982,7 +975,7 @@ def test_float_with_null_as_integer(self):
schema = pa.schema([pa.field('has_nulls', ty)])
result = pa.Table.from_pandas(df, schema=schema,
preserve_index=False)
- assert result[0].chunk(0).equals(expected)
+ assert result.column(0).chunk(0).equals(expected)
def test_int_object_nulls(self):
arr = np.array([None, 1, np.int64(3)] * 5, dtype=object)
@@ -1144,7 +1137,7 @@ def test_python_datetime(self):
})
table = pa.Table.from_pandas(df)
- assert isinstance(table[0].chunk(0), pa.TimestampArray)
+ assert isinstance(table.column(0).chunk(0), pa.TimestampArray)
result = table.to_pandas()
# Pandas v2 defaults to [ns], but Arrow defaults to [us] time units
@@ -1201,7 +1194,7 @@ class MyDatetime(datetime):
df = pd.DataFrame({"datetime": pd.Series(date_array, dtype=object)})
table = pa.Table.from_pandas(df)
- assert isinstance(table[0].chunk(0), pa.TimestampArray)
+ assert isinstance(table.column(0).chunk(0), pa.TimestampArray)
result = table.to_pandas()
@@ -1225,7 +1218,7 @@ class MyDate(date):
df = pd.DataFrame({"date": pd.Series(date_array, dtype=object)})
table = pa.Table.from_pandas(df)
- assert isinstance(table[0].chunk(0), pa.Date32Array)
+ assert isinstance(table.column(0).chunk(0), pa.Date32Array)
result = table.to_pandas()
expected_df = pd.DataFrame(
@@ -1737,7 +1730,7 @@ def test_bytes_to_binary(self):
df = pd.DataFrame({'strings': values})
table = pa.Table.from_pandas(df)
- assert table[0].type == pa.binary()
+ assert table.column(0).type == pa.binary()
values2 = [b'qux', b'foo', None, b'barz', b'qux', None]
expected = pd.DataFrame({'strings': values2})
@@ -1758,7 +1751,7 @@ def test_bytes_exceed_2gb(self):
arr = None
table = pa.Table.from_pandas(df)
- assert table[0].num_chunks == 2
+ assert table.column(0).num_chunks == 2
@pytest.mark.large_memory
@pytest.mark.parametrize('char', ['x', b'x'])
@@ -1900,13 +1893,13 @@ def test_table_str_to_categorical_without_na(self, string_type):
zero_copy_only=True)
# chunked array
- result = table["strings"].to_pandas(strings_to_categorical=True)
+ result = table.column("strings").to_pandas(strings_to_categorical=True)
expected = pd.Series(pd.Categorical(values), name="strings")
tm.assert_series_equal(result, expected)
with pytest.raises(pa.ArrowInvalid):
- table["strings"].to_pandas(strings_to_categorical=True,
- zero_copy_only=True)
+ table.column("strings").to_pandas(strings_to_categorical=True,
+ zero_copy_only=True)
@pytest.mark.parametrize(
"string_type", [pa.string(), pa.large_string(), pa.string_view()]
@@ -1927,13 +1920,13 @@ def test_table_str_to_categorical_with_na(self, string_type):
zero_copy_only=True)
# chunked array
- result = table["strings"].to_pandas(strings_to_categorical=True)
+ result = table.column("strings").to_pandas(strings_to_categorical=True)
expected = pd.Series(pd.Categorical(values), name="strings")
tm.assert_series_equal(result, expected)
with pytest.raises(pa.ArrowInvalid):
- table["strings"].to_pandas(strings_to_categorical=True,
- zero_copy_only=True)
+ table.column("strings").to_pandas(strings_to_categorical=True,
+ zero_copy_only=True)
# Regression test for ARROW-2101
def test_array_of_bytes_to_strings(self):
@@ -2515,7 +2508,7 @@ def test_auto_chunking_on_list_overflow(self):
table = pa.Table.from_pandas(df)
table.validate(full=True)
- column_a = table[0]
+ column_a = table.column(0)
assert column_a.num_chunks == 2
assert len(column_a.chunk(0)) == 2**21 - 1
assert len(column_a.chunk(1)) == 1
@@ -3159,9 +3152,8 @@ def test_strided_data_import(self):
boolean_objects[5] = None
cases.append(boolean_objects)
- cases.append(np.arange("2016-01-01T00:00:00.001", N * K,
- dtype='datetime64[ms]')
- .reshape(N, K).copy())
+ cases.append(pd.date_range("2016-01-01T00:00:00.001", periods=N * K, freq='ms')
+ .values.reshape(N, K).copy())
strided_mask = (random_numbers > 0).astype(bool)[:, 0]
@@ -3775,8 +3767,8 @@ def test_recordbatchlist_to_pandas():
def test_recordbatch_table_pass_name_to_pandas():
rb = pa.record_batch([pa.array([1, 2, 3, 4])], names=['a0'])
t = pa.table([pa.array([1, 2, 3, 4])], names=['a0'])
- assert rb[0].to_pandas().name == 'a0'
- assert t[0].to_pandas().name == 'a0'
+ assert rb.column(0).to_pandas().name == 'a0'
+ assert t.column(0).to_pandas().name == 'a0'
# ----------------------------------------------------------------------
@@ -4314,13 +4306,13 @@ def test_array_protocol():
# default conversion
result = pa.table(df)
expected = pa.array([1, 2, None], pa.int64())
- assert result[0].chunk(0).equals(expected)
+ assert result.column(0).chunk(0).equals(expected)
# with specifying schema
schema = pa.schema([('a', pa.float64())])
result = pa.table(df, schema=schema)
expected2 = pa.array([1, 2, None], pa.float64())
- assert result[0].chunk(0).equals(expected2)
+ assert result.column(0).chunk(0).equals(expected2)
# pass Series to pa.array
result = pa.array(df['a'])
@@ -4450,7 +4442,7 @@ def __init__(self):
def __arrow_ext_serialize__(self):
return b''
- def to_pandas_dtype(self):
+ def to_pandas_dtype(self): # type: ignore[override]
return pd.Int64Dtype()
@@ -4550,7 +4542,7 @@ def test_array_to_pandas():
expected = pd.Series(arr)
tm.assert_series_equal(result, expected)
- result = pa.table({"col": arr})["col"].to_pandas()
+ result = pa.table({"col": arr}).column("col").to_pandas()
expected = pd.Series(arr, name="col")
tm.assert_series_equal(result, expected)
@@ -4609,7 +4601,6 @@ def test_array_to_pandas_types_mapper():
assert result.dtype == np.dtype("int64")
-@pytest.mark.pandas
def test_chunked_array_to_pandas_types_mapper():
# https://issues.apache.org/jira/browse/ARROW-9664
if Version(pd.__version__) < Version("1.2.0"):
@@ -5100,7 +5091,7 @@ def test_roundtrip_nested_map_array_with_pydicts_sliced():
ty = pa.list_(pa.map_(pa.string(), pa.list_(pa.string())))
- def assert_roundtrip(series: pd.Series, data) -> None:
+ def assert_roundtrip(series, data):
array_roundtrip = pa.chunked_array(pa.Array.from_pandas(series, type=ty))
array_roundtrip.validate(full=True)
assert data.equals(array_roundtrip)
diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py
index 65f0c608136..20a33a382e4 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -20,11 +20,12 @@
import pytest
import weakref
from collections.abc import Sequence, Mapping
+from typing import cast
try:
import numpy as np
except ImportError:
- np = None
+ pass
import pyarrow as pa
import pyarrow.compute as pc
@@ -68,7 +69,7 @@
pa.Time32Scalar),
(datetime.datetime.now().time(), None, pa.Time64Scalar),
(datetime.timedelta(days=1), None, pa.DurationScalar),
- (pa.MonthDayNano([1, -1, -10100]), None,
+ (pa.MonthDayNano([1, -1, -10100]), None, # type: ignore[call-arg, arg-type]
pa.MonthDayNanoIntervalScalar),
({'a': 1, 'b': [1, 2]}, None, pa.StructScalar),
([('a', 1), ('b', 2)], pa.map_(pa.string(), pa.int8()), pa.MapScalar),
@@ -360,7 +361,8 @@ def test_time_from_datetime_time():
def test_temporal_values(value, time_type: pa.DataType):
time_scalar = pa.scalar(value, type=time_type)
time_scalar.validate(full=True)
- assert time_scalar.value == value
+ assert (time_scalar.value # type: ignore[union-attr, reportAttributeAccessIssue]
+ == value)
def test_cast():
@@ -422,7 +424,9 @@ def test_timestamp():
expected = pd.Timestamp('2000-01-01 12:34:56')
assert arrow_arr[0].as_py() == expected
- assert arrow_arr[0].value * 1000**i == expected.value
+ value = cast(pa.TimestampScalar, arrow_arr[0]).value
+ assert value is not None
+ assert value * 1000**i == expected.value
tz = 'America/New_York'
arrow_type = pa.timestamp(unit, tz=tz)
@@ -434,7 +438,9 @@ def test_timestamp():
.tz_convert(tz))
assert arrow_arr[0].as_py() == expected
- assert arrow_arr[0].value * 1000**i == expected.value
+ value = cast(pa.TimestampScalar, arrow_arr[0]).value
+ assert value is not None
+ assert value * 1000**i == expected.value
@pytest.mark.nopandas
@@ -529,7 +535,7 @@ def test_duration_nanos_nopandas():
def test_month_day_nano_interval():
- triple = pa.MonthDayNano([-3600, 1800, -50])
+ triple = pa.MonthDayNano([-3600, 1800, -50]) # type: ignore[invalid-argument-type]
arr = pa.array([triple])
assert isinstance(arr[0].as_py(), pa.MonthDayNano)
assert arr[0].as_py() == triple
@@ -577,7 +583,7 @@ def test_binary(value, ty, scalar_typ):
with pytest.raises(ValueError):
memoryview(s)
else:
- assert buf.to_pybytes() == value
+ assert buf.to_pybytes() == value # type: ignore[union-attr]
assert isinstance(buf, pa.Buffer)
assert bytes(s) == value
@@ -852,7 +858,7 @@ def test_dictionary(pickle_module):
assert arr.to_pylist() == expected
for j, (i, v) in enumerate(zip(indices, expected)):
- s = arr[j]
+ s = cast(pa.DictionaryScalar, arr[j])
assert s.as_py() == v
assert s.value.as_py() == v
@@ -868,14 +874,14 @@ def test_run_end_encoded():
values = [1, 2, 1, None, 3]
arr = pa.RunEndEncodedArray.from_arrays(run_ends, values)
- scalar = arr[0]
+ scalar = cast(pa.RunEndEncodedScalar, arr[0])
assert isinstance(scalar, pa.RunEndEncodedScalar)
assert isinstance(scalar.value, pa.Int64Scalar)
assert scalar.value == pa.array(values)[0]
assert scalar.as_py() == 1
# null -> .value is still a scalar, as_py returns None
- scalar = arr[10]
+ scalar = cast(pa.RunEndEncodedScalar, arr[10])
assert isinstance(scalar.value, pa.Int64Scalar)
assert scalar.as_py() is None
@@ -901,13 +907,13 @@ def test_union(pickle_module):
with pytest.raises(pa.ArrowNotImplementedError):
pickle_module.loads(pickle_module.dumps(s))
- assert arr[0].type_code == 0
+ assert cast(pa.UnionScalar, arr[0]).type_code == 0
assert arr[0].as_py() == "a"
- assert arr[1].type_code == 0
+ assert cast(pa.UnionScalar, arr[1]).type_code == 0
assert arr[1].as_py() == "b"
- assert arr[2].type_code == 1
+ assert cast(pa.UnionScalar, arr[2]).type_code == 1
assert arr[2].as_py() == 3
- assert arr[3].type_code == 1
+ assert cast(pa.UnionScalar, arr[3]).type_code == 1
assert arr[3].as_py() == 4
# dense
@@ -927,9 +933,9 @@ def test_union(pickle_module):
with pytest.raises(pa.ArrowNotImplementedError):
pickle_module.loads(pickle_module.dumps(s))
- assert arr[0].type_code == 0
+ assert cast(pa.UnionScalar, arr[0]).type_code == 0
assert arr[0].as_py() == b'a'
- assert arr[5].type_code == 1
+ assert cast(pa.UnionScalar, arr[5]).type_code == 1
assert arr[5].as_py() == 3
diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py
index 029e14ca162..5a7b9989358 100644
--- a/python/pyarrow/tests/test_schema.py
+++ b/python/pyarrow/tests/test_schema.py
@@ -23,7 +23,7 @@
try:
import numpy as np
except ImportError:
- np = None
+ pass
import pyarrow as pa
import pyarrow.tests.util as test_util
@@ -259,7 +259,7 @@ def test_schema():
child 0, item: int8"""
with pytest.raises(TypeError):
- pa.schema([None])
+ pa.schema([None]) # type: ignore[list-item]
def test_schema_weakref():
@@ -548,7 +548,7 @@ def test_schema_equals_invalid_type():
for val in [None, 'string', pa.array([1, 2])]:
with pytest.raises(TypeError):
- schema.equals(val)
+ schema.equals(val) # type: ignore[invalid-argument-type]
def test_schema_equality_operators():
@@ -594,7 +594,7 @@ def test_schema_get_fields():
with pytest.raises(KeyError):
schema.field('other')
with pytest.raises(TypeError):
- schema.field(0.0)
+ schema.field(0.0) # type: ignore[arg-type]
with pytest.raises(IndexError):
schema.field(4)
@@ -706,6 +706,7 @@ def test_empty_table():
assert table.schema == schema
+@pytest.mark.numpy
@pytest.mark.pandas
def test_schema_from_pandas():
import pandas as pd
@@ -782,7 +783,7 @@ def test_schema_merge():
# raise proper error when passing a non-Schema value
with pytest.raises(TypeError):
- pa.unify_schemas([a, 1])
+ pa.unify_schemas([a, 1]) # type: ignore[list-item]
def test_undecodable_metadata():
diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py
index eca8090d77a..2ce48b651b1 100644
--- a/python/pyarrow/tests/test_sparse_tensor.py
+++ b/python/pyarrow/tests/test_sparse_tensor.py
@@ -26,15 +26,16 @@
import pyarrow as pa
try:
- from scipy.sparse import csr_array, coo_array, csr_matrix, coo_matrix
+ from scipy.sparse import ( # type: ignore[reportMissingModuleSource]
+ csr_array, coo_array, csr_matrix, coo_matrix)
except ImportError:
- coo_matrix = None
- csr_matrix = None
- csr_array = None
- coo_array = None
+ coo_matrix = None # type: ignore[assignment, misc]
+ csr_matrix = None # type: ignore[assignment, misc]
+ csr_array = None # type: ignore[assignment, misc]
+ coo_array = None # type: ignore[assignment, misc]
try:
- import sparse
+ import sparse # type: ignore[import-untyped, import-not-found]
except ImportError:
sparse = None
@@ -401,7 +402,7 @@ def test_dense_to_sparse_tensor(dtype_str, arrow_type, sparse_tensor_type):
assert np.array_equal(array, result_array)
-@pytest.mark.skipif(not coo_matrix, reason="requires scipy")
+@pytest.mark.skipif(coo_matrix is None, reason="requires scipy")
@pytest.mark.parametrize('sparse_object', (coo_array, coo_matrix))
@pytest.mark.parametrize('dtype_str,arrow_type', scipy_type_pairs)
def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type,
@@ -443,7 +444,7 @@ def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type,
assert out_scipy_matrix.has_canonical_format
-@pytest.mark.skipif(not csr_matrix, reason="requires scipy")
+@pytest.mark.skipif(csr_matrix is None, reason="requires scipy")
@pytest.mark.parametrize('sparse_object', (csr_array, csr_matrix))
@pytest.mark.parametrize('dtype_str,arrow_type', scipy_type_pairs)
def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type,
@@ -483,7 +484,8 @@ def test_pydata_sparse_sparse_coo_tensor_roundtrip(dtype_str, arrow_type):
shape = (4, 6)
dim_names = ("x", "y")
- sparse_array = sparse.COO(data=data, coords=coords, shape=shape)
+ sparse_array = sparse.COO( # type: ignore[reportOptionalMemberAccess]
+ data=data, coords=coords, shape=shape)
sparse_tensor = pa.SparseCOOTensor.from_pydata_sparse(sparse_array,
dim_names=dim_names)
out_sparse_array = sparse_tensor.to_pydata_sparse()
diff --git a/python/pyarrow/tests/test_strategies.py b/python/pyarrow/tests/test_strategies.py
index babb839b534..9505b9a11b0 100644
--- a/python/pyarrow/tests/test_strategies.py
+++ b/python/pyarrow/tests/test_strategies.py
@@ -25,7 +25,7 @@
@h.given(past.all_types)
def test_types(ty):
- assert isinstance(ty, pa.lib.DataType)
+ assert isinstance(ty, pa.DataType)
@h.given(past.all_fields)
@@ -41,7 +41,7 @@ def test_schemas(schema):
@pytest.mark.numpy
@h.given(past.all_arrays)
def test_arrays(array):
- assert isinstance(array, pa.lib.Array)
+ assert isinstance(array, pa.Array)
@pytest.mark.numpy
diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py
index fcd1c8d48c5..9ad65f0738d 100644
--- a/python/pyarrow/tests/test_substrait.py
+++ b/python/pyarrow/tests/test_substrait.py
@@ -25,13 +25,10 @@
from pyarrow.lib import tobytes
from pyarrow.lib import ArrowInvalid, ArrowNotImplementedError
-try:
- import pyarrow.substrait as substrait
-except ImportError:
- substrait = None
-
# Marks all of the tests in this module
# Ignore these with pytest ... -m 'not substrait'
+substrait = pytest.importorskip('pyarrow.substrait')
+_substrait = pytest.importorskip('pyarrow._substrait')
pytestmark = pytest.mark.substrait
@@ -85,7 +82,7 @@ def test_run_serialized_query(tmpdir, use_threads):
query = tobytes(substrait_query.replace(
"FILENAME_PLACEHOLDER", pathlib.Path(path).as_uri()))
- buf = pa._substrait._parse_json_plan(query)
+ buf = _substrait._parse_json_plan(query)
reader = substrait.run_query(buf, use_threads=use_threads)
res_tb = reader.read_all()
@@ -116,7 +113,7 @@ def test_invalid_plan():
]
}
"""
- buf = pa._substrait._parse_json_plan(tobytes(query))
+ buf = _substrait._parse_json_plan(tobytes(query))
exec_message = "Plan has no relations"
with pytest.raises(ArrowInvalid, match=exec_message):
substrait.run_query(buf)
@@ -162,7 +159,7 @@ def test_binary_conversion_with_json_options(tmpdir, use_threads):
path = _write_dummy_data_to_disk(tmpdir, file_name, table)
query = tobytes(substrait_query.replace(
"FILENAME_PLACEHOLDER", pathlib.Path(path).as_uri()))
- buf = pa._substrait._parse_json_plan(tobytes(query))
+ buf = _substrait._parse_json_plan(tobytes(query))
reader = substrait.run_query(buf, use_threads=use_threads)
res_tb = reader.read_all()
@@ -181,7 +178,7 @@ def has_function(fns, ext_file, fn_name):
def test_get_supported_functions():
- supported_functions = pa._substrait.get_supported_functions()
+ supported_functions = _substrait.get_supported_functions()
# It probably doesn't make sense to exhaustively verify this list but
# we can check a sample aggregate and a sample non-aggregate entry
assert has_function(supported_functions,
@@ -232,8 +229,8 @@ def table_provider(names, schema):
}
"""
- buf = pa._substrait._parse_json_plan(tobytes(substrait_query))
- reader = pa.substrait.run_query(
+ buf = _substrait._parse_json_plan(tobytes(substrait_query))
+ reader = substrait.run_query(
buf, table_provider=table_provider, use_threads=use_threads)
res_tb = reader.read_all()
assert res_tb == test_table_1
@@ -275,7 +272,7 @@ def table_provider(names, _):
}
"""
- buf = pa._substrait._parse_json_plan(tobytes(substrait_query))
+ buf = _substrait._parse_json_plan(tobytes(substrait_query))
exec_message = "Invalid NamedTable Source"
with pytest.raises(ArrowInvalid, match=exec_message):
substrait.run_query(buf, table_provider=table_provider)
@@ -317,7 +314,7 @@ def table_provider(names, _):
}
"""
query = tobytes(substrait_query)
- buf = pa._substrait._parse_json_plan(tobytes(query))
+ buf = _substrait._parse_json_plan(tobytes(query))
exec_message = "names for NamedTable not provided"
with pytest.raises(ArrowInvalid, match=exec_message):
substrait.run_query(buf, table_provider=table_provider)
@@ -436,8 +433,8 @@ def table_provider(names, _):
}
"""
- buf = pa._substrait._parse_json_plan(substrait_query)
- reader = pa.substrait.run_query(
+ buf = _substrait._parse_json_plan(substrait_query)
+ reader = substrait.run_query(
buf, table_provider=table_provider, use_threads=use_threads)
res_tb = reader.read_all()
@@ -559,9 +556,9 @@ def table_provider(names, _):
}
"""
- buf = pa._substrait._parse_json_plan(substrait_query)
+ buf = _substrait._parse_json_plan(substrait_query)
with pytest.raises(pa.ArrowKeyError) as excinfo:
- pa.substrait.run_query(buf, table_provider=table_provider)
+ substrait.run_query(buf, table_provider=table_provider)
assert "No function registered" in str(excinfo.value)
@@ -598,8 +595,8 @@ def table_provider(names, schema):
}
"""
- buf = pa._substrait._parse_json_plan(tobytes(substrait_query))
- reader = pa.substrait.run_query(
+ buf = _substrait._parse_json_plan(tobytes(substrait_query))
+ reader = substrait.run_query(
buf, table_provider=table_provider, use_threads=use_threads)
res_tb = reader.read_all()
@@ -744,8 +741,8 @@ def table_provider(names, _):
],
}
"""
- buf = pa._substrait._parse_json_plan(substrait_query)
- reader = pa.substrait.run_query(
+ buf = _substrait._parse_json_plan(substrait_query)
+ reader = substrait.run_query(
buf, table_provider=table_provider, use_threads=False)
res_tb = reader.read_all()
@@ -913,8 +910,8 @@ def table_provider(names, _):
],
}
"""
- buf = pa._substrait._parse_json_plan(substrait_query)
- reader = pa.substrait.run_query(
+ buf = _substrait._parse_json_plan(substrait_query)
+ reader = substrait.run_query(
buf, table_provider=table_provider, use_threads=False)
res_tb = reader.read_all()
@@ -929,8 +926,8 @@ def table_provider(names, _):
@pytest.mark.parametrize("expr", [
- pc.equal(pc.field("x"), 7),
- pc.equal(pc.field("x"), pc.field("y")),
+ pc.equal(pc.field("x"), 7), # type: ignore[attr-defined]
+ pc.equal(pc.field("x"), pc.field("y")), # type: ignore[attr-defined]
pc.field("x") > 50
])
def test_serializing_expressions(expr):
@@ -939,8 +936,8 @@ def test_serializing_expressions(expr):
pa.field("y", pa.int32())
])
- buf = pa.substrait.serialize_expressions([expr], ["test_expr"], schema)
- returned = pa.substrait.deserialize_expressions(buf)
+ buf = substrait.serialize_expressions([expr], ["test_expr"], schema)
+ returned = substrait.deserialize_expressions(buf)
assert schema == returned.schema
assert len(returned.expressions) == 1
assert "test_expr" in returned.expressions
@@ -958,8 +955,8 @@ def test_arrow_specific_types():
schema = pa.schema([pa.field(name, typ) for name, (typ, _) in fields.items()])
def check_round_trip(expr):
- buf = pa.substrait.serialize_expressions([expr], ["test_expr"], schema)
- returned = pa.substrait.deserialize_expressions(buf)
+ buf = substrait.serialize_expressions([expr], ["test_expr"], schema)
+ returned = substrait.deserialize_expressions(buf)
assert schema == returned.schema
for name, (typ, val) in fields.items():
@@ -986,8 +983,8 @@ def test_arrow_one_way_types():
def check_one_way(field):
expr = pc.is_null(pc.field(field.name))
- buf = pa.substrait.serialize_expressions([expr], ["test_expr"], schema)
- returned = pa.substrait.deserialize_expressions(buf)
+ buf = substrait.serialize_expressions([expr], ["test_expr"], schema)
+ returned = substrait.deserialize_expressions(buf)
assert alt_schema == returned.schema
for field in schema:
@@ -1003,14 +1000,14 @@ def test_invalid_expression_ser_des():
bad_expr = pc.equal(pc.field("z"), 7)
# Invalid number of names
with pytest.raises(ValueError) as excinfo:
- pa.substrait.serialize_expressions([expr], [], schema)
+ substrait.serialize_expressions([expr], [], schema)
assert 'need to have the same length' in str(excinfo.value)
with pytest.raises(ValueError) as excinfo:
- pa.substrait.serialize_expressions([expr], ["foo", "bar"], schema)
+ substrait.serialize_expressions([expr], ["foo", "bar"], schema)
assert 'need to have the same length' in str(excinfo.value)
# Expression doesn't match schema
with pytest.raises(ValueError) as excinfo:
- pa.substrait.serialize_expressions([bad_expr], ["expr"], schema)
+ substrait.serialize_expressions([bad_expr], ["expr"], schema)
assert 'No match for FieldRef' in str(excinfo.value)
@@ -1020,8 +1017,8 @@ def test_serializing_multiple_expressions():
pa.field("y", pa.int32())
])
exprs = [pc.equal(pc.field("x"), 7), pc.equal(pc.field("x"), pc.field("y"))]
- buf = pa.substrait.serialize_expressions(exprs, ["first", "second"], schema)
- returned = pa.substrait.deserialize_expressions(buf)
+ buf = substrait.serialize_expressions(exprs, ["first", "second"], schema)
+ returned = substrait.deserialize_expressions(buf)
assert schema == returned.schema
assert len(returned.expressions) == 2
@@ -1037,8 +1034,8 @@ def test_serializing_with_compute():
])
expr = pc.equal(pc.field("x"), 7)
expr_norm = pc.equal(pc.field(0), 7)
- buf = expr.to_substrait(schema)
- returned = pa.substrait.deserialize_expressions(buf)
+ buf = expr.to_substrait(schema) # type: ignore[union-attr]
+ returned = substrait.deserialize_expressions(buf)
assert schema == returned.schema
assert len(returned.expressions) == 1
@@ -1046,13 +1043,13 @@ def test_serializing_with_compute():
assert str(returned.expressions["expression"]) == str(expr_norm)
# Compute can't deserialize messages with multiple expressions
- buf = pa.substrait.serialize_expressions([expr, expr], ["first", "second"], schema)
+ buf = substrait.serialize_expressions([expr, expr], ["first", "second"], schema)
with pytest.raises(ValueError) as excinfo:
pc.Expression.from_substrait(buf)
assert 'contained multiple expressions' in str(excinfo.value)
# Deserialization should be possible regardless of the expression name
- buf = pa.substrait.serialize_expressions([expr], ["weirdname"], schema)
+ buf = substrait.serialize_expressions([expr], ["weirdname"], schema)
expr2 = pc.Expression.from_substrait(buf)
assert str(expr2) == str(expr_norm)
@@ -1069,11 +1066,11 @@ def test_serializing_udfs():
exprs = [pc.shift_left(a, b)]
with pytest.raises(ArrowNotImplementedError):
- pa.substrait.serialize_expressions(exprs, ["expr"], schema)
+ substrait.serialize_expressions(exprs, ["expr"], schema)
- buf = pa.substrait.serialize_expressions(
+ buf = substrait.serialize_expressions(
exprs, ["expr"], schema, allow_arrow_extensions=True)
- returned = pa.substrait.deserialize_expressions(buf)
+ returned = substrait.deserialize_expressions(buf)
assert schema == returned.schema
assert len(returned.expressions) == 1
assert str(returned.expressions["expr"]) == str(exprs[0])
@@ -1085,19 +1082,19 @@ def test_serializing_schema():
pa.field("x", pa.int32()),
pa.field("y", pa.string())
])
- returned = pa.substrait.deserialize_schema(substrait_schema)
+ returned = substrait.deserialize_schema(substrait_schema)
assert expected_schema == returned
- arrow_substrait_schema = pa.substrait.serialize_schema(returned)
+ arrow_substrait_schema = substrait.serialize_schema(returned)
assert arrow_substrait_schema.schema == substrait_schema
- returned = pa.substrait.deserialize_schema(arrow_substrait_schema)
+ returned = substrait.deserialize_schema(arrow_substrait_schema)
assert expected_schema == returned
- returned = pa.substrait.deserialize_schema(arrow_substrait_schema.schema)
+ returned = substrait.deserialize_schema(arrow_substrait_schema.schema)
assert expected_schema == returned
- returned = pa.substrait.deserialize_expressions(arrow_substrait_schema.expression)
+ returned = substrait.deserialize_expressions(arrow_substrait_schema.expression)
assert returned.schema == expected_schema
@@ -1114,7 +1111,7 @@ def SerializeToString(self):
b'\x1a\x19\n\x06\x12\x04\n\x02\x12\x00\x1a\x0fproject_version'
b'"0\n\x0fproject_version\n\x0fproject_release'
b'\x12\x0c\n\x04:\x02\x10\x01\n\x04b\x02\x10\x01')
- exprs = pa.substrait.BoundExpressions.from_substrait(FakeMessage(message))
+ exprs = substrait.BoundExpressions.from_substrait(FakeMessage(message))
assert len(exprs.expressions) == 2
assert 'project_release' in exprs.expressions
assert 'project_version' in exprs.expressions
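
The hunks above drop the `pa.substrait` / `pa._substrait` attribute access in favour of module-level names. A minimal sketch of the import block these edits assume follows; the actual imports live in an earlier hunk of this diff and may differ.

# Sketch of the import style assumed by the rewritten substrait tests.
import pyarrow as pa
import pyarrow.substrait as substrait
import pyarrow._substrait as _substrait  # private helpers, e.g. _parse_json_plan
from pyarrow.lib import tobytes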
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index b65fb7d952c..6263afd03a5 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -18,12 +18,13 @@
from collections import OrderedDict
from collections.abc import Iterable
import sys
+from typing import cast
import weakref
try:
import numpy as np
except ImportError:
- np = None
+ pass
import pytest
import pyarrow as pa
import pyarrow.compute as pc
@@ -418,7 +419,8 @@ def test_to_pandas_empty_table():
table = pa.table(df)
result = table.schema.empty_table().to_pandas()
assert result.shape == (0, 2)
- tm.assert_frame_equal(result, df.iloc[:0])
+ expected = df.iloc[:0]
+ tm.assert_frame_equal(result, expected)
@pytest.mark.pandas
@@ -486,12 +488,25 @@ def test_chunked_array_unify_dictionaries():
pa.array(["foo", "bar", None, "foo"]).dictionary_encode(),
pa.array(["quux", None, "foo"]).dictionary_encode(),
])
- assert arr.chunk(0).dictionary.equals(pa.array(["foo", "bar"]))
- assert arr.chunk(1).dictionary.equals(pa.array(["quux", "foo"]))
+ chunk_0 = arr.chunk(0)
+ assert isinstance(chunk_0, pa.DictionaryArray)
+ assert chunk_0.dictionary.equals(pa.array(["foo", "bar"]))
+
+ chunk_1 = arr.chunk(1)
+ assert isinstance(chunk_1, pa.DictionaryArray)
+ assert chunk_1.dictionary.equals(pa.array(["quux", "foo"]))
+
arr = arr.unify_dictionaries()
expected_dict = pa.array(["foo", "bar", "quux"])
- assert arr.chunk(0).dictionary.equals(expected_dict)
- assert arr.chunk(1).dictionary.equals(expected_dict)
+
+ chunk_0 = arr.chunk(0)
+ assert isinstance(chunk_0, pa.DictionaryArray)
+ assert chunk_0.dictionary.equals(expected_dict)
+
+ chunk_1 = arr.chunk(1)
+ assert isinstance(chunk_1, pa.DictionaryArray)
+ assert chunk_1.dictionary.equals(expected_dict)
+
assert arr.to_pylist() == ["foo", "bar", None, "foo", "quux", None, "foo"]
@@ -716,7 +731,7 @@ def test_recordbatch_take():
def test_recordbatch_column_sets_private_name():
# ARROW-6429
rb = pa.record_batch([pa.array([1, 2, 3, 4])], names=['a0'])
- assert rb[0]._name == 'a0'
+ assert rb.column(0)._name == 'a0'
def test_recordbatch_from_arrays_validate_schema():
@@ -798,7 +813,7 @@ def test_recordbatch_get_field():
batch.field('d')
with pytest.raises(TypeError):
- batch.field(None)
+ batch.field(None) # type: ignore[arg-type]
with pytest.raises(IndexError):
batch.field(4)
@@ -819,7 +834,7 @@ def test_recordbatch_select_column():
batch.column('d')
with pytest.raises(TypeError):
- batch.column(None)
+ batch.column(None) # type: ignore[arg-type]
with pytest.raises(IndexError):
batch.column(4)
@@ -933,7 +948,10 @@ def test_table_from_struct_array_chunked_array():
[[{"ints": 1}, {"floats": 1.0}]],
type=pa.struct([("ints", pa.int32()), ("floats", pa.float32())]),
)
- result = pa.Table.from_struct_array(chunked_struct_array)
+ assert isinstance(chunked_struct_array.type, pa.StructType)
+ # Cast to the proper type for the type checker
+ struct_chunked_array = cast(pa.ChunkedArray, chunked_struct_array)
+ result = pa.Table.from_struct_array(struct_chunked_array)
assert result.equals(pa.Table.from_arrays(
[
pa.array([1, None], type=pa.int32()),
@@ -1189,7 +1207,7 @@ def test_recordbatch_to_tensor_null():
batch.to_tensor()
result = batch.to_tensor(null_to_nan=True, row_major=False)
- x = np.column_stack([arr1, arr2]).astype(np.float64, order="F")
+ x = np.column_stack([arr1, arr2]).astype(np.float64, order="F") # type: ignore[no-matching-overload]
expected = pa.Tensor.from_numpy(x)
np.testing.assert_equal(result.to_numpy(), x)
@@ -1223,7 +1241,7 @@ def test_recordbatch_to_tensor_null():
)
result = batch.to_tensor(null_to_nan=True, row_major=False)
- x = np.column_stack([arr1, arr2]).astype(np.float32, order="F")
+ x = np.column_stack([arr1, arr2]).astype(np.float32, order="F") # type: ignore[no-matching-overload]
expected = pa.Tensor.from_numpy(x)
np.testing.assert_equal(result.to_numpy(), x)
@@ -1339,7 +1357,7 @@ def test_recordbatchlist_schema_equals():
def test_table_column_sets_private_name():
# ARROW-6429
t = pa.table([pa.array([1, 2, 3, 4])], names=['a0'])
- assert t[0]._name == 'a0'
+ assert t.column(0)._name == 'a0'
def test_table_equals():
@@ -1500,7 +1518,8 @@ def test_table_from_arrays_preserves_column_metadata():
field1 = pa.field('field2', pa.int64(), nullable=False)
table = pa.Table.from_arrays([arr0, arr1],
schema=pa.schema([field0, field1]))
- assert b"a" in table.field(0).metadata
+ field0_metadata = table.field(0).metadata
+ assert field0_metadata is not None and b"a" in field0_metadata
assert table.field(1).nullable is False
@@ -1565,7 +1584,7 @@ def test_table_get_field():
table.field('d')
with pytest.raises(TypeError):
- table.field(None)
+ table.field(None) # type: ignore[arg-type]
with pytest.raises(IndexError):
table.field(4)
@@ -1586,7 +1605,7 @@ def test_table_select_column():
table.column('d')
with pytest.raises(TypeError):
- table.column(None)
+ table.column(None) # type: ignore[arg-type]
with pytest.raises(IndexError):
table.column(4)
@@ -1879,22 +1898,41 @@ def test_table_unify_dictionaries():
table = pa.Table.from_batches([batch1, batch2])
table = table.replace_schema_metadata({b"key1": b"value1"})
- assert table.column(0).chunk(0).dictionary.equals(
- pa.array(["foo", "bar"]))
- assert table.column(0).chunk(1).dictionary.equals(
- pa.array(["quux", "foo"]))
- assert table.column(1).chunk(0).dictionary.equals(
- pa.array([123, 456, 789]))
- assert table.column(1).chunk(1).dictionary.equals(
- pa.array([456, 789]))
+ chunk_0_0 = table.column(0).chunk(0)
+ assert isinstance(chunk_0_0, pa.DictionaryArray)
+ assert chunk_0_0.dictionary.equals(pa.array(["foo", "bar"]))
+
+ chunk_0_1 = table.column(0).chunk(1)
+ assert isinstance(chunk_0_1, pa.DictionaryArray)
+ assert chunk_0_1.dictionary.equals(pa.array(["quux", "foo"]))
+
+ chunk_1_0 = table.column(1).chunk(0)
+ assert isinstance(chunk_1_0, pa.DictionaryArray)
+ assert chunk_1_0.dictionary.equals(pa.array([123, 456, 789]))
+
+ chunk_1_1 = table.column(1).chunk(1)
+ assert isinstance(chunk_1_1, pa.DictionaryArray)
+ assert chunk_1_1.dictionary.equals(pa.array([456, 789]))
table = table.unify_dictionaries(pa.default_memory_pool())
expected_dict_0 = pa.array(["foo", "bar", "quux"])
expected_dict_1 = pa.array([123, 456, 789])
- assert table.column(0).chunk(0).dictionary.equals(expected_dict_0)
- assert table.column(0).chunk(1).dictionary.equals(expected_dict_0)
- assert table.column(1).chunk(0).dictionary.equals(expected_dict_1)
- assert table.column(1).chunk(1).dictionary.equals(expected_dict_1)
+
+ chunk_0_0 = table.column(0).chunk(0)
+ assert isinstance(chunk_0_0, pa.DictionaryArray)
+ assert chunk_0_0.dictionary.equals(expected_dict_0)
+
+ chunk_0_1 = table.column(0).chunk(1)
+ assert isinstance(chunk_0_1, pa.DictionaryArray)
+ assert chunk_0_1.dictionary.equals(expected_dict_0)
+
+ chunk_1_0 = table.column(1).chunk(0)
+ assert isinstance(chunk_1_0, pa.DictionaryArray)
+ assert chunk_1_0.dictionary.equals(expected_dict_1)
+
+ chunk_1_1 = table.column(1).chunk(1)
+ assert isinstance(chunk_1_1, pa.DictionaryArray)
+ assert chunk_1_1.dictionary.equals(expected_dict_1)
assert table.to_pydict() == {
'a': ["foo", "bar", None, "foo", "quux", "foo", None, "quux"],
@@ -1964,13 +2002,13 @@ def test_concat_tables_invalid_option():
t = pa.Table.from_arrays([list(range(10))], names=('a',))
with pytest.raises(ValueError, match="Invalid promote_options: invalid"):
- pa.concat_tables([t, t], promote_options="invalid")
+ pa.concat_tables([t, t], promote_options="invalid") # type: ignore[arg-type]
def test_concat_tables_none_table():
# ARROW-11997
with pytest.raises(AttributeError):
- pa.concat_tables([None])
+ pa.concat_tables([None]) # type: ignore[arg-type]
@pytest.mark.pandas
@@ -2113,7 +2151,7 @@ def test_concat_batches_different_schema():
def test_concat_batches_none_batches():
# ARROW-11997
with pytest.raises(AttributeError):
- pa.concat_batches([None])
+ pa.concat_batches([None]) # type: ignore[arg-type]
@pytest.mark.parametrize(
@@ -2264,7 +2302,7 @@ def test_from_arrays_schema(data, klass):
# with different and incompatible schema
schema = pa.schema([('strs', pa.utf8()), ('floats', pa.timestamp('s'))])
with pytest.raises((NotImplementedError, TypeError)):
- pa.Table.from_pydict(data, schema=schema)
+ pa.Table.from_pydict(data, schema=schema) # type: ignore[arg-type]
# Cannot pass both schema and metadata / names
with pytest.raises(ValueError):
@@ -2369,7 +2407,7 @@ def test_table_from_pydict_arrow_arrays(data, klass):
# with different and incompatible schema
schema = pa.schema([('strs', pa.utf8()), ('floats', pa.timestamp('s'))])
with pytest.raises((NotImplementedError, TypeError)):
- pa.Table.from_pydict(data, schema=schema)
+ pa.Table.from_pydict(data, schema=schema) # type: ignore[arg-type]
@pytest.mark.parametrize('data, klass', [
@@ -2386,7 +2424,7 @@ def test_table_from_pydict_schema(data, klass):
schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float64()),
('ints', pa.int64())])
with pytest.raises(KeyError, match='ints'):
- pa.Table.from_pydict(data, schema=schema)
+ pa.Table.from_pydict(data, schema=schema) # type: ignore[arg-type]
# data has columns not present in schema -> ignored
schema = pa.schema([('strs', pa.utf8())])
@@ -2590,10 +2628,10 @@ def test_table_factory_function_args_pandas():
def test_factory_functions_invalid_input():
with pytest.raises(TypeError, match="Expected pandas DataFrame, python"):
- pa.table("invalid input")
+ pa.table("invalid input") # type: ignore[arg-type]
with pytest.raises(TypeError, match="Expected pandas DataFrame"):
- pa.record_batch("invalid input")
+ pa.record_batch("invalid input") # type: ignore[arg-type]
def test_table_repr_to_string():
@@ -2727,8 +2765,8 @@ def test_table_function_unicode_schema():
schema = pa.schema([(col_a, pa.int32()), (col_b, pa.string())])
result = pa.table(d, schema=schema)
- assert result[0].chunk(0).equals(pa.array([1, 2, 3], type='int32'))
- assert result[1].chunk(0).equals(pa.array(['a', 'b', 'c'], type='string'))
+ assert result.column(0).chunk(0).equals(pa.array([1, 2, 3], type='int32'))
+ assert result.column(1).chunk(0).equals(pa.array(['a', 'b', 'c'], type='string'))
def test_table_take_vanilla_functionality():
@@ -3603,7 +3641,7 @@ def test_chunked_array_non_cpu(cuda_context, cpu_chunked_array, cuda_chunked_arr
# equals() test
with pytest.raises(NotImplementedError):
- cuda_chunked_array == cuda_chunked_array
+ cuda_chunked_array == cuda_chunked_array # type: ignore[reportUnusedExpression]
# to_pandas() test
with pytest.raises(NotImplementedError):
@@ -3860,7 +3898,7 @@ def test_recordbatch_non_cpu(cuda_context, cpu_recordbatch, cuda_recordbatch,
# __dataframe__() test
with pytest.raises(NotImplementedError):
- from_dataframe(cuda_recordbatch.__dataframe__())
+ from_dataframe(cuda_recordbatch.__dataframe__()) # type: ignore[misc]
def verify_cuda_table(table, expected_schema):
@@ -4059,7 +4097,7 @@ def test_table_non_cpu(cuda_context, cpu_table, cuda_table,
# __dataframe__() test
with pytest.raises(NotImplementedError):
- from_dataframe(cuda_table.__dataframe__())
+ from_dataframe(cuda_table.__dataframe__()) # type: ignore[misc]
# __reduce__() test
with pytest.raises(NotImplementedError):
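
The repeated `isinstance(..., pa.DictionaryArray)` assertions above serve the type checkers as much as the tests: `ChunkedArray.chunk()` is presumably annotated as returning a plain `Array` in the stubs, so an explicit runtime check narrows the type before `.dictionary` is accessed. A small standalone illustration of the pattern, under that assumption:

import pyarrow as pa

# Build a one-chunk dictionary-encoded column.
chunked = pa.chunked_array([pa.array(["foo", "bar", "foo"]).dictionary_encode()])

chunk = chunked.chunk(0)
# Narrows Array -> DictionaryArray for both the runtime test and the checker.
assert isinstance(chunk, pa.DictionaryArray)
print(chunk.dictionary)  # ["foo", "bar"]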
diff --git a/python/pyarrow/tests/test_tensor.py b/python/pyarrow/tests/test_tensor.py
index debb1066280..c3726fdbbf4 100644
--- a/python/pyarrow/tests/test_tensor.py
+++ b/python/pyarrow/tests/test_tensor.py
@@ -213,7 +213,7 @@ def test_tensor_memoryview():
dtype = data.dtype
lst = data.tolist()
tensor = pa.Tensor.from_numpy(data)
- m = memoryview(tensor)
+ m = memoryview(tensor) # type: ignore[reportArgumentType]
assert m.format == expected_format
assert m.shape == data.shape
assert m.strides == data.strides
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index 539f0172454..c224392510d 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -24,16 +24,22 @@
import pytest
import hypothesis as h
import hypothesis.strategies as st
-try:
- import hypothesis.extra.pytz as tzst
-except ImportError:
- tzst = None
+from typing import Any, TYPE_CHECKING
import weakref
-try:
+if TYPE_CHECKING:
import numpy as np
-except ImportError:
- np = None
+ import hypothesis.extra.pytz as tzst
+else:
+ try:
+ import numpy as np
+ except ImportError:
+ np = None
+ try:
+ import hypothesis.extra.pytz as tzst
+ except ImportError:
+ tzst = None
+
import pyarrow as pa
import pyarrow.types as types
import pyarrow.tests.strategies as past
@@ -411,7 +417,7 @@ def test_tzinfo_to_string_errors():
if tzst:
timezones = tzst.timezones()
else:
- timezones = st.none()
+ timezones = st.none() # type: ignore[assignment]
@h.given(timezones)
@@ -465,7 +471,7 @@ class BuggyTimezone2(datetime.tzinfo):
def tzname(self, dt):
return None
- def utcoffset(self, dt):
+ def utcoffset(self, dt): # type: ignore[override]
return "one hour"
class BuggyTimezone3(datetime.tzinfo):
@@ -473,7 +479,7 @@ class BuggyTimezone3(datetime.tzinfo):
Wrong timezone name type
"""
- def tzname(self, dt):
+ def tzname(self, dt): # type: ignore[override]
return 240
def utcoffset(self, dt):
@@ -732,13 +738,13 @@ def test_struct_type():
# Neither integer nor string
with pytest.raises(TypeError):
- ty[None]
+ ty[None] # type: ignore[reportArgumentType]
with pytest.raises(TypeError):
- ty.field(None)
+ ty.field(None) # type: ignore[reportArgumentType]
for a, b in zip(ty, fields):
- a == b
+ assert a == b
# Construct from list of tuples
ty = pa.struct([('a', pa.int64()),
@@ -746,7 +752,7 @@ def test_struct_type():
('b', pa.int32())])
assert list(ty) == fields
for a, b in zip(ty, fields):
- a == b
+ assert a == b
# Construct from mapping
fields = [pa.field('a', pa.int64()),
@@ -755,7 +761,7 @@ def test_struct_type():
('b', pa.int32())]))
assert list(ty) == fields
for a, b in zip(ty, fields):
- a == b
+ assert a == b
# Invalid args
with pytest.raises(TypeError):
@@ -862,7 +868,7 @@ def test_dictionary_type():
# invalid index type raises
with pytest.raises(TypeError):
- pa.dictionary(pa.string(), pa.int64())
+ pa.dictionary(pa.string(), pa.int64()) # type: ignore[reportArgumentType]
def test_dictionary_ordered_equals():
@@ -951,7 +957,7 @@ def test_run_end_encoded_type():
pa.run_end_encoded(None, pa.utf8())
with pytest.raises(ValueError):
- pa.run_end_encoded(pa.int8(), pa.utf8())
+ pa.run_end_encoded(pa.int8(), pa.utf8()) # type: ignore[reportArgumentType]
@pytest.mark.parametrize('t,check_func', [
@@ -1084,12 +1090,12 @@ def test_timedelta_overflow():
pa.scalar(d, type=pa.duration('ns'))
# microsecond resolution, not overflow
- pa.scalar(d, type=pa.duration('us')).as_py() == d
+ assert pa.scalar(d, type=pa.duration('us')).as_py() == d
# second/millisecond resolution, not overflow
for d in [datetime.timedelta.min, datetime.timedelta.max]:
- pa.scalar(d, type=pa.duration('ms')).as_py() == d
- pa.scalar(d, type=pa.duration('s')).as_py() == d
+ _ = pa.scalar(d, type=pa.duration('ms')).as_py() == d
+ _ = pa.scalar(d, type=pa.duration('s')).as_py() == d
def test_type_equality_operators():
@@ -1127,11 +1133,11 @@ def test_key_value_metadata():
assert m1 != {'a': 'A', 'b': 'C'}
with pytest.raises(TypeError):
- pa.KeyValueMetadata({'a': 1})
+ pa.KeyValueMetadata({'a': 1}) # type: ignore[reportArgumentType]
with pytest.raises(TypeError):
- pa.KeyValueMetadata({1: 'a'})
+ pa.KeyValueMetadata({1: 'a'}) # type: ignore[reportArgumentType]
with pytest.raises(TypeError):
- pa.KeyValueMetadata(a=1)
+ pa.KeyValueMetadata(a=1) # type: ignore[reportArgumentType]
expected = [(b'a', b'A'), (b'b', b'B')]
result = [(k, v) for k, v in m3.items()]
@@ -1258,6 +1264,7 @@ def test_field_metadata():
assert f1.metadata is None
assert f2.metadata == {}
+ assert f3.metadata is not None
assert f3.metadata[b'bizz'] == b'bazz'
@@ -1394,7 +1401,7 @@ def __arrow_c_schema__(self):
return self.schema.__arrow_c_schema__()
-class SchemaMapping(Mapping):
+class SchemaMapping(Mapping[Any, Any]):
def __init__(self, schema):
self.schema = schema
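
The `TYPE_CHECKING` split above keeps the optional imports resolvable for static analysis while preserving the runtime fallback to `None`. The same pattern reduced to its essentials, shown here with numpy as in the hunk above:

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Checkers always see the real module, so annotations resolve.
    import numpy as np
else:
    # At runtime the dependency stays optional.
    try:
        import numpy as np
    except ImportError:
        np = None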
diff --git a/python/pyarrow/tests/test_udf.py b/python/pyarrow/tests/test_udf.py
index 93004a30618..e028f1c0484 100644
--- a/python/pyarrow/tests/test_udf.py
+++ b/python/pyarrow/tests/test_udf.py
@@ -21,7 +21,7 @@
try:
import numpy as np
except ImportError:
- np = None
+ pass
import pyarrow as pa
from pyarrow import compute as pc
@@ -35,7 +35,7 @@
try:
import pyarrow.dataset as ds
except ImportError:
- ds = None
+ pass
def mock_udf_context(batch_length=10):
@@ -381,6 +381,7 @@ def check_scalar_function(func_fixture,
func = pc.get_function(name)
assert func.name == name
+ assert batch_length is not None
result = pc.call_function(name, inputs, length=batch_length)
expected_output = function(mock_udf_context(batch_length), *inputs)
@@ -580,8 +581,8 @@ def identity(ctx, val):
}
with pytest.raises(TypeError,
match="DataType expected, got "):
- pc.register_scalar_function(identity, func_name,
- doc, in_types, out_type)
+ pc.register_scalar_function(
+ identity, func_name, doc, in_types, out_type) # type: ignore[arg-type]
def test_wrong_input_type_declaration():
@@ -597,8 +598,9 @@ def identity(ctx, val):
}
with pytest.raises(TypeError,
match="DataType expected, got "):
- pc.register_scalar_function(identity, func_name, doc,
- in_types, out_type)
+ pc.register_scalar_function(
+ identity, func_name, doc, in_types, # type: ignore[arg-type]
+ out_type)
def test_scalar_udf_context(unary_func_fixture):
diff --git a/python/pyarrow/tests/test_without_numpy.py b/python/pyarrow/tests/test_without_numpy.py
index 55c12602ce8..c5f5671aabc 100644
--- a/python/pyarrow/tests/test_without_numpy.py
+++ b/python/pyarrow/tests/test_without_numpy.py
@@ -50,6 +50,7 @@ def test_tensor_to_np():
arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]]
storage = pa.array(arr, pa.list_(pa.int32(), 4))
tensor_array = pa.ExtensionArray.from_storage(tensor_type, storage)
+ assert isinstance(tensor_array, pa.FixedShapeTensorArray)
tensor = tensor_array.to_tensor()
msg = "Cannot return a numpy.ndarray if NumPy is not present"
diff --git a/python/pyarrow/tests/util.py b/python/pyarrow/tests/util.py
index 7e3dd4324e9..fca0fec1122 100644
--- a/python/pyarrow/tests/util.py
+++ b/python/pyarrow/tests/util.py
@@ -171,7 +171,8 @@ def get_modified_env_with_pythonpath():
existing_pythonpath = env.get('PYTHONPATH', '')
module_path = os.path.abspath(
- os.path.dirname(os.path.dirname(pa.__file__)))
+ os.path.dirname(os.path.dirname( # type: ignore[no-matching-overload]
+ pa.__file__)))
if existing_pythonpath:
new_pythonpath = os.pathsep.join((module_path, existing_pythonpath))
@@ -336,6 +337,7 @@ def _ensure_minio_component_version(component, minimum_year):
stderr=subprocess.PIPE, encoding='utf-8') as proc:
if proc.wait(10) != 0:
return False
+ assert proc.stdout is not None
stdout = proc.stdout.read()
pattern = component + r' version RELEASE\.(\d+)-.*'
version_match = re.search(pattern, stdout)
@@ -367,6 +369,8 @@ def _run_mc_command(mcdir, *args):
cmd_str = ' '.join(full_args)
print(f'Cmd: {cmd_str}')
print(f' Return: {retval}')
+ assert proc.stdout is not None
+ assert proc.stderr is not None
print(f' Stdout: {proc.stdout.read()}')
print(f' Stderr: {proc.stderr.read()}')
if retval != 0:
diff --git a/python/pyarrow/vendored/docscrape.py b/python/pyarrow/vendored/docscrape.py
index 6c4d6e01400..47aeeed40ae 100644
--- a/python/pyarrow/vendored/docscrape.py
+++ b/python/pyarrow/vendored/docscrape.py
@@ -18,7 +18,7 @@
import sys
-def strip_blank_lines(l):
+def strip_blank_lines(l): # noqa: E741
"Remove leading and trailing blank lines from a list of lines"
while l and not l[0].strip():
del l[0]
@@ -62,7 +62,7 @@ def read(self):
return ''
def seek_next_non_empty_line(self):
- for l in self[self._l:]:
+ for l in self[self._l:]: # noqa: E741
if l.strip():
break
else:
@@ -185,8 +185,9 @@ def _is_at_section(self):
l2 = self._doc.peek(1).strip() # ---------- or ==========
if len(l2) >= 3 and (set(l2) in ({'-'}, {'='})) and len(l2) != len(l1):
snip = '\n'.join(self._doc._str[:2])+'...'
- self._error_location("potentially wrong underline length... \n%s \n%s in \n%s"
- % (l1, l2, snip), error=False)
+ self._error_location(
+ "potentially wrong underline length... \n%s \n%s in \n%s"
+ % (l1, l2, snip), error=False)
return l2.startswith('-'*len(l1)) or l2.startswith('='*len(l1))
def _strip(self, doc):
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 0a730fd4f78..8031c333a64 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -84,11 +84,11 @@ zip-safe=false
include-package-data=true
[tool.setuptools.packages.find]
-include = ["pyarrow"]
+include = ["pyarrow", "pyarrow.*"]
namespaces = false
[tool.setuptools.package-data]
-pyarrow = ["*.pxd", "*.pyx", "includes/*.pxd"]
+pyarrow = ["*.pxd", "*.pyx", "includes/*.pxd", "py.typed"]
[tool.setuptools_scm]
root = '..'
@@ -96,3 +96,27 @@ version_file = 'pyarrow/_generated_version.py'
version_scheme = 'guess-next-dev'
git_describe_command = 'git describe --dirty --tags --long --match "apache-arrow-[0-9]*.*"'
fallback_version = '23.0.0a0'
+
+[tool.mypy]
+files = ["pyarrow"]
+exclude = 'pyarrow/interchange/.*|pyarrow/tests/interchange/.*|pyarrow/vendored/.*|pyarrow/tests/test_cuda*'
+mypy_path = "$MYPY_CONFIG_FILE_DIR/pyarrow-stubs"
+
+[tool.pyright]
+pythonPlatform = "All"
+pythonVersion = "3.10"
+include = ["pyarrow"]
+exclude = ["pyarrow/vendored", "pyarrow/interchange", "pyarrow/tests/interchange", "pyarrow/tests/test_cuda*"]
+stubPath = "pyarrow-stubs"
+typeCheckingMode = "basic"
+
+[tool.ty.src]
+include = ["pyarrow"]
+exclude = ["pyarrow/vendored", "pyarrow/interchange", "pyarrow/tests/interchange", "pyarrow/tests/test_cuda*"]
+
+[tool.ty.environment]
+root = ["pyarrow"]
+
+[tool.ty.rules]
+unresolved-import = "ignore"
+unresolved-attribute = "ignore"
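
With the `py.typed` marker and the `pyarrow.*` subpackages included above, an installed wheel advertises its types to downstream checkers per PEP 561. A tiny consumer snippet of the kind that should then type-check without extra configuration (a sketch; the exact annotations depend on the shipped stubs):

import pyarrow as pa

# mypy/pyright resolve these attribute types from the bundled stubs.
table: pa.Table = pa.table({"x": [1, 2, 3]})
column: pa.ChunkedArray = table.column("x")
print(column.num_chunks)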
diff --git a/python/scripts/run_emscripten_tests.py b/python/scripts/run_emscripten_tests.py
index 53d3dd52bd8..6015cc211c1 100644
--- a/python/scripts/run_emscripten_tests.py
+++ b/python/scripts/run_emscripten_tests.py
@@ -114,7 +114,7 @@ def end_headers(self):
def run_server_thread(dist_dir, q):
- global _SERVER_ADDRESS
+ global _SERVER_ADDRESS # noqa: F824
os.chdir(dist_dir)
server = http.server.HTTPServer(("", 0), TemplateOverrider)
q.put(server.server_address)
diff --git a/python/setup.py b/python/setup.py
index a27bd3baefd..a25d2d76b36 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -121,8 +121,35 @@ def build_extensions(self):
def run(self):
self._run_cmake()
+ self._copy_stubs()
_build_ext.run(self)
+ def _copy_stubs(self):
+ """Copy .pyi stub files from pyarrow-stubs to the build directory."""
+ build_cmd = self.get_finalized_command('build')
+ build_lib = os.path.abspath(build_cmd.build_lib)
+
+ stubs_src = pjoin(setup_dir, 'pyarrow-stubs', 'pyarrow')
+ stubs_dest = pjoin(build_lib, 'pyarrow')
+
+ if os.path.exists(stubs_src):
+ print(f"-- Copying stub files from {stubs_src} to {stubs_dest}")
+ for root, dirs, files in os.walk(stubs_src):
+ # Calculate relative path from stubs_src
+ rel_dir = os.path.relpath(root, stubs_src)
+ dest_dir = pjoin(stubs_dest, rel_dir) if rel_dir != '.' else stubs_dest
+
+ # Create destination directory if needed
+ if not os.path.exists(dest_dir):
+ os.makedirs(dest_dir)
+
+ # Copy .pyi files
+ for file in files:
+ if file.endswith('.pyi'):
+ src_file = pjoin(root, file)
+ dest_file = pjoin(dest_dir, file)
+ shutil.copy2(src_file, dest_file)
+
# adapted from cmake_build_ext in dynd-python
# github.com/libdynd/dynd-python
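
The `_copy_stubs` hook above mirrors the `.pyi` files into the build tree so they ship next to the extension modules. A quick post-build sanity check one could run locally (a sketch; the build path is illustrative and depends on the local setuptools layout):

import pathlib

# Did the .pyi files land next to the built modules?
build_pkg = pathlib.Path("build/lib/pyarrow")
if build_pkg.exists():
    stubs = sorted(p.relative_to(build_pkg) for p in build_pkg.rglob("*.pyi"))
    print(f"{len(stubs)} stub files copied into {build_pkg}")
else:
    print("build tree not found; run the Python build step first")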