From 2044e990623b53dd2199474762e136a4f5c700fb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 17 Dec 2025 22:24:28 +0000 Subject: [PATCH 1/7] Initial plan From 68f8583d70c408f0f9e293fbd8f117dfe1ce52e8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 17 Dec 2025 22:35:46 +0000 Subject: [PATCH 2/7] Add support for multidimensional numpy arrays in pa.array() Co-authored-by: NickCrews <10820686+NickCrews@users.noreply.github.com> --- python/pyarrow/array.pxi | 15 +++++++++++++ python/pyarrow/tests/test_array.py | 36 ++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 575b628db3a..24470fbe003 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -299,6 +299,21 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, mask = None if values.mask is np.ma.nomask else values.mask values = values.data + # Handle multidimensional numpy arrays by converting to nested lists + if isinstance(values, np.ndarray) and values.ndim > 1: + if mask is not None: + raise NotImplementedError( + "mask is not supported for multidimensional arrays") + if size is not None: + raise NotImplementedError( + "size is not supported for multidimensional arrays") + # Convert to list and use sequence conversion path + result = _sequence_to_array(values.tolist(), None, None, type, pool, + c_from_pandas) + if extension_type is not None: + result = ExtensionArray.from_storage(extension_type, result) + return result + if mask is not None: if mask.dtype != np.bool_: raise TypeError("Mask must be boolean dtype") diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index ec361159c5f..d63b139c525 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2478,6 +2478,42 @@ def 
test_array_from_numpy_datetime(dtype, type): assert arr.equals(expected) +@pytest.mark.numpy +def test_array_from_numpy_multidimensional(): + # GH-XXXXX: support reading multidimensional numpy arrays + # Test 2D array + np_arr_2d = np.arange(6).reshape(2, 3) + pa_arr_2d = pa.array(np_arr_2d) + expected_2d = pa.array([[0, 1, 2], [3, 4, 5]]) + assert pa_arr_2d.equals(expected_2d) + + # Test 3D array (example from the issue) + np_arr_3d = np.arange(24).reshape(2, 3, 4) + pa_arr_3d = pa.array(np_arr_3d) + expected_3d = pa.array(np_arr_3d.tolist()) + assert pa_arr_3d.equals(expected_3d) + + # Test with different dtypes + np_arr_float = np.array([[1.5, 2.5], [3.5, 4.5]]) + pa_arr_float = pa.array(np_arr_float) + expected_float = pa.array([[1.5, 2.5], [3.5, 4.5]]) + assert pa_arr_float.equals(expected_float) + + # Test with explicit type + np_arr_typed = np.array([[1, 2], [3, 4]], dtype=np.int32) + pa_arr_typed = pa.array(np_arr_typed, type=pa.list_(pa.int32())) + expected_typed = pa.array([[1, 2], [3, 4]], type=pa.list_(pa.int32())) + assert pa_arr_typed.equals(expected_typed) + + # Test that mask is not supported for multidimensional arrays + with pytest.raises(NotImplementedError, match="mask is not supported"): + pa.array(np_arr_2d, mask=np.array([True, False])) + + # Test that size is not supported for multidimensional arrays + with pytest.raises(NotImplementedError, match="size is not supported"): + pa.array(np_arr_2d, size=2) + + @pytest.mark.numpy def test_array_from_different_numpy_datetime_units_raises(): data = [ From 46c098cf62a73bf293c25508586e297f489357db Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 17 Dec 2025 22:37:45 +0000 Subject: [PATCH 3/7] Add documentation for multidimensional numpy array support Co-authored-by: NickCrews <10820686+NickCrews@users.noreply.github.com> --- python/pyarrow/array.pxi | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git 
a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 24470fbe003..c548957c799 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -174,6 +174,12 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, Notes ----- + Multidimensional numpy arrays are supported and will be converted to + nested list arrays. For example, a 2D array of shape (2, 3) will be + converted to a list array of 2 lists, each containing 3 elements. + Note that mask and size parameters are not supported for multidimensional + arrays. + Timezone will be preserved in the returned array for timezone-aware data, else no timezone will be returned for naive timestamps. Internally, UTC values are stored for timezone-aware data with the @@ -229,6 +235,24 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, >>> arr = pa.array(range(1024), type=pa.dictionary(pa.int8(), pa.int64())) >>> arr.type.index_type DataType(int16) + + Multidimensional numpy arrays are supported: + + >>> np_2d = np.arange(6).reshape(2, 3) + >>> pa.array(np_2d) + <pyarrow.lib.ListArray object at ...> + [ + [ + 0, + 1, + 2 + ], + [ + 3, + 4, + 5 + ] + ] """ cdef: CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) From ab2e1ac0dde6590ec52f4ba095085dcb39ded374 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 17 Dec 2025 22:39:19 +0000 Subject: [PATCH 4/7] Address code review feedback: fix formatting and remove placeholder Co-authored-by: NickCrews <10820686+NickCrews@users.noreply.github.com> --- python/pyarrow/array.pxi | 4 ++-- python/pyarrow/tests/test_array.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index c548957c799..d06295cabc6 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -332,8 +332,8 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, raise NotImplementedError( "size is not supported for
multidimensional arrays") # Convert to list and use sequence conversion path - result = _sequence_to_array(values.tolist(), None, None, type, pool, - c_from_pandas) + result = _sequence_to_array(values.tolist(), None, None, type, pool, + c_from_pandas) if extension_type is not None: result = ExtensionArray.from_storage(extension_type, result) return result diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index d63b139c525..0a8f953430b 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2480,35 +2480,35 @@ def test_array_from_numpy_datetime(dtype, type): @pytest.mark.numpy def test_array_from_numpy_multidimensional(): - # GH-XXXXX: support reading multidimensional numpy arrays + # Support reading multidimensional numpy arrays # Test 2D array np_arr_2d = np.arange(6).reshape(2, 3) pa_arr_2d = pa.array(np_arr_2d) expected_2d = pa.array([[0, 1, 2], [3, 4, 5]]) assert pa_arr_2d.equals(expected_2d) - + # Test 3D array (example from the issue) np_arr_3d = np.arange(24).reshape(2, 3, 4) pa_arr_3d = pa.array(np_arr_3d) expected_3d = pa.array(np_arr_3d.tolist()) assert pa_arr_3d.equals(expected_3d) - + # Test with different dtypes np_arr_float = np.array([[1.5, 2.5], [3.5, 4.5]]) pa_arr_float = pa.array(np_arr_float) expected_float = pa.array([[1.5, 2.5], [3.5, 4.5]]) assert pa_arr_float.equals(expected_float) - + # Test with explicit type np_arr_typed = np.array([[1, 2], [3, 4]], dtype=np.int32) pa_arr_typed = pa.array(np_arr_typed, type=pa.list_(pa.int32())) expected_typed = pa.array([[1, 2], [3, 4]], type=pa.list_(pa.int32())) assert pa_arr_typed.equals(expected_typed) - + # Test that mask is not supported for multidimensional arrays with pytest.raises(NotImplementedError, match="mask is not supported"): pa.array(np_arr_2d, mask=np.array([True, False])) - + # Test that size is not supported for multidimensional arrays with pytest.raises(NotImplementedError, match="size is not supported"): 
pa.array(np_arr_2d, size=2) From 52afba2c174fc5d9c4245566bb6a7922686c7431 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 02:19:27 +0000 Subject: [PATCH 5/7] Implement efficient recursive FixedSizeListArray for multidimensional arrays Co-authored-by: NickCrews <10820686+NickCrews@users.noreply.github.com> --- python/pyarrow/array.pxi | 35 +++++++++++++++++++++++++++--- python/pyarrow/tests/test_array.py | 30 +++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index d06295cabc6..3525984069f 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -177,6 +177,9 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, Multidimensional numpy arrays are supported and will be converted to nested list arrays. For example, a 2D array of shape (2, 3) will be converted to a list array of 2 lists, each containing 3 elements. + For C-contiguous arrays (default numpy layout), conversion is zero-copy + and very efficient. For non-contiguous arrays (e.g., transposed, sliced, + or Fortran-ordered), a conversion via Python lists is used. Note that mask and size parameters are not supported for multidimensional arrays. 
@@ -331,9 +334,35 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, if size is not None: raise NotImplementedError( "size is not supported for multidimensional arrays") - # Convert to list and use sequence conversion path - result = _sequence_to_array(values.tolist(), None, None, type, pool, - c_from_pandas) + + # For efficiency, use recursive FixedSizeListArray construction + # for C-contiguous arrays (zero-copy), otherwise use .tolist() + if values.flags['C_CONTIGUOUS']: + # Efficient path: flatten to 1D, convert with zero-copy, + # then wrap in nested FixedSizeListArray layers + shape = values.shape + flat = values.ravel() + + # Convert flattened 1D array to Arrow (zero-copy) + base_arr = _ndarray_to_array(flat, None, None, c_from_pandas, + safe, pool) + + # Build nested FixedSizeListArray from innermost to outermost + # For shape (2, 3, 4), we create: + # FixedSizeList[4] -> FixedSizeList[3] -> 2 elements + result = base_arr + for dim_size in reversed(shape[1:]): + result = FixedSizeListArray.from_arrays(result, int(dim_size)) + + # Apply explicit type if provided + if type is not None: + result = result.cast(type, safe=safe, memory_pool=memory_pool) + else: + # Non-contiguous arrays: fallback to .tolist() + # This handles transposed, sliced, and F-contiguous arrays + result = _sequence_to_array(values.tolist(), None, None, type, pool, + c_from_pandas) + if extension_type is not None: result = ExtensionArray.from_storage(extension_type, result) return result diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 0a8f953430b..7bb095452a8 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2513,6 +2513,36 @@ def test_array_from_numpy_multidimensional(): with pytest.raises(NotImplementedError, match="size is not supported"): pa.array(np_arr_2d, size=2) + # Test with transposed (non-contiguous) array + np_arr_transposed = np_arr_2d.T + assert not 
np_arr_transposed.flags['C_CONTIGUOUS'] + pa_arr_transposed = pa.array(np_arr_transposed) + expected_transposed = pa.array(np_arr_transposed.tolist()) + assert pa_arr_transposed.equals(expected_transposed) + + # Test with sliced (non-contiguous) array + np_arr_sliced = np.arange(12).reshape(3, 4)[:, ::2] + assert not np_arr_sliced.flags['C_CONTIGUOUS'] + pa_arr_sliced = pa.array(np_arr_sliced) + expected_sliced = pa.array(np_arr_sliced.tolist()) + assert pa_arr_sliced.equals(expected_sliced) + + # Test with Fortran-contiguous array + np_arr_fortran = np.asfortranarray(np.arange(6).reshape(2, 3)) + assert not np_arr_fortran.flags['C_CONTIGUOUS'] + assert np_arr_fortran.flags['F_CONTIGUOUS'] + pa_arr_fortran = pa.array(np_arr_fortran) + expected_fortran = pa.array(np_arr_fortran.tolist()) + assert pa_arr_fortran.equals(expected_fortran) + + # Verify that C-contiguous arrays use efficient path + # (result should be identical to tolist() result) + np_arr_contiguous = np.arange(24).reshape(2, 3, 4) + assert np_arr_contiguous.flags['C_CONTIGUOUS'] + pa_arr_efficient = pa.array(np_arr_contiguous) + pa_arr_tolist = pa.array(np_arr_contiguous.tolist()) + assert pa_arr_efficient.equals(pa_arr_tolist) + @pytest.mark.numpy def test_array_from_different_numpy_datetime_units_raises(): From 4136be2c0f12c0e4beacc224f8586697d4459d5a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 06:40:40 +0000 Subject: [PATCH 6/7] Fix Python/Cython formatting (remove trailing whitespace) Co-authored-by: NickCrews <10820686+NickCrews@users.noreply.github.com> --- python/pyarrow/array.pxi | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 3525984069f..255fa0acbb0 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -334,7 +334,7 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, if size is not 
None: raise NotImplementedError( "size is not supported for multidimensional arrays") - + # For efficiency, use recursive FixedSizeListArray construction # for C-contiguous arrays (zero-copy), otherwise use .tolist() if values.flags['C_CONTIGUOUS']: @@ -342,18 +342,18 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, # then wrap in nested FixedSizeListArray layers shape = values.shape flat = values.ravel() - + # Convert flattened 1D array to Arrow (zero-copy) base_arr = _ndarray_to_array(flat, None, None, c_from_pandas, - safe, pool) - + safe, pool) + # Build nested FixedSizeListArray from innermost to outermost # For shape (2, 3, 4), we create: # FixedSizeList[4] -> FixedSizeList[3] -> 2 elements result = base_arr for dim_size in reversed(shape[1:]): result = FixedSizeListArray.from_arrays(result, int(dim_size)) - + # Apply explicit type if provided if type is not None: result = result.cast(type, safe=safe, memory_pool=memory_pool) @@ -362,7 +362,7 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, # This handles transposed, sliced, and F-contiguous arrays result = _sequence_to_array(values.tolist(), None, None, type, pool, c_from_pandas) - + if extension_type is not None: result = ExtensionArray.from_storage(extension_type, result) return result From 937b64cb96905edaacc31925f66385571a5c78d5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 06:45:49 +0000 Subject: [PATCH 7/7] Fix tests: compare against tolist() results for correct type matching Co-authored-by: NickCrews <10820686+NickCrews@users.noreply.github.com> --- python/pyarrow/tests/test_array.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 7bb095452a8..da6a3c2e0ee 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2484,7 +2484,7 @@ def 
test_array_from_numpy_multidimensional(): # Test 2D array np_arr_2d = np.arange(6).reshape(2, 3) pa_arr_2d = pa.array(np_arr_2d) - expected_2d = pa.array([[0, 1, 2], [3, 4, 5]]) + expected_2d = pa.array(np_arr_2d.tolist()) assert pa_arr_2d.equals(expected_2d) # Test 3D array (example from the issue) @@ -2496,13 +2496,13 @@ def test_array_from_numpy_multidimensional(): # Test with different dtypes np_arr_float = np.array([[1.5, 2.5], [3.5, 4.5]]) pa_arr_float = pa.array(np_arr_float) - expected_float = pa.array([[1.5, 2.5], [3.5, 4.5]]) + expected_float = pa.array(np_arr_float.tolist()) assert pa_arr_float.equals(expected_float) # Test with explicit type np_arr_typed = np.array([[1, 2], [3, 4]], dtype=np.int32) pa_arr_typed = pa.array(np_arr_typed, type=pa.list_(pa.int32())) - expected_typed = pa.array([[1, 2], [3, 4]], type=pa.list_(pa.int32())) + expected_typed = pa.array(np_arr_typed.tolist(), type=pa.list_(pa.int32())) assert pa_arr_typed.equals(expected_typed) # Test that mask is not supported for multidimensional arrays