diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 575b628db3a..255fa0acbb0 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -174,6 +174,15 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
 
     Notes
     -----
+    Multidimensional numpy arrays are supported and will be converted to
+    nested list arrays. For example, a 2D array of shape (2, 3) will be
+    converted to a list array of 2 lists, each containing 3 elements.
+    For C-contiguous arrays (default numpy layout), the flattened values are
+    converted directly and only list offsets are added. For non-contiguous
+    arrays (e.g., transposed, sliced, or Fortran-ordered), a conversion via
+    Python lists is used. The mask and size parameters are not supported
+    for multidimensional arrays.
+
     Timezone will be preserved in the returned array for timezone-aware data,
     else no timezone will be returned for naive timestamps.
     Internally, UTC values are stored for timezone-aware data with the
@@ -229,6 +238,24 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
     >>> arr = pa.array(range(1024), type=pa.dictionary(pa.int8(), pa.int64()))
     >>> arr.type.index_type
     DataType(int16)
+
+    Multidimensional numpy arrays are supported:
+
+    >>> np_2d = np.arange(6).reshape(2, 3)
+    >>> pa.array(np_2d)
+    <pyarrow.lib.ListArray object at ...>
+    [
+      [
+        0,
+        1,
+        2
+      ],
+      [
+        3,
+        4,
+        5
+      ]
+    ]
     """
     cdef:
         CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
@@ -299,6 +326,52 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
                 mask = None if values.mask is np.ma.nomask else values.mask
                 values = values.data
 
+        # Handle multidimensional numpy arrays by converting to nested lists
+        if isinstance(values, np.ndarray) and values.ndim > 1:
+            if mask is not None:
+                raise NotImplementedError(
+                    "mask is not supported for multidimensional arrays")
+            if size is not None:
+                raise NotImplementedError(
+                    "size is not supported for multidimensional arrays")
+
+            # C-contiguous input is converted once as a flat 1D array and
+            # wrapped in list layers; any other layout goes via .tolist()
+            if values.flags['C_CONTIGUOUS']:
+                shape = values.shape
+                # ravel() of a C-contiguous array is a view, not a copy
+                flat = values.ravel()
+
+                # Convert flattened 1D array to Arrow
+                base_arr = _ndarray_to_array(flat, None, None, c_from_pandas,
+                                             safe, pool)
+
+                # Build ListArray layers from innermost to outermost so the
+                # result type (list<...>) matches the .tolist() fallback
+                # regardless of memory layout. For shape (2, 3, 4):
+                # offsets [0, 4, ..., 24] then offsets [0, 3, 6]
+                result = base_arr
+                for axis in range(values.ndim - 1, 0, -1):
+                    n_lists = int(np.prod(shape[:axis], dtype=np.int64))
+                    # int64 so that an oversized input fails the int32
+                    # offsets conversion instead of silently wrapping
+                    offsets = np.arange(n_lists + 1, dtype=np.int64)
+                    offsets *= int(shape[axis])
+                    result = ListArray.from_arrays(offsets, result)
+
+                # Apply explicit type if provided
+                if type is not None:
+                    result = result.cast(type, safe=safe, memory_pool=memory_pool)
+            else:
+                # Non-contiguous arrays: fallback to .tolist()
+                # This handles transposed, sliced, and F-contiguous arrays
+                result = _sequence_to_array(values.tolist(), None, None, type, pool,
+                                            c_from_pandas)
+
+            if extension_type is not None:
+                result = ExtensionArray.from_storage(extension_type, result)
+            return result
+
         if mask is not None:
             if mask.dtype != np.bool_:
                 raise TypeError("Mask must be boolean dtype")
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index ec361159c5f..da6a3c2e0ee 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -2478,6 +2478,79 @@ def test_array_from_numpy_datetime(dtype, type):
     assert arr.equals(expected)
 
 
+@pytest.mark.numpy
+def test_array_from_numpy_multidimensional():
+    # Support reading multidimensional numpy arrays
+    # Test 2D array
+    np_arr_2d = np.arange(6).reshape(2, 3)
+    pa_arr_2d = pa.array(np_arr_2d)
+    expected_2d = pa.array(np_arr_2d.tolist())
+    assert pa_arr_2d.equals(expected_2d)
+
+    # Test 3D array (example from the issue)
+    np_arr_3d = np.arange(24).reshape(2, 3, 4)
+    pa_arr_3d = pa.array(np_arr_3d)
+    expected_3d = pa.array(np_arr_3d.tolist())
+    assert pa_arr_3d.equals(expected_3d)
+
+    # Test with different dtypes
+    np_arr_float = np.array([[1.5, 2.5], [3.5, 4.5]])
+    pa_arr_float = pa.array(np_arr_float)
+    expected_float = pa.array(np_arr_float.tolist())
+    assert pa_arr_float.equals(expected_float)
+
+    # Test with explicit type
+    np_arr_typed = np.array([[1, 2], [3, 4]], dtype=np.int32)
+    pa_arr_typed = pa.array(np_arr_typed, type=pa.list_(pa.int32()))
+    expected_typed = pa.array(np_arr_typed.tolist(), type=pa.list_(pa.int32()))
+    assert pa_arr_typed.equals(expected_typed)
+
+    # Test that mask is not supported for multidimensional arrays
+    with pytest.raises(NotImplementedError, match="mask is not supported"):
+        pa.array(np_arr_2d, mask=np.array([True, False]))
+
+    # Test that size is not supported for multidimensional arrays
+    with pytest.raises(NotImplementedError, match="size is not supported"):
+        pa.array(np_arr_2d, size=2)
+
+    # Test with transposed (non-contiguous) array
+    np_arr_transposed = np_arr_2d.T
+    assert not np_arr_transposed.flags['C_CONTIGUOUS']
+    pa_arr_transposed = pa.array(np_arr_transposed)
+    expected_transposed = pa.array(np_arr_transposed.tolist())
+    assert pa_arr_transposed.equals(expected_transposed)
+    # Result type must not depend on the input's memory layout
+    assert pa_arr_transposed.type == pa_arr_2d.type
+
+    # Test with sliced (non-contiguous) array
+    np_arr_sliced = np.arange(12).reshape(3, 4)[:, ::2]
+    assert not np_arr_sliced.flags['C_CONTIGUOUS']
+    pa_arr_sliced = pa.array(np_arr_sliced)
+    expected_sliced = pa.array(np_arr_sliced.tolist())
+    assert pa_arr_sliced.equals(expected_sliced)
+
+    # Test with Fortran-contiguous array
+    np_arr_fortran = np.asfortranarray(np.arange(6).reshape(2, 3))
+    assert not np_arr_fortran.flags['C_CONTIGUOUS']
+    assert np_arr_fortran.flags['F_CONTIGUOUS']
+    pa_arr_fortran = pa.array(np_arr_fortran)
+    expected_fortran = pa.array(np_arr_fortran.tolist())
+    assert pa_arr_fortran.equals(expected_fortran)
+
+    # Test with a zero-length inner dimension (C-contiguous path)
+    np_arr_empty = np.zeros((2, 0), dtype=np.int64)
+    expected_empty = pa.array([[], []], type=pa.list_(pa.int64()))
+    assert pa.array(np_arr_empty).equals(expected_empty)
+
+    # Verify that C-contiguous arrays use efficient path
+    # (result should be identical to tolist() result)
+    np_arr_contiguous = np.arange(24).reshape(2, 3, 4)
+    assert np_arr_contiguous.flags['C_CONTIGUOUS']
+    pa_arr_efficient = pa.array(np_arr_contiguous)
+    pa_arr_tolist = pa.array(np_arr_contiguous.tolist())
+    assert pa_arr_efficient.equals(pa_arr_tolist)
+
+
 @pytest.mark.numpy
 def test_array_from_different_numpy_datetime_units_raises():
     data = [