diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index c4446c5c7ed..c620b0eb512 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -276,6 +276,12 @@ G_BEGIN_DECLS * #GArrowJoinOptions is a class to customize the `binary_join_element_wise` * function. * + * #GArrowListFlattenOptions is a class to customize the `list_flatten` + * function. + * + * #GArrowMapLookupOptions is a class to customize the `map_lookup` + * function. + * * There are many functions to compute data on an array. */ @@ -7400,6 +7406,264 @@ garrow_join_options_new(void) return GARROW_JOIN_OPTIONS(options); } +enum { + PROP_LIST_FLATTEN_OPTIONS_RECURSIVE = 1, +}; + +G_DEFINE_TYPE(GArrowListFlattenOptions, + garrow_list_flatten_options, + GARROW_TYPE_FUNCTION_OPTIONS) + +static void +garrow_list_flatten_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto options = garrow_list_flatten_options_get_raw(GARROW_LIST_FLATTEN_OPTIONS(object)); + + switch (prop_id) { + case PROP_LIST_FLATTEN_OPTIONS_RECURSIVE: + options->recursive = g_value_get_boolean(value); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_list_flatten_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto options = garrow_list_flatten_options_get_raw(GARROW_LIST_FLATTEN_OPTIONS(object)); + + switch (prop_id) { + case PROP_LIST_FLATTEN_OPTIONS_RECURSIVE: + g_value_set_boolean(value, options->recursive); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_list_flatten_options_init(GArrowListFlattenOptions *object) +{ + auto priv = GARROW_FUNCTION_OPTIONS_GET_PRIVATE(object); + priv->options = static_cast<arrow::compute::FunctionOptions *>( + new arrow::compute::ListFlattenOptions()); +} + +static void +garrow_list_flatten_options_class_init(GArrowListFlattenOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->set_property = garrow_list_flatten_options_set_property; + gobject_class->get_property = garrow_list_flatten_options_get_property; + + arrow::compute::ListFlattenOptions options; + + GParamSpec *spec; + /** + * GArrowListFlattenOptions:recursive: + * + * If true, the list is flattened recursively until a non-list array is formed. + * + * Since: 23.0.0 + */ + spec = g_param_spec_boolean( + "recursive", + "Recursive", + "If true, the list is flattened recursively until a non-list array is formed", + options.recursive, + static_cast<GParamFlags>(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, + PROP_LIST_FLATTEN_OPTIONS_RECURSIVE, + spec); +} + +/** + * garrow_list_flatten_options_new: + * + * Returns: A newly created #GArrowListFlattenOptions. + * + * Since: 23.0.0 + */ +GArrowListFlattenOptions * +garrow_list_flatten_options_new(void) +{ + auto options = g_object_new(GARROW_TYPE_LIST_FLATTEN_OPTIONS, NULL); + return GARROW_LIST_FLATTEN_OPTIONS(options); +}
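For context, the new `GArrowListFlattenOptions` binding is a thin wrapper over `arrow::compute::ListFlattenOptions`. A minimal sketch of the C++ side being wrapped, assuming `nested_list` is a list-of-list array built elsewhere:

```cpp
#include <arrow/api.h>
#include <arrow/compute/api.h>

// Sketch only: calls the same "list_flatten" compute function that the
// GLib binding above exposes. `nested_list` is assumed to be a
// list<list<int8>> array created elsewhere.
arrow::Result<arrow::Datum> FlattenAllTheWay(
    const std::shared_ptr<arrow::Array>& nested_list) {
  arrow::compute::ListFlattenOptions options;
  options.recursive = true;  // flatten until a non-list array is formed
  return arrow::compute::CallFunction("list_flatten", {nested_list}, &options);
}
```

With `recursive` left at its default (`false`), a single call removes only one level of nesting.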
+ +typedef struct GArrowMapLookupOptionsPrivate_ +{ + GArrowScalar *query_key; +} GArrowMapLookupOptionsPrivate; + +enum { + PROP_MAP_LOOKUP_OPTIONS_QUERY_KEY = 1, + PROP_MAP_LOOKUP_OPTIONS_OCCURRENCE, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowMapLookupOptions, + garrow_map_lookup_options, + GARROW_TYPE_FUNCTION_OPTIONS) + +#define GARROW_MAP_LOOKUP_OPTIONS_GET_PRIVATE(object) \ + static_cast<GArrowMapLookupOptionsPrivate *>( \ + garrow_map_lookup_options_get_instance_private(GARROW_MAP_LOOKUP_OPTIONS(object))) + +static void +garrow_map_lookup_options_dispose(GObject *object) +{ + auto priv = GARROW_MAP_LOOKUP_OPTIONS_GET_PRIVATE(object); + + if (priv->query_key) { + g_object_unref(priv->query_key); + priv->query_key = NULL; + } + + G_OBJECT_CLASS(garrow_map_lookup_options_parent_class)->dispose(object); +} + +static void +garrow_map_lookup_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_MAP_LOOKUP_OPTIONS_GET_PRIVATE(object); + auto options = garrow_map_lookup_options_get_raw(GARROW_MAP_LOOKUP_OPTIONS(object)); + + switch (prop_id) { + case PROP_MAP_LOOKUP_OPTIONS_QUERY_KEY: + { + auto query_key = g_value_get_object(value); + if (priv->query_key != query_key) { + if (priv->query_key) { + g_object_unref(priv->query_key); + } + priv->query_key = GARROW_SCALAR(query_key); + if (priv->query_key) { + g_object_ref(priv->query_key); + options->query_key = garrow_scalar_get_raw(priv->query_key); + } else { + options->query_key = nullptr; + } + } + } + break; + case PROP_MAP_LOOKUP_OPTIONS_OCCURRENCE: + options->occurrence = + static_cast<arrow::compute::MapLookupOptions::Occurrence>(g_value_get_enum(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_map_lookup_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_MAP_LOOKUP_OPTIONS_GET_PRIVATE(object); + auto options = garrow_map_lookup_options_get_raw(GARROW_MAP_LOOKUP_OPTIONS(object)); + + switch (prop_id) { + case PROP_MAP_LOOKUP_OPTIONS_QUERY_KEY: + g_value_set_object(value, priv->query_key); + break; + case PROP_MAP_LOOKUP_OPTIONS_OCCURRENCE: + g_value_set_enum(value, static_cast<GArrowMapLookupOccurrence>(options->occurrence)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_map_lookup_options_init(GArrowMapLookupOptions *object) +{ + auto priv = GARROW_FUNCTION_OPTIONS_GET_PRIVATE(object); + priv->options = static_cast<arrow::compute::FunctionOptions *>( + new arrow::compute::MapLookupOptions()); +} + +static void +garrow_map_lookup_options_class_init(GArrowMapLookupOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = garrow_map_lookup_options_dispose; + gobject_class->set_property = garrow_map_lookup_options_set_property; + gobject_class->get_property = garrow_map_lookup_options_get_property; + + arrow::compute::MapLookupOptions options; + + GParamSpec *spec; + /** + * GArrowMapLookupOptions:query-key: + * + * The key to lookup in the map. + * + * Since: 23.0.0 + */ + spec = g_param_spec_object("query-key", + "Query key", + "The key to lookup in the map", + GARROW_TYPE_SCALAR, + static_cast<GParamFlags>(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_MAP_LOOKUP_OPTIONS_QUERY_KEY, spec); + + /** + * GArrowMapLookupOptions:occurrence: + * + * Whether to return the first, last, or all matching values. + * + * Since: 23.0.0 + */ + spec = g_param_spec_enum("occurrence", + "Occurrence", + "Whether to return the first, last, or all matching values", + GARROW_TYPE_MAP_LOOKUP_OCCURRENCE, + static_cast<GArrowMapLookupOccurrence>(options.occurrence), + static_cast<GParamFlags>(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, + PROP_MAP_LOOKUP_OPTIONS_OCCURRENCE, + spec); +} + +/** + * garrow_map_lookup_options_new: + * @query_key: (nullable): A #GArrowScalar to be looked up. + * @occurrence: A #GArrowMapLookupOccurrence. + * + * Returns: A newly created #GArrowMapLookupOptions. + * + * Since: 23.0.0 + */ +GArrowMapLookupOptions * +garrow_map_lookup_options_new(GArrowScalar *query_key, + GArrowMapLookupOccurrence occurrence) +{ + return GARROW_MAP_LOOKUP_OPTIONS(g_object_new(GARROW_TYPE_MAP_LOOKUP_OPTIONS, + "query-key", + query_key, + "occurrence", + occurrence, + NULL)); +}
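Similarly, `GArrowMapLookupOptions` mirrors `arrow::compute::MapLookupOptions` on the C++ side. A minimal sketch; the `map<int32, utf8>` input array and the helper name are assumptions for illustration:

```cpp
#include <arrow/api.h>
#include <arrow/compute/api.h>

// Sketch only: looks up key 1 in each map entry and keeps the first match,
// mirroring garrow_map_lookup_options_new(key, GARROW_MAP_LOOKUP_OCCURRENCE_FIRST).
// `map_array` is assumed to be a map<int32, utf8> array created elsewhere.
arrow::Result<arrow::Datum> LookupFirstMatch(
    const std::shared_ptr<arrow::Array>& map_array) {
  arrow::compute::MapLookupOptions options(
      arrow::MakeScalar(int32_t{1}),             // key to search for
      arrow::compute::MapLookupOptions::FIRST);  // or LAST / ALL
  return arrow::compute::CallFunction("map_lookup", {map_array}, &options);
}
```

With `FIRST`/`LAST` the result holds one value per map entry, while `ALL` yields a list of every matching value, as the Ruby tests below demonstrate.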
+ G_END_DECLS arrow::Result @@ -7574,6 +7838,16 @@ garrow_function_options_new_raw(const arrow::compute::FunctionOptions *arrow_opt static_cast<const arrow::compute::JoinOptions *>(arrow_options); auto options = garrow_join_options_new_raw(arrow_join_options); return GARROW_FUNCTION_OPTIONS(options); + } else if (arrow_type_name == "ListFlattenOptions") { + const auto arrow_list_flatten_options = + static_cast<const arrow::compute::ListFlattenOptions *>(arrow_options); + auto options = garrow_list_flatten_options_new_raw(arrow_list_flatten_options); + return GARROW_FUNCTION_OPTIONS(options); + } else if (arrow_type_name == "MapLookupOptions") { + const auto arrow_map_lookup_options = + static_cast<const arrow::compute::MapLookupOptions *>(arrow_options); + auto options = garrow_map_lookup_options_new_raw(arrow_map_lookup_options); + return GARROW_FUNCTION_OPTIONS(options); } else { auto options = g_object_new(GARROW_TYPE_FUNCTION_OPTIONS, NULL); return GARROW_FUNCTION_OPTIONS(options); @@ -8250,3 +8524,45 @@ garrow_join_options_get_raw(GArrowJoinOptions *options) return static_cast<arrow::compute::JoinOptions *>( garrow_function_options_get_raw(GARROW_FUNCTION_OPTIONS(options))); } + +GArrowListFlattenOptions * +garrow_list_flatten_options_new_raw( + const arrow::compute::ListFlattenOptions *arrow_options) +{ + return GARROW_LIST_FLATTEN_OPTIONS(g_object_new(GARROW_TYPE_LIST_FLATTEN_OPTIONS, + "recursive", + arrow_options->recursive, + NULL)); +} + +arrow::compute::ListFlattenOptions * +garrow_list_flatten_options_get_raw(GArrowListFlattenOptions *options) +{ + return static_cast<arrow::compute::ListFlattenOptions *>( + garrow_function_options_get_raw(GARROW_FUNCTION_OPTIONS(options))); +} + +GArrowMapLookupOptions * +garrow_map_lookup_options_new_raw(const arrow::compute::MapLookupOptions *arrow_options) +{ + GArrowScalar *query_key = nullptr; + if (arrow_options->query_key) { + auto arrow_query_key = arrow_options->query_key; + query_key = garrow_scalar_new_raw(&arrow_query_key); + } + GArrowMapLookupOccurrence occurrence = + static_cast<GArrowMapLookupOccurrence>(arrow_options->occurrence); + return GARROW_MAP_LOOKUP_OPTIONS(g_object_new(GARROW_TYPE_MAP_LOOKUP_OPTIONS, + "query-key", + query_key, + "occurrence", + occurrence, + NULL)); +} + +arrow::compute::MapLookupOptions * +garrow_map_lookup_options_get_raw(GArrowMapLookupOptions *options) +{ + return static_cast<arrow::compute::MapLookupOptions *>( + garrow_function_options_get_raw(GARROW_FUNCTION_OPTIONS(options))); +} diff --git a/c_glib/arrow-glib/compute.h 
b/c_glib/arrow-glib/compute.h index 25b12e76c23..c5166a663ec 100644 --- a/c_glib/arrow-glib/compute.h +++ b/c_glib/arrow-glib/compute.h @@ -1322,4 +1322,54 @@ GARROW_AVAILABLE_IN_23_0 GArrowJoinOptions * garrow_join_options_new(void); +#define GARROW_TYPE_LIST_FLATTEN_OPTIONS (garrow_list_flatten_options_get_type()) +GARROW_AVAILABLE_IN_23_0 +G_DECLARE_DERIVABLE_TYPE(GArrowListFlattenOptions, + garrow_list_flatten_options, + GARROW, + LIST_FLATTEN_OPTIONS, + GArrowFunctionOptions) +struct _GArrowListFlattenOptionsClass +{ + GArrowFunctionOptionsClass parent_class; +}; + +GARROW_AVAILABLE_IN_23_0 +GArrowListFlattenOptions * +garrow_list_flatten_options_new(void); + +/** + * GArrowMapLookupOccurrence: + * @GARROW_MAP_LOOKUP_OCCURRENCE_FIRST: Return the first matching value. + * @GARROW_MAP_LOOKUP_OCCURRENCE_LAST: Return the last matching value. + * @GARROW_MAP_LOOKUP_OCCURRENCE_ALL: Return all matching values. + * + * They correspond to the values of + * `arrow::compute::MapLookupOptions::Occurrence`. + * + * Since: 23.0.0 + */ +typedef enum { + GARROW_MAP_LOOKUP_OCCURRENCE_FIRST, + GARROW_MAP_LOOKUP_OCCURRENCE_LAST, + GARROW_MAP_LOOKUP_OCCURRENCE_ALL, +} GArrowMapLookupOccurrence; + +#define GARROW_TYPE_MAP_LOOKUP_OPTIONS (garrow_map_lookup_options_get_type()) +GARROW_AVAILABLE_IN_23_0 +G_DECLARE_DERIVABLE_TYPE(GArrowMapLookupOptions, + garrow_map_lookup_options, + GARROW, + MAP_LOOKUP_OPTIONS, + GArrowFunctionOptions) +struct _GArrowMapLookupOptionsClass +{ + GArrowFunctionOptionsClass parent_class; +}; + +GARROW_AVAILABLE_IN_23_0 +GArrowMapLookupOptions * +garrow_map_lookup_options_new(GArrowScalar *query_key, + GArrowMapLookupOccurrence occurrence); + G_END_DECLS diff --git a/c_glib/arrow-glib/compute.hpp b/c_glib/arrow-glib/compute.hpp index 01a6d54f640..fb5907a17ba 100644 --- a/c_glib/arrow-glib/compute.hpp +++ b/c_glib/arrow-glib/compute.hpp @@ -220,3 +220,14 @@ GArrowJoinOptions * garrow_join_options_new_raw(const arrow::compute::JoinOptions *arrow_options); arrow::compute::JoinOptions * garrow_join_options_get_raw(GArrowJoinOptions *options); + +GArrowListFlattenOptions * +garrow_list_flatten_options_new_raw( + const arrow::compute::ListFlattenOptions *arrow_options); +arrow::compute::ListFlattenOptions * +garrow_list_flatten_options_get_raw(GArrowListFlattenOptions *options); + +GArrowMapLookupOptions * +garrow_map_lookup_options_new_raw(const arrow::compute::MapLookupOptions *arrow_options); +arrow::compute::MapLookupOptions * +garrow_map_lookup_options_get_raw(GArrowMapLookupOptions *options); diff --git a/c_glib/test/test-list-flatten-options.rb b/c_glib/test/test-list-flatten-options.rb new file mode 100644 index 00000000000..3f21c2e9e54 --- /dev/null +++ b/c_glib/test/test-list-flatten-options.rb @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestListFlattenOptions < Test::Unit::TestCase + include Helper::Buildable + + def setup + @options = Arrow::ListFlattenOptions.new + end + + def test_recursive_property + assert do + !@options.recursive? + end + @options.recursive = true + assert do + @options.recursive? + end + end + + def test_list_flatten_function_recursive + list_data_type = Arrow::ListDataType.new(Arrow::Field.new("value", Arrow::Int8DataType.new)) + nested_list = build_list_array(list_data_type, [[[1, 2], [3]], [[4, 5]]]) + + args = [ + Arrow::ArrayDatum.new(nested_list), + ] + list_flatten_function = Arrow::Function.find("list_flatten") + + @options.recursive = false + result = list_flatten_function.execute(args, @options).value + assert_equal(build_list_array(Arrow::Int8DataType.new, [[1, 2], [3], [4, 5]]), + result) + + @options.recursive = true + result = list_flatten_function.execute(args, @options).value + assert_equal(build_int8_array([1, 2, 3, 4, 5]), + result) + end +end diff --git a/c_glib/test/test-map-lookup-options.rb b/c_glib/test/test-map-lookup-options.rb new file mode 100644 index 00000000000..3bb76ca6e26 --- /dev/null +++ b/c_glib/test/test-map-lookup-options.rb @@ -0,0 +1,73 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestMapLookupOptions < Test::Unit::TestCase + include Helper::Buildable + + def setup + @query_key = Arrow::Int32Scalar.new(1) + @options = Arrow::MapLookupOptions.new(@query_key, + Arrow::MapLookupOccurrence::FIRST) + end + + def test_query_key_property + assert_equal(@query_key, @options.query_key) + new_query_key = Arrow::Int32Scalar.new(2) + @options.query_key = new_query_key + assert_equal(new_query_key, @options.query_key) + end + + def test_occurrence_property + assert_equal(Arrow::MapLookupOccurrence::FIRST, @options.occurrence) + @options.occurrence = :last + assert_equal(Arrow::MapLookupOccurrence::LAST, @options.occurrence) + @options.occurrence = :all + assert_equal(Arrow::MapLookupOccurrence::ALL, @options.occurrence) + @options.occurrence = :first + assert_equal(Arrow::MapLookupOccurrence::FIRST, @options.occurrence) + end + + def test_map_lookup_function + map_array = build_map_array(Arrow::Int32DataType.new, + Arrow::StringDataType.new, + [[ + [1, "first_one"], + [2, "two"], + [1, nil], + [3, "three"], + [1, "second_one"], + [1, "last_one"], + ]]) + args = [Arrow::ArrayDatum.new(map_array)] + map_lookup_function = Arrow::Function.find("map_lookup") + @options.query_key = Arrow::Int32Scalar.new(1) + + @options.occurrence = :first + result = map_lookup_function.execute(args, @options).value + assert_equal(build_string_array(["first_one"]), result) + + @options.occurrence = :last + result = map_lookup_function.execute(args, @options).value + assert_equal(build_string_array(["last_one"]), result) + + @options.occurrence = :all + result = map_lookup_function.execute(args, @options).value + assert_equal(build_list_array(Arrow::StringDataType.new, + [["first_one", nil, "second_one", "last_one"]]), + result) + end +end diff --git a/ci/docker/python-free-threaded-wheel-musllinux-test-imports.dockerfile b/ci/docker/python-free-threaded-wheel-musllinux-test-imports.dockerfile index 3168d54f2ed..e2e4eb8f991 100644 --- a/ci/docker/python-free-threaded-wheel-musllinux-test-imports.dockerfile +++ b/ci/docker/python-free-threaded-wheel-musllinux-test-imports.dockerfile @@ -19,38 +19,32 @@ ARG base FROM ${base} ARG python_version=3.13 +ARG arch=aarch64 +ARG build_date -RUN apk add --no-cache \ +RUN apk update && \ + apk add --no-cache \ bash \ - build-base \ - bzip2-dev \ - g++ \ + curl \ git \ - libffi-dev \ - libnsl-dev \ - libtirpc-dev \ - linux-headers \ - ncurses-dev \ - openssl-dev \ - pkgconf \ + tar \ tzdata \ - zlib-dev + zstd -# Install Python without GIL +# Install Python with free-threading from python-build-standalone +# See available releases at: https://github.com/astral-sh/python-build-standalone/releases RUN set -e; \ case "${python_version}" in \ 3.13) python_patch_version="3.13.9";; \ 3.14) python_patch_version="3.14.0";; \ esac && \ - wget https://github.com/python/cpython/archive/refs/tags/v${python_patch_version}.tar.gz && \ - tar -xzf v${python_patch_version}.tar.gz && \ - rm v${python_patch_version}.tar.gz && \ - cd cpython-${python_patch_version}/ && \ - ./configure --disable-gil --with-ensurepip && \ - make -j && \ - make install && \ - cd ../ && \ - rm -rf cpython-${python_patch_version}/ + curl -L -o python.tar.zst \ + https://github.com/astral-sh/python-build-standalone/releases/download/${build_date}/cpython-${python_patch_version}+${build_date}-${arch}-unknown-linux-musl-freethreaded+lto-full.tar.zst && \ + mkdir -p /opt/python && \ + tar -xf python.tar.zst -C /opt/python --strip-components=1 && \ + rm python.tar.zst + +ENV 
PATH="/opt/python/install/bin:${PATH}" ENV ARROW_PYTHON_VENV /arrow-dev RUN python${python_version}t -m venv ${ARROW_PYTHON_VENV} diff --git a/ci/docker/python-free-threaded-wheel-musllinux-test-unittests.dockerfile b/ci/docker/python-free-threaded-wheel-musllinux-test-unittests.dockerfile index c6873612b86..53ae58c7933 100644 --- a/ci/docker/python-free-threaded-wheel-musllinux-test-unittests.dockerfile +++ b/ci/docker/python-free-threaded-wheel-musllinux-test-unittests.dockerfile @@ -19,38 +19,34 @@ ARG base FROM ${base} ARG python_version=3.13 +ARG arch=aarch64 +ARG build_date -RUN apk add --no-cache \ +RUN apk update && \ + apk add --no-cache \ bash \ build-base \ - bzip2-dev \ + curl \ g++ \ git \ - libffi-dev \ - libnsl-dev \ - libtirpc-dev \ - linux-headers \ - ncurses-dev \ - openssl-dev \ - pkgconf \ + tar \ tzdata \ - zlib-dev + zstd -# Install Python without GIL +# Install Python with free-threading from python-build-standalone +# See available releases at: https://github.com/astral-sh/python-build-standalone/releases RUN set -e; \ case "${python_version}" in \ 3.13) python_patch_version="3.13.9";; \ 3.14) python_patch_version="3.14.0";; \ esac && \ - wget https://github.com/python/cpython/archive/refs/tags/v${python_patch_version}.tar.gz && \ - tar -xzf v${python_patch_version}.tar.gz && \ - rm v${python_patch_version}.tar.gz && \ - cd cpython-${python_patch_version}/ && \ - ./configure --disable-gil --with-ensurepip && \ - make -j && \ - make install && \ - cd ../ && \ - rm -rf cpython-${python_patch_version}/ + curl -L -o python.tar.zst \ + https://github.com/astral-sh/python-build-standalone/releases/download/${build_date}/cpython-${python_patch_version}+${build_date}-${arch}-unknown-linux-musl-freethreaded+lto-full.tar.zst && \ + mkdir -p /opt/python && \ + tar -xf python.tar.zst -C /opt/python --strip-components=1 && \ + rm python.tar.zst + +ENV PATH="/opt/python/install/bin:${PATH}" ENV ARROW_PYTHON_VENV /arrow-dev RUN python${python_version}t -m venv ${ARROW_PYTHON_VENV} diff --git a/ci/docker/python-free-threaded-wheel-windows-vs2022.dockerfile b/ci/docker/python-free-threaded-wheel-windows-vs2022.dockerfile deleted file mode 100644 index 77a64fd5c24..00000000000 --- a/ci/docker/python-free-threaded-wheel-windows-vs2022.dockerfile +++ /dev/null @@ -1,53 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# NOTE: You must update PYTHON_WHEEL_WINDOWS_IMAGE_REVISION in .env -# when you update this file. 
- -ARG base -# https://github.com/hadolint/hadolint/wiki/DL3006 -# (Hadolint does not expand variables and thinks '${base}' is an untagged image) -# hadolint ignore=DL3006 -FROM ${base} - -ARG python=3.13 - -RUN (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.1") & \ - (if "%python%"=="3.14" setx PYTHON_VERSION "3.14.0") - -SHELL ["powershell", "-NoProfile", "-Command", "$ErrorActionPreference = 'Stop'; $ProgressPreference = 'SilentlyContinue';"] -RUN $version = $env:PYTHON_VERSION; \ - $filename = 'python-' + $version + '-amd64.exe'; \ - $url = 'https://www.python.org/ftp/python/' + $version + '/' + $filename; \ - Invoke-WebRequest -Uri $url -OutFile $filename; \ - Start-Process -FilePath $filename -ArgumentList '/quiet', 'Include_freethreaded=1' -Wait - -ENV PYTHON_CMD="py -${python}t" - -SHELL ["cmd", "/S", "/C"] -RUN %PYTHON_CMD% -m pip install -U pip setuptools - -COPY python/requirements-wheel-build.txt C:/arrow/python/ -# Cython wheels for 3.13 free-threaded are not released yet -RUN %PYTHON_CMD% -m pip install \ - --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple \ - --pre \ - --prefer-binary \ - cython -RUN %PYTHON_CMD% -m pip install -r C:/arrow/python/requirements-wheel-build.txt - -ENV PYTHON="${python}t" diff --git a/ci/docker/python-wheel-windows-test-vs2022.dockerfile b/ci/docker/python-wheel-windows-test-vs2022.dockerfile index e5d1c517587..73a9e167fea 100644 --- a/ci/docker/python-wheel-windows-test-vs2022.dockerfile +++ b/ci/docker/python-wheel-windows-test-vs2022.dockerfile @@ -42,4 +42,4 @@ RUN %PYTHON_CMD% -m pip install -U pip setuptools COPY python/requirements-wheel-test.txt C:/arrow/python/ RUN %PYTHON_CMD% -m pip install -r C:/arrow/python/requirements-wheel-test.txt -ENV PYTHON=$python +ENV PYTHON=${python} diff --git a/ci/docker/python-wheel-windows-vs2022-base.dockerfile b/ci/docker/python-wheel-windows-vs2022-base.dockerfile index 99dd27b987a..e63b8fc9945 100644 --- a/ci/docker/python-wheel-windows-vs2022-base.dockerfile +++ b/ci/docker/python-wheel-windows-vs2022-base.dockerfile @@ -74,13 +74,25 @@ RUN ` # Install choco CLI # -# Switch into Powershell just for this command because choco only provides a -# Powershell installation script. After, we switch back to cmd. +# Switch into Powershell just for the following commands because choco and the Python install manager (MSIX) +# only provide Powershell installation scripts. After, we switch back to cmd. 
# # See https://chocolatey.org/install#completely-offline-install SHELL ["powershell", "-Command", "$ErrorActionPreference = 'Stop'; $ProgressPreference = 'SilentlyContinue';"] RUN ` - Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1')) + Set-ExecutionPolicy Bypass -Scope Process -Force; ` + [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; ` + iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1')) + +# Install the Python install manager +# +# See https://docs.python.org/dev/using/windows.html#python-install-manager and +# https://www.python.org/ftp/python/pymanager/ +RUN ` + $pymanager_url = 'https://www.python.org/ftp/python/pymanager/python-manager-25.0.msix'; ` + Invoke-WebRequest -Uri $pymanager_url -OutFile 'C:\Windows\pymanager.msix'; ` + Add-AppxPackage C:\Windows\pymanager.msix + SHELL ["cmd", "/S", "/C"] # Install CMake and other tools diff --git a/ci/docker/python-wheel-windows-vs2022.dockerfile b/ci/docker/python-wheel-windows-vs2022.dockerfile index 48d3937a0ed..d4d5e57cd2c 100644 --- a/ci/docker/python-wheel-windows-vs2022.dockerfile +++ b/ci/docker/python-wheel-windows-vs2022.dockerfile @@ -23,16 +23,15 @@ FROM ${base} # Define the full version number otherwise choco falls back to patch number 0 (3.10 => 3.10.0) ARG python=3.10 -RUN (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11" && setx PYTHON_CMD "py -3.10") & \ - (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.9" && setx PYTHON_CMD "py -3.11") & \ - (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.10" && setx PYTHON_CMD "py -3.12") & \ - (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.9" && setx PYTHON_CMD "py -3.13") & \ - (if "%python%"=="3.14" setx PYTHON_VERSION "3.14.0" && setx PYTHON_CMD "py -3.14") -RUN choco install -r -y --pre --no-progress python --version=%PYTHON_VERSION% -RUN %PYTHON_CMD% -m pip install -U pip setuptools +ARG python_variant=default +ENV PYTHON_VERSION=${python} +ENV PYTHON_VARIANT=${python_variant} +RUN pymanager install --version %PYTHON_VERSION% --variant %PYTHON_VARIANT% + +RUN py -%PYTHON_VERSION% -m pip install -U pip setuptools COPY python/requirements-wheel-build.txt C:/arrow/python/ -RUN %PYTHON_CMD% -m pip install -r C:/arrow/python/requirements-wheel-build.txt +RUN py -%PYTHON_VERSION% -m pip install -r C:/arrow/python/requirements-wheel-build.txt ENV PYTHON=${python} diff --git a/compose.yaml b/compose.yaml index fcdcbe289c1..84481e1af76 100644 --- a/compose.yaml +++ b/compose.yaml @@ -1228,6 +1228,8 @@ services: args: base: "${ARCH}/alpine:${ALPINE_LINUX}" python_version: ${PYTHON} + build_date: "20251014" # python-build-standalone release date + arch: ${ARCH_ALIAS} context: . dockerfile: ci/docker/python-free-threaded-wheel-musllinux-test-imports.dockerfile cache_from: @@ -1270,6 +1272,8 @@ services: args: base: "${ARCH}/alpine:${ALPINE_LINUX}" python_version: ${PYTHON} + build_date: "20251014" # python-build-standalone release date + arch: ${ARCH_ALIAS} context: . dockerfile: ci/docker/python-free-threaded-wheel-musllinux-test-unittests.dockerfile cache_from: @@ -1384,6 +1388,7 @@ services: args: base: ${REPO}:python-wheel-windows-vs2022-base-vcpkg-${VCPKG}-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} python: ${PYTHON} + python_variant: default context: . 
dockerfile: ci/docker/python-wheel-windows-vs2022.dockerfile # This should make the pushed images reusable, but the image gets rebuilt. @@ -1400,8 +1405,9 @@ services: args: base: ${REPO}:python-wheel-windows-vs2022-base-vcpkg-${VCPKG}-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} python: ${PYTHON} + python_variant: freethreaded context: . - dockerfile: ci/docker/python-free-threaded-wheel-windows-vs2022.dockerfile + dockerfile: ci/docker/python-wheel-windows-vs2022.dockerfile # This should make the pushed images reusable, but the image gets rebuilt. # Uncomment if no local cache is available. # cache_from: diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index c92daf32690..f4ff0bded3d 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -139,7 +139,7 @@ if(NOT DEFINED CMAKE_C_STANDARD) set(CMAKE_C_STANDARD 11) endif() -# This ensures that things like c++17 get passed correctly +# This ensures that a standard higher than the minimum can be passed correctly if(NOT DEFINED CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 20) elseif(${CMAKE_CXX_STANDARD} VERSION_LESS 20) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index aa5c426aff3..8a26b46d0bf 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -1883,6 +1883,7 @@ function(build_protobuf) PARENT_SCOPE) fetchcontent_declare(protobuf + ${FC_DECLARE_COMMON_OPTIONS} OVERRIDE_FIND_PACKAGE URL ${PROTOBUF_SOURCE_URL} URL_HASH "SHA256=${ARROW_PROTOBUF_BUILD_SHA256_CHECKSUM}" SOURCE_SUBDIR cmake) @@ -3120,10 +3121,13 @@ function(build_grpc) # Add warning suppression flags for gRPC build. if(NOT MSVC) - string(APPEND CMAKE_C_FLAGS - " -Wno-attributes -Wno-format-security -Wno-unknown-warning-option") - string(APPEND CMAKE_CXX_FLAGS - " -Wno-attributes -Wno-format-security -Wno-unknown-warning-option") + string(APPEND CMAKE_C_FLAGS " -Wno-attributes -Wno-format-security") + string(APPEND CMAKE_CXX_FLAGS " -Wno-attributes -Wno-format-security") + endif() + if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL + "Clang") + string(APPEND CMAKE_C_FLAGS " -Wno-unknown-warning-option") + string(APPEND CMAKE_CXX_FLAGS " -Wno-unknown-warning-option") endif() fetchcontent_makeavailable(grpc) @@ -3323,11 +3327,6 @@ function(build_nlohmann_json) set(NLOHMANN_JSON_VENDORED TRUE PARENT_SCOPE) - set(NLOHMANN_JSON_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/nlohmann_json_fc-install") - set(NLOHMANN_JSON_PREFIX - "${NLOHMANN_JSON_PREFIX}" - PARENT_SCOPE) - fetchcontent_declare(nlohmann_json ${FC_DECLARE_COMMON_OPTIONS} OVERRIDE_FIND_PACKAGE URL ${NLOHMANN_JSON_SOURCE_URL} @@ -3341,49 +3340,13 @@ function(build_nlohmann_json) set(JSON_Install ON) fetchcontent_makeavailable(nlohmann_json) - # opentelemetry requires nlohmann_json to be installed to a known location. - # We have to do this in two steps to avoid double installation of nlohmann_json - # when Arrow is installed. - # This custom target ensures nlohmann_json is built before we install. - add_custom_target(nlohmann_json_built DEPENDS nlohmann_json::nlohmann_json) - - # Disable nlohmann_json's install script after it's built to prevent double installation. 
- add_custom_command(OUTPUT "${nlohmann_json_BINARY_DIR}/cmake_install.cmake.saved" - COMMAND ${CMAKE_COMMAND} -E copy_if_different - "${nlohmann_json_BINARY_DIR}/cmake_install.cmake" - "${nlohmann_json_BINARY_DIR}/cmake_install.cmake.saved" - COMMAND ${CMAKE_COMMAND} -E echo - "# nlohmann-json install disabled to prevent double installation with Arrow" - > "${nlohmann_json_BINARY_DIR}/cmake_install.cmake" - DEPENDS nlohmann_json_built - COMMENT "Disabling nlohmann-json install to prevent double installation" - VERBATIM) - - add_custom_target(nlohmann_json_install_disabled ALL - DEPENDS "${nlohmann_json_BINARY_DIR}/cmake_install.cmake.saved") - - # Install nlohmann_json to NLOHMANN_JSON_PREFIX for opentelemetry to find. - add_custom_command(OUTPUT "${NLOHMANN_JSON_PREFIX}/.nlohmann_json_installed" - COMMAND ${CMAKE_COMMAND} -E copy_if_different - "${nlohmann_json_BINARY_DIR}/cmake_install.cmake.saved" - "${nlohmann_json_BINARY_DIR}/cmake_install.cmake.tmp" - COMMAND ${CMAKE_COMMAND} - -DCMAKE_INSTALL_PREFIX=${NLOHMANN_JSON_PREFIX} - -DCMAKE_INSTALL_CONFIG_NAME=$<CONFIG> -P - "${nlohmann_json_BINARY_DIR}/cmake_install.cmake.tmp" || - ${CMAKE_COMMAND} -E true - COMMAND ${CMAKE_COMMAND} -E touch - "${NLOHMANN_JSON_PREFIX}/.nlohmann_json_installed" - DEPENDS nlohmann_json_install_disabled - COMMENT "Installing nlohmann-json to ${NLOHMANN_JSON_PREFIX} for google-cloud-cpp" - VERBATIM) - - # Make nlohmann_json_fc depend on the install completion marker. - add_custom_target(nlohmann_json_fc - DEPENDS "${NLOHMANN_JSON_PREFIX}/.nlohmann_json_installed") + if(CMAKE_VERSION VERSION_LESS 3.28) + set_property(DIRECTORY ${nlohmann_json_SOURCE_DIR} PROPERTY EXCLUDE_FROM_ALL TRUE) + endif() list(POP_BACK CMAKE_MESSAGE_INDENT) endfunction() + if(ARROW_WITH_NLOHMANN_JSON) resolve_dependency(nlohmann_json) get_target_property(nlohmann_json_INCLUDE_DIR nlohmann_json::nlohmann_json @@ -3726,220 +3689,87 @@ endif() # ---------------------------------------------------------------------- # OpenTelemetry C++ -macro(build_opentelemetry) - message(STATUS "Building OpenTelemetry from source") +function(build_opentelemetry) + list(APPEND CMAKE_MESSAGE_INDENT "OpenTelemetry: ") + message(STATUS "Building OpenTelemetry from source using FetchContent") + if(Protobuf_VERSION VERSION_GREATER_EQUAL 3.22) message(FATAL_ERROR "GH-36013: Can't use bundled OpenTelemetry with Protobuf 3.22 or later. 
" "Protobuf is version ${Protobuf_VERSION}") endif() - set(OPENTELEMETRY_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/opentelemetry_ep-install") - set(OPENTELEMETRY_INCLUDE_DIR "${OPENTELEMETRY_PREFIX}/include") - set(OPENTELEMETRY_STATIC_LIB - "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry${CMAKE_STATIC_LIBRARY_SUFFIX}" + set(OPENTELEMETRY_VENDORED + TRUE + PARENT_SCOPE) + + fetchcontent_declare(opentelemetry_proto + ${FC_DECLARE_COMMON_OPTIONS} + URL ${OPENTELEMETRY_PROTO_SOURCE_URL} + URL_HASH "SHA256=${ARROW_OPENTELEMETRY_PROTO_BUILD_SHA256_CHECKSUM}" ) - set(_OPENTELEMETRY_APIS api ext sdk) - set(_OPENTELEMETRY_LIBS - common - http_client_curl - logs - ostream_log_record_exporter - ostream_span_exporter - otlp_http_client - otlp_http_log_record_exporter - otlp_http_exporter - otlp_recordable - proto - resources - trace - version) - set(OPENTELEMETRY_BUILD_BYPRODUCTS) - set(OPENTELEMETRY_LIBRARIES) - - foreach(_OPENTELEMETRY_LIB ${_OPENTELEMETRY_APIS}) - add_library(opentelemetry-cpp::${_OPENTELEMETRY_LIB} INTERFACE IMPORTED) - target_include_directories(opentelemetry-cpp::${_OPENTELEMETRY_LIB} BEFORE - INTERFACE "${OPENTELEMETRY_INCLUDE_DIR}") - endforeach() - foreach(_OPENTELEMETRY_LIB ${_OPENTELEMETRY_LIBS}) - # N.B. OTel targets and libraries don't follow any consistent naming scheme - if(_OPENTELEMETRY_LIB STREQUAL "http_client_curl") - set(_OPENTELEMETRY_STATIC_LIBRARY - "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_${_OPENTELEMETRY_LIB}${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - elseif(_OPENTELEMETRY_LIB STREQUAL "ostream_span_exporter") - set(_OPENTELEMETRY_STATIC_LIBRARY - "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_exporter_ostream_span${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - elseif(_OPENTELEMETRY_LIB STREQUAL "otlp_http_client") - set(_OPENTELEMETRY_STATIC_LIBRARY - "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_exporter_otlp_http_client${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - elseif(_OPENTELEMETRY_LIB STREQUAL "otlp_http_exporter") - set(_OPENTELEMETRY_STATIC_LIBRARY - "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_exporter_otlp_http${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - elseif(_OPENTELEMETRY_LIB STREQUAL "otlp_http_log_record_exporter") - set(_OPENTELEMETRY_STATIC_LIBRARY - "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_exporter_otlp_http_log${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - elseif(_OPENTELEMETRY_LIB STREQUAL "ostream_log_record_exporter") - set(_OPENTELEMETRY_STATIC_LIBRARY - "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_exporter_ostream_logs${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - else() - set(_OPENTELEMETRY_STATIC_LIBRARY - "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_${_OPENTELEMETRY_LIB}${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - endif() - add_library(opentelemetry-cpp::${_OPENTELEMETRY_LIB} STATIC IMPORTED) - set_target_properties(opentelemetry-cpp::${_OPENTELEMETRY_LIB} - PROPERTIES IMPORTED_LOCATION ${_OPENTELEMETRY_STATIC_LIBRARY}) - list(APPEND OPENTELEMETRY_BUILD_BYPRODUCTS ${_OPENTELEMETRY_STATIC_LIBRARY}) - list(APPEND OPENTELEMETRY_LIBRARIES opentelemetry-cpp::${_OPENTELEMETRY_LIB}) - endforeach() - set(OPENTELEMETRY_CMAKE_ARGS - ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${OPENTELEMETRY_PREFIX}" - -DWITH_EXAMPLES=OFF) + # Use FetchContent_Populate instead of MakeAvailable because opentelemetry-proto + # has no CMakeLists.txt. 
+ cmake_policy(PUSH) + if(POLICY CMP0169) + cmake_policy(SET CMP0169 OLD) + endif() + fetchcontent_populate(opentelemetry_proto) + cmake_policy(POP) - set(OPENTELEMETRY_PREFIX_PATH_LIST) - # Don't specify the DEPENDS unless we actually have dependencies, else - # Ninja/other build systems may consider this target to always be dirty - set(_OPENTELEMETRY_DEPENDENCIES) - add_custom_target(opentelemetry_dependencies) + fetchcontent_declare(opentelemetry_cpp + ${FC_DECLARE_COMMON_OPTIONS} + URL ${OPENTELEMETRY_SOURCE_URL} + URL_HASH "SHA256=${ARROW_OPENTELEMETRY_BUILD_SHA256_CHECKSUM}") - set(_OPENTELEMETRY_DEPENDENCIES "opentelemetry_dependencies") - list(APPEND ARROW_BUNDLED_STATIC_LIBS ${OPENTELEMETRY_LIBRARIES}) - list(APPEND OPENTELEMETRY_PREFIX_PATH_LIST ${NLOHMANN_JSON_PREFIX}) + prepare_fetchcontent() - get_target_property(OPENTELEMETRY_PROTOBUF_INCLUDE_DIR ${ARROW_PROTOBUF_LIBPROTOBUF} - INTERFACE_INCLUDE_DIRECTORIES) - set(OPENTELEMETRY_PROTOBUF_LIBRARY "$<TARGET_FILE:${ARROW_PROTOBUF_LIBPROTOBUF}>") - set(OPENTELEMETRY_PROTOC_EXECUTABLE "$<TARGET_FILE:${ARROW_PROTOBUF_PROTOC}>") - list(APPEND - OPENTELEMETRY_CMAKE_ARGS - -DWITH_OTLP_HTTP=ON - -DWITH_OTLP_GRPC=OFF - # Disabled because it seemed to cause linking errors. May be worth a closer look. - -DWITH_FUNC_TESTS=OFF - # These options are slated for removal in v1.14 and their features are deemed stable - # as of v1.13. However, setting their corresponding ENABLE_* macros in headers seems - # finicky - resulting in build failures or ABI-related runtime errors during HTTP - # client initialization. There may still be a solution, but we disable them for now. - -DWITH_OTLP_HTTP_SSL_PREVIEW=OFF - -DWITH_OTLP_HTTP_SSL_TLS_PREVIEW=OFF - "-DProtobuf_INCLUDE_DIR=${OPENTELEMETRY_PROTOBUF_INCLUDE_DIR}" - "-DProtobuf_LIBRARY=${OPENTELEMETRY_PROTOBUF_LIBRARY}" - "-DProtobuf_PROTOC_EXECUTABLE=${OPENTELEMETRY_PROTOC_EXECUTABLE}") - - # OpenTelemetry with OTLP enabled requires Protobuf definitions from a - # submodule. This submodule path is hardcoded into their CMake definitions, - # and submodules are not included in their releases. Add a custom build step - # to download and extract the Protobufs. - - # Adding such a step is rather complicated, so instead: create a separate - # ExternalProject that just fetches the Protobufs, then add a custom step - # to the main build to copy the Protobufs. - externalproject_add(opentelemetry_proto_ep - ${EP_COMMON_OPTIONS} - URL_HASH "SHA256=${ARROW_OPENTELEMETRY_PROTO_BUILD_SHA256_CHECKSUM}" - URL ${OPENTELEMETRY_PROTO_SOURCE_URL} - BUILD_COMMAND "" - CONFIGURE_COMMAND "" - INSTALL_COMMAND "" - EXCLUDE_FROM_ALL OFF) - if(NLOHMANN_JSON_VENDORED) - add_dependencies(opentelemetry_dependencies nlohmann_json_fc) - else() - add_dependencies(opentelemetry_dependencies nlohmann_json::nlohmann_json) - endif() + set(OTELCPP_PROTO_PATH "${opentelemetry_proto_SOURCE_DIR}") + set(WITH_EXAMPLES OFF) + set(WITH_OTLP_HTTP ON) + set(WITH_OTLP_GRPC OFF) + set(WITH_FUNC_TESTS OFF) + # These options are slated for removal in v1.14 and their features are deemed stable + # as of v1.13. However, setting their corresponding ENABLE_* macros in headers seems + # finicky - resulting in build failures or ABI-related runtime errors during HTTP + # client initialization. There may still be a solution, but we disable them for now. 
+ set(WITH_OTLP_HTTP_SSL_PREVIEW OFF) + set(WITH_OTLP_HTTP_SSL_TLS_PREVIEW OFF) - add_dependencies(opentelemetry_dependencies opentelemetry_proto_ep - ${ARROW_PROTOBUF_LIBPROTOBUF}) + fetchcontent_makeavailable(opentelemetry_cpp) - # Ensure vendored protobuf is installed before OpenTelemetry builds - if(PROTOBUF_VENDORED) - add_dependencies(opentelemetry_dependencies protobuf_fc) + if(CMAKE_VERSION VERSION_LESS 3.28) + set_property(DIRECTORY ${opentelemetry_cpp_SOURCE_DIR} PROPERTY EXCLUDE_FROM_ALL TRUE) endif() - string(JOIN "${EP_LIST_SEPARATOR}" OPENTELEMETRY_PREFIX_PATH - ${OPENTELEMETRY_PREFIX_PATH_LIST}) - list(APPEND OPENTELEMETRY_CMAKE_ARGS "-DCMAKE_PREFIX_PATH=${OPENTELEMETRY_PREFIX_PATH}") + # Remove unused directories to save build directory storage + file(REMOVE_RECURSE "${opentelemetry_cpp_SOURCE_DIR}/ci") - if(CMAKE_SYSTEM_PROCESSOR STREQUAL "s390x") - # OpenTelemetry tries to determine the processor arch for vcpkg, which fails - # on s390x, even though it doesn't use vcpkg there. Tell it ARCH manually - externalproject_add(opentelemetry_ep - ${EP_COMMON_OPTIONS} - URL_HASH "SHA256=${ARROW_OPENTELEMETRY_BUILD_SHA256_CHECKSUM}" - CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ARCH=s390x - ${CMAKE_COMMAND} -G ${CMAKE_GENERATOR} - "<SOURCE_DIR>" - ${OPENTELEMETRY_CMAKE_ARGS} - BUILD_COMMAND ${CMAKE_COMMAND} --build "<BINARY_DIR>" --target all - INSTALL_COMMAND ${CMAKE_COMMAND} --build "<BINARY_DIR>" --target - install - URL ${OPENTELEMETRY_SOURCE_URL} - BUILD_BYPRODUCTS ${OPENTELEMETRY_BUILD_BYPRODUCTS} - EXCLUDE_FROM_ALL NOT - ${ARROW_WITH_OPENTELEMETRY} - DEPENDS ${_OPENTELEMETRY_DEPENDENCIES}) - else() - externalproject_add(opentelemetry_ep - ${EP_COMMON_OPTIONS} - URL_HASH "SHA256=${ARROW_OPENTELEMETRY_BUILD_SHA256_CHECKSUM}" - CMAKE_ARGS ${OPENTELEMETRY_CMAKE_ARGS} - URL ${OPENTELEMETRY_SOURCE_URL} - BUILD_BYPRODUCTS ${OPENTELEMETRY_BUILD_BYPRODUCTS} - EXCLUDE_FROM_ALL NOT - ${ARROW_WITH_OPENTELEMETRY} - DEPENDS ${_OPENTELEMETRY_DEPENDENCIES}) - endif() - - externalproject_add_step(opentelemetry_ep download_proto - COMMAND ${CMAKE_COMMAND} -E copy_directory - $<TARGET_PROPERTY:opentelemetry_proto_ep,_EP_SOURCE_DIR>/opentelemetry - $<TARGET_PROPERTY:opentelemetry_ep,_EP_SOURCE_DIR>/third_party/opentelemetry-proto/opentelemetry - DEPENDEES download - DEPENDERS configure) - - set(OPENTELEMETRY_VENDORED 1) - - target_link_libraries(opentelemetry-cpp::common - INTERFACE opentelemetry-cpp::api opentelemetry-cpp::sdk - Threads::Threads) - target_link_libraries(opentelemetry-cpp::resources INTERFACE opentelemetry-cpp::common) - target_link_libraries(opentelemetry-cpp::trace INTERFACE opentelemetry-cpp::common - opentelemetry-cpp::resources) - target_link_libraries(opentelemetry-cpp::logs INTERFACE opentelemetry-cpp::common - opentelemetry-cpp::resources) - target_link_libraries(opentelemetry-cpp::http_client_curl - INTERFACE opentelemetry-cpp::common opentelemetry-cpp::ext - CURL::libcurl) - target_link_libraries(opentelemetry-cpp::proto INTERFACE ${ARROW_PROTOBUF_LIBPROTOBUF}) - target_link_libraries(opentelemetry-cpp::otlp_recordable - INTERFACE opentelemetry-cpp::logs opentelemetry-cpp::trace - opentelemetry-cpp::resources opentelemetry-cpp::proto) - target_link_libraries(opentelemetry-cpp::otlp_http_client - INTERFACE opentelemetry-cpp::common opentelemetry-cpp::proto - opentelemetry-cpp::http_client_curl - nlohmann_json::nlohmann_json) - target_link_libraries(opentelemetry-cpp::otlp_http_exporter - INTERFACE opentelemetry-cpp::otlp_recordable - opentelemetry-cpp::otlp_http_client) - target_link_libraries(opentelemetry-cpp::otlp_http_log_record_exporter - INTERFACE opentelemetry-cpp::otlp_recordable - 
opentelemetry-cpp::otlp_http_client) - - foreach(_OPENTELEMETRY_LIB ${_OPENTELEMETRY_LIBS}) - add_dependencies(opentelemetry-cpp::${_OPENTELEMETRY_LIB} opentelemetry_ep) - list(APPEND ARROW_BUNDLED_STATIC_LIBS opentelemetry-cpp::${_OPENTELEMETRY_LIB}) - endforeach() + # OpenTelemetry creates its own targets. We need to add them to bundled static libs. + # The targets created by OpenTelemetry's CMakeLists.txt use the opentelemetry:: namespace. + # List of libraries that we actually need and want to bundle. + set(_OPENTELEMETRY_BUNDLED_LIBS + opentelemetry-cpp::common + opentelemetry-cpp::http_client_curl + opentelemetry-cpp::logs + opentelemetry-cpp::ostream_log_record_exporter + opentelemetry-cpp::ostream_span_exporter + opentelemetry-cpp::otlp_http_client + opentelemetry-cpp::otlp_http_log_record_exporter + opentelemetry-cpp::otlp_http_exporter + opentelemetry-cpp::otlp_recordable + opentelemetry-cpp::proto + opentelemetry-cpp::resources + opentelemetry-cpp::trace + opentelemetry-cpp::version) - # Work around https://gitlab.kitware.com/cmake/cmake/issues/15052 - file(MAKE_DIRECTORY ${OPENTELEMETRY_INCLUDE_DIR}) -endmacro() + list(APPEND ARROW_BUNDLED_STATIC_LIBS ${_OPENTELEMETRY_BUNDLED_LIBS}) + set(ARROW_BUNDLED_STATIC_LIBS + "${ARROW_BUNDLED_STATIC_LIBS}" + PARENT_SCOPE) + + list(POP_BACK CMAKE_MESSAGE_INDENT) +endfunction() if(ARROW_WITH_OPENTELEMETRY) if(NOT ARROW_ENABLE_THREADING) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index a46db60321a..df9b783d531 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -1180,7 +1180,7 @@ endif() foreach(LIB_TARGET ${ARROW_LIBRARIES}) target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_EXPORTING) - # C++17 is required to compile against Arrow C++ headers and libraries + # C++20 is required to compile against Arrow C++ headers and libraries target_compile_features(${LIB_TARGET} PUBLIC cxx_std_20) endforeach() diff --git a/cpp/src/arrow/acero/tpch_node_test.cc b/cpp/src/arrow/acero/tpch_node_test.cc index f484d6c9d52..6321a8ca025 100644 --- a/cpp/src/arrow/acero/tpch_node_test.cc +++ b/cpp/src/arrow/acero/tpch_node_test.cc @@ -38,8 +38,6 @@ namespace arrow { -using arrow::internal::StartsWith; - namespace acero { namespace internal { @@ -100,7 +98,7 @@ void VerifyUniqueKey(std::unordered_set* seen, const Datum& d, int32_t void VerifyStringAndNumber_Single(std::string_view row, std::string_view prefix, const int64_t i, const int32_t* nums, bool verify_padding) { - ASSERT_TRUE(StartsWith(row, prefix)) << row << ", prefix=" << prefix << ", i=" << i; + ASSERT_TRUE(row.starts_with(prefix)) << row << ", prefix=" << prefix << ", i=" << i; const char* num_str = row.data() + prefix.size(); const char* num_str_end = row.data() + row.size(); int64_t num = 0; diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index 1be398fdae9..411ff0bb026 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -410,7 +410,7 @@ bool ExecSpanIterator::Next(ExecSpan* span) { // The first time this is called, we populate the output span with any // Scalar or Array arguments in the ExecValue struct, and then just // increment array offsets below. 
If any arguments are ChunkedArray, then - // the internal ArraySpans will see their members updated during hte + // the internal ArraySpans will see their members updated during the // iteration span->values.resize(args_->size()); for (size_t i = 0; i < args_->size(); ++i) { diff --git a/cpp/src/arrow/compute/expression.cc b/cpp/src/arrow/compute/expression.cc index 2563674a59c..93427f75fee 100644 --- a/cpp/src/arrow/compute/expression.cc +++ b/cpp/src/arrow/compute/expression.cc @@ -47,7 +47,6 @@ namespace arrow { using internal::checked_cast; using internal::checked_pointer_cast; -using internal::EndsWith; using internal::ToChars; namespace compute { @@ -180,7 +179,7 @@ std::string Expression::ToString() const { } constexpr std::string_view kleene = "_kleene"; - if (EndsWith(call->function_name, kleene)) { + if (call->function_name.ends_with(kleene)) { auto op = call->function_name.substr(0, call->function_name.size() - kleene.size()); return binary(std::move(op)); } diff --git a/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc b/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc index 06e6f4bb506..0146c0ab8f0 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc @@ -38,9 +38,6 @@ namespace arrow { -using internal::EndsWith; -using internal::StartsWith; - namespace compute { namespace internal { @@ -1291,7 +1288,7 @@ struct PlainStartsWithMatcher { } bool Match(std::string_view current) const { - return StartsWith(current, options_.pattern); + return current.starts_with(options_.pattern); } }; @@ -1309,7 +1306,7 @@ struct PlainEndsWithMatcher { } bool Match(std::string_view current) const { - return EndsWith(current, options_.pattern); + return current.ends_with(options_.pattern); } }; diff --git a/cpp/src/arrow/dataset/discovery.cc b/cpp/src/arrow/dataset/discovery.cc index 5686e50e3cb..c6f6e41d1ed 100644 --- a/cpp/src/arrow/dataset/discovery.cc +++ b/cpp/src/arrow/dataset/discovery.cc @@ -35,8 +35,6 @@ namespace arrow { -using internal::StartsWith; - namespace dataset { namespace { @@ -49,7 +47,7 @@ bool StartsWithAnyOf(const std::string& path, const std::vector& pr auto parts = fs::internal::SplitAbstractPath(path); return std::any_of(parts.cbegin(), parts.cend(), [&](std::string_view part) { return std::any_of(prefixes.cbegin(), prefixes.cend(), - [&](std::string_view prefix) { return StartsWith(part, prefix); }); + [&](std::string_view prefix) { return part.starts_with(prefix); }); }); } diff --git a/cpp/src/arrow/dataset/subtree_test.cc b/cpp/src/arrow/dataset/subtree_test.cc index fc13c20ecee..51c7d47acbc 100644 --- a/cpp/src/arrow/dataset/subtree_test.cc +++ b/cpp/src/arrow/dataset/subtree_test.cc @@ -31,8 +31,6 @@ namespace arrow { -using internal::StartsWith; - using compute::field_ref; using compute::literal; @@ -112,7 +110,7 @@ bool IsAncestorOf(std::string_view ancestor, std::string_view descendant) { ancestor = RemoveTrailingSlash(ancestor); if (ancestor == "") return true; descendant = RemoveTrailingSlash(descendant); - if (!StartsWith(descendant, ancestor)) return false; + if (!descendant.starts_with(ancestor)) return false; descendant.remove_prefix(ancestor.size()); if (descendant.empty()) return true; return descendant.front() == '/'; diff --git a/cpp/src/arrow/engine/simple_extension_type_internal.h b/cpp/src/arrow/engine/simple_extension_type_internal.h index 1867fe50457..5124b6d294c 100644 --- a/cpp/src/arrow/engine/simple_extension_type_internal.h +++ 
b/cpp/src/arrow/engine/simple_extension_type_internal.h @@ -111,7 +111,7 @@ class SimpleExtensionType : public ExtensionType { void Fail() { params_ = std::nullopt; } void Init(std::string_view class_name, std::string_view repr, size_t num_properties) { - if (!::arrow::internal::StartsWith(repr, class_name)) return Fail(); + if (!repr.starts_with(class_name)) return Fail(); repr = repr.substr(class_name.size()); if (repr.empty()) return Fail(); diff --git a/cpp/src/arrow/engine/substrait/relation_internal.cc b/cpp/src/arrow/engine/substrait/relation_internal.cc index 1ea143f9c58..b9e663ed7b1 100644 --- a/cpp/src/arrow/engine/substrait/relation_internal.cc +++ b/cpp/src/arrow/engine/substrait/relation_internal.cc @@ -65,7 +65,6 @@ namespace arrow { using internal::checked_cast; -using internal::StartsWith; using internal::ToChars; using util::UriFromAbsolutePath; diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 0ca18eed518..a3a162616ec 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -70,10 +70,10 @@ void AzureOptions::ExtractFromUriSchemeAndHierPart(const Uri& uri, std::string* out_path) { const auto host = uri.host(); std::string path; - if (arrow::internal::EndsWith(host, blob_storage_authority)) { + if (host.ends_with(blob_storage_authority)) { account_name = host.substr(0, host.size() - blob_storage_authority.size()); path = internal::RemoveLeadingSlash(uri.path()); - } else if (arrow::internal::EndsWith(host, dfs_storage_authority)) { + } else if (host.ends_with(dfs_storage_authority)) { account_name = host.substr(0, host.size() - dfs_storage_authority.size()); path = internal::ConcatAbstractPath(uri.username(), uri.path()); } else { diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 43d1c2afb77..c3af6fb0797 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -2856,8 +2856,7 @@ std::shared_ptr NormalizerKeyValueMetadata( value = "2023-10-31T08:15:20Z"; } } else if (key == "ETag") { - if (arrow::internal::StartsWith(value, "\"") && - arrow::internal::EndsWith(value, "\"")) { + if (value.starts_with("\"") && value.ends_with("\"")) { // Valid value value = "\"ETagValue\""; } diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index a48a34135a9..dc82afd07e7 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -29,8 +29,6 @@ namespace arrow { -using internal::StartsWith; - namespace fs { namespace internal { @@ -236,7 +234,7 @@ bool IsAncestorOf(std::string_view ancestor, std::string_view descendant) { } descendant = RemoveTrailingSlash(descendant); - if (!StartsWith(descendant, ancestor)) { + if (!descendant.starts_with(ancestor)) { // an ancestor path is a prefix of descendant paths return false; } @@ -249,7 +247,7 @@ bool IsAncestorOf(std::string_view ancestor, std::string_view descendant) { } // "/hello/w" is not an ancestor of "/hello/world" - return StartsWith(descendant, std::string{kSep}); + return descendant.starts_with(std::string{kSep}); } std::optional RemoveAncestor(std::string_view ancestor, diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index c6b821f5deb..f75fd970a1e 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -1405,9 +1405,10 @@ bool IsDirectory(std::string_view key, const S3Model::HeadObjectResult& result) } // Otherwise, if 
its content type starts with "application/x-directory", // it's a directory - if (::arrow::internal::StartsWith(result.GetContentType(), kAwsDirectoryContentType)) { + if (result.GetContentType().starts_with(kAwsDirectoryContentType)) { return true; } + // Otherwise, it's a regular file. return false; } diff --git a/cpp/src/arrow/flight/transport/grpc/grpc_client.cc b/cpp/src/arrow/flight/transport/grpc/grpc_client.cc index 6cf3242b070..943408c1ffa 100644 --- a/cpp/src/arrow/flight/transport/grpc/grpc_client.cc +++ b/cpp/src/arrow/flight/transport/grpc/grpc_client.cc @@ -61,8 +61,6 @@ namespace arrow { -using internal::EndsWith; - namespace flight { namespace transport { namespace grpc { @@ -175,25 +173,25 @@ class GrpcClientInterceptorAdapterFactory FlightMethod flight_method = FlightMethod::Invalid; std::string_view method(info->method()); - if (EndsWith(method, "/Handshake")) { + if (method.ends_with("/Handshake")) { flight_method = FlightMethod::Handshake; - } else if (EndsWith(method, "/ListFlights")) { + } else if (method.ends_with("/ListFlights")) { flight_method = FlightMethod::ListFlights; - } else if (EndsWith(method, "/GetFlightInfo")) { + } else if (method.ends_with("/GetFlightInfo")) { flight_method = FlightMethod::GetFlightInfo; - } else if (EndsWith(method, "/PollFlightInfo")) { + } else if (method.ends_with("/PollFlightInfo")) { flight_method = FlightMethod::PollFlightInfo; - } else if (EndsWith(method, "/GetSchema")) { + } else if (method.ends_with("/GetSchema")) { flight_method = FlightMethod::GetSchema; - } else if (EndsWith(method, "/DoGet")) { + } else if (method.ends_with("/DoGet")) { flight_method = FlightMethod::DoGet; - } else if (EndsWith(method, "/DoPut")) { + } else if (method.ends_with("/DoPut")) { flight_method = FlightMethod::DoPut; - } else if (EndsWith(method, "/DoExchange")) { + } else if (method.ends_with("/DoExchange")) { flight_method = FlightMethod::DoExchange; - } else if (EndsWith(method, "/DoAction")) { + } else if (method.ends_with("/DoAction")) { flight_method = FlightMethod::DoAction; - } else if (EndsWith(method, "/ListActions")) { + } else if (method.ends_with("/ListActions")) { flight_method = FlightMethod::ListActions; } else { ARROW_LOG(WARNING) << "Unknown Flight method: " << info->method(); diff --git a/cpp/src/arrow/flight/types.cc b/cpp/src/arrow/flight/types.cc index 759b1410bda..8166513d4e3 100644 --- a/cpp/src/arrow/flight/types.cc +++ b/cpp/src/arrow/flight/types.cc @@ -1167,7 +1167,7 @@ std::string TransportStatusDetail::ToString() const { repr += "{\""; repr += key; repr += "\", "; - if (arrow::internal::EndsWith(key, "-bin")) { + if (key.ends_with("-bin")) { repr += arrow::util::base64_encode(value); } else { repr += "\""; diff --git a/cpp/src/arrow/json/chunker_test.cc b/cpp/src/arrow/json/chunker_test.cc index 1c26d52b140..0976e9ba22b 100644 --- a/cpp/src/arrow/json/chunker_test.cc +++ b/cpp/src/arrow/json/chunker_test.cc @@ -34,8 +34,6 @@ namespace arrow { -using internal::StartsWith; - namespace json { // Use no nested objects and no string literals containing braces in this test. 
@@ -159,10 +157,10 @@ void AssertStraddledChunking(Chunker& chunker, const std::shared_ptr<Buffer>& bu AssertChunking(chunker, first_half, 1); std::shared_ptr<Buffer> first_whole, partial; ASSERT_OK(chunker.Process(first_half, &first_whole, &partial)); - ASSERT_TRUE(StartsWith(std::string_view(*first_half), std::string_view(*first_whole))); + ASSERT_TRUE(std::string_view(*first_half).starts_with(std::string_view(*first_whole))); std::shared_ptr<Buffer> completion, rest; ASSERT_OK(chunker.ProcessWithPartial(partial, second_half, &completion, &rest)); - ASSERT_TRUE(StartsWith(std::string_view(*second_half), std::string_view(*completion))); + ASSERT_TRUE(std::string_view(*second_half).starts_with(std::string_view(*completion))); std::shared_ptr<Buffer> straddling; ASSERT_OK_AND_ASSIGN(straddling, ConcatenateBuffers({partial, completion})); auto length = ConsumeWholeObject(&straddling); diff --git a/cpp/src/arrow/meson.build b/cpp/src/arrow/meson.build index 43050aa1597..48d01db729d 100644 --- a/cpp/src/arrow/meson.build +++ b/cpp/src/arrow/meson.build @@ -192,6 +192,7 @@ arrow_util_srcs = [ 'util/float16.cc', 'util/formatting.cc', 'util/future.cc', + 'util/fuzz_internal.cc', 'util/hashing.cc', 'util/int_util.cc', 'util/io_util.cc', diff --git a/cpp/src/arrow/telemetry/logging.h b/cpp/src/arrow/telemetry/logging.h index 04b39e6c198..ecccc26efda 100644 --- a/cpp/src/arrow/telemetry/logging.h +++ b/cpp/src/arrow/telemetry/logging.h @@ -60,7 +60,7 @@ class ARROW_EXPORT OtelLogger : public util::Logger { class ARROW_EXPORT OtelLoggerProvider { public: /// \brief Attempt to flush the log record processor associated with the provider - /// \return `true` if the flush occured + /// \return `true` if the flush occurred static bool Flush(std::chrono::microseconds timeout = std::chrono::microseconds::max()); static Result> MakeLogger( diff --git a/cpp/src/arrow/util/reflection_test.cc b/cpp/src/arrow/util/reflection_test.cc index d2d6379bece..2246c8fe7f3 100644 --- a/cpp/src/arrow/util/reflection_test.cc +++ b/cpp/src/arrow/util/reflection_test.cc @@ -83,7 +83,7 @@ struct FromStringImpl { void Fail() { obj_ = std::nullopt; } void Init(std::string_view class_name, std::string_view repr, size_t num_properties) { - if (!StartsWith(repr, class_name)) return Fail(); + if (!repr.starts_with(class_name)) return Fail(); repr = repr.substr(class_name.size()); if (repr.empty()) return Fail(); diff --git a/cpp/src/arrow/util/string.h b/cpp/src/arrow/util/string.h index d39b7a295e7..af8c948f48a 100644 --- a/cpp/src/arrow/util/string.h +++ b/cpp/src/arrow/util/string.h @@ -52,16 +52,6 @@ ARROW_EXPORT Status ParseHexValues(std::string_view hex_string, uint8_t* out); namespace internal { -/// Like std::string_view::starts_with in C++20 -inline bool StartsWith(std::string_view s, std::string_view prefix) { - return s.starts_with(prefix); -} - -/// Like std::string_view::ends_with in C++20 -inline bool EndsWith(std::string_view s, std::string_view suffix) { - return s.ends_with(suffix); -} - /// \brief Split a string with a delimiter ARROW_EXPORT std::vector<std::string_view> SplitString(std::string_view v, char delim, diff --git a/cpp/src/arrow/util/string_test.cc b/cpp/src/arrow/util/string_test.cc index f222b938d5a..8988eb9996c 100644 --- a/cpp/src/arrow/util/string_test.cc +++ b/cpp/src/arrow/util/string_test.cc @@ -170,34 +170,6 @@ TEST(SplitString, LimitZero) { EXPECT_EQ(parts[2], "c"); } -TEST(StartsWith, Basics) { - std::string empty{}; - std::string abc{"abc"}; - std::string abcdef{"abcdef"}; - std::string def{"def"}; - ASSERT_TRUE(StartsWith(empty, empty)); -
ASSERT_TRUE(StartsWith(abc, empty)); - ASSERT_TRUE(StartsWith(abc, abc)); - ASSERT_TRUE(StartsWith(abcdef, abc)); - ASSERT_FALSE(StartsWith(abc, abcdef)); - ASSERT_FALSE(StartsWith(def, abcdef)); - ASSERT_FALSE(StartsWith(abcdef, def)); -} - -TEST(EndsWith, Basics) { - std::string empty{}; - std::string abc{"abc"}; - std::string abcdef{"abcdef"}; - std::string def{"def"}; - ASSERT_TRUE(EndsWith(empty, empty)); - ASSERT_TRUE(EndsWith(abc, empty)); - ASSERT_TRUE(EndsWith(abc, abc)); - ASSERT_TRUE(EndsWith(abcdef, def)); - ASSERT_FALSE(EndsWith(abcdef, abc)); - ASSERT_FALSE(EndsWith(def, abcdef)); - ASSERT_FALSE(EndsWith(abcdef, abc)); -} - TEST(RegexMatch, Basics) { std::regex regex("a+(b*)(c+)d+"); std::string_view b, c; @@ -260,9 +232,9 @@ TEST(ToChars, FloatingPoint) { // to std::to_string which may make ad hoc formatting choices, so we cannot // really test much about the result. auto result = ToChars(0.0f); - ASSERT_TRUE(StartsWith(result, "0")) << result; + ASSERT_TRUE(result.starts_with("0")) << result; result = ToChars(0.25); - ASSERT_TRUE(StartsWith(result, "0.25")) << result; + ASSERT_TRUE(result.starts_with("0.25")) << result; } } diff --git a/cpp/src/parquet/arrow/fuzz_internal.cc b/cpp/src/parquet/arrow/fuzz_internal.cc index b2e295b435d..0c9d088ee8a 100644 --- a/cpp/src/parquet/arrow/fuzz_internal.cc +++ b/cpp/src/parquet/arrow/fuzz_internal.cc @@ -80,7 +80,7 @@ class FuzzDecryptionKeyRetriever : public DecryptionKeyRetriever { return it->second; } // Is it a key generated by MakeEncryptionKey? - if (::arrow::internal::StartsWith(key_id, kInlineKeyPrefix)) { + if (key_id.starts_with(kInlineKeyPrefix)) { return SecureString( ::arrow::util::base64_decode(key_id.substr(kInlineKeyPrefix.length()))); } diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 293ae94b94d..266215a8104 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -49,7 +49,6 @@ using arrow::FieldVector; using arrow::KeyValueMetadata; using arrow::Status; using arrow::internal::checked_cast; -using arrow::internal::EndsWith; using arrow::internal::ToChars; using ArrowType = arrow::DataType; diff --git a/cpp/src/parquet/geospatial/util_json_internal.cc b/cpp/src/parquet/geospatial/util_json_internal.cc index 0ca88f4c6c2..6278ab8873c 100644 --- a/cpp/src/parquet/geospatial/util_json_internal.cc +++ b/cpp/src/parquet/geospatial/util_json_internal.cc @@ -104,10 +104,10 @@ ::arrow::Result<std::string> MakeGeoArrowCrsMetadata( // the format and pass on this information to GeoArrow.
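// (Annotation, not part of the patch.) For context on the branches below:
// assuming kSridPrefix and kProjjsonPrefix are the literal prefixes their
// names suggest ("srid:" and "projjson:"), MakeGeoArrowCrsMetadata maps,
// roughly:
//
//   ""                  -> "crs": "OGC:CRS84", "crs_type": "authority_code"
//   "srid:5070"         -> "crs": "5070", "crs_type": "srid"
//   "projjson:some_key" -> the PROJJSON document stored under the Arrow field
//                          metadata key "some_key" ("some_key" is a
//                          hypothetical key name)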
if (crs.empty()) { return R"("crs": "OGC:CRS84", "crs_type": "authority_code")"; - } else if (::arrow::internal::StartsWith(crs, kSridPrefix)) { + } else if (crs.starts_with(kSridPrefix)) { return R"("crs": ")" + std::string(crs.substr(kSridPrefix.size())) + R"(", "crs_type": "srid")"; - } else if (::arrow::internal::StartsWith(crs, kProjjsonPrefix)) { + } else if (crs.starts_with(kProjjsonPrefix)) { std::string_view metadata_field = crs.substr(kProjjsonPrefix.size()); if (metadata && metadata->Contains(metadata_field)) { ARROW_ASSIGN_OR_RAISE(std::string projjson_value, metadata->Get(metadata_field)); diff --git a/cpp/src/parquet/meson.build b/cpp/src/parquet/meson.build index e6ff43a0bae..a75a1b94d08 100644 --- a/cpp/src/parquet/meson.build +++ b/cpp/src/parquet/meson.build @@ -17,6 +17,7 @@ parquet_srcs = files( '../generated/parquet_types.cpp', + 'arrow/fuzz_internal.cc', 'arrow/path_internal.cc', 'arrow/reader.cc', 'arrow/reader_internal.cc', diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index dc0a2f90311..e4092af70cd 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -315,7 +315,7 @@ the input to a single output value. Grouped Aggregations ("group by") ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Grouped aggregations are not directly invokable, but are used as part of a +Grouped aggregations are not directly invocable, but are used as part of a SQL-style "group by" operation. Like scalar aggregations, grouped aggregations reduce multiple input values to a single output value. Instead of aggregating all values of the input, however, grouped aggregations partition the input diff --git a/docs/source/developers/cpp/acero.rst b/docs/source/developers/cpp/acero.rst index 0492007904f..d024a2d03d6 100644 --- a/docs/source/developers/cpp/acero.rst +++ b/docs/source/developers/cpp/acero.rst @@ -328,7 +328,7 @@ this leads to problems with cache locality. For example, let's assume we have a exec nodes, scan, project, and then filter (this is a very common use case). Now let's assume there are 100 batches. In a task-per-operator model we would have tasks like "Scan Batch 5", "Project Batch 5", and "Filter Batch 5". Each of those tasks is potentially going to access the same data. For example, maybe the ``project`` and ``filter`` nodes need -to read the same column. A column which is intially created in a decode phase of the ``scan`` node. To maximize cache +to read the same column. A column which is initially created in a decode phase of the ``scan`` node. To maximize cache utilization we would need to carefully schedule our tasks to ensure that all three of those tasks are run consecutively and assigned to the same CPU core. diff --git a/matlab/README.md b/matlab/README.md index 30684625e9c..264d7f6ece5 100644 --- a/matlab/README.md +++ b/matlab/README.md @@ -64,7 +64,7 @@ To build the MATLAB Interface to Apache Arrow from source, the following softwar 1. [MATLAB](https://www.mathworks.com/products/get-matlab.html) 2. [CMake](https://cmake.org/cmake/help/latest/) -3. C++ compiler which supports C++17 (e.g. [`gcc`](https://gcc.gnu.org/) on Linux, [`Xcode`](https://developer.apple.com/xcode/) on macOS, or [`Visual Studio`](https://visualstudio.microsoft.com/) on Windows) +3. C++ compiler which supports C++20 (e.g. [`gcc`](https://gcc.gnu.org/) on Linux, [`Xcode`](https://developer.apple.com/xcode/) on macOS, or [`Visual Studio`](https://visualstudio.microsoft.com/) on Windows) 4. 
[Git](https://git-scm.com/) ## Setup diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake index cc021b3400b..8c99d04af5b 100644 --- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake +++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake @@ -130,7 +130,7 @@ libmexclass_client_add_proxy_library( INCLUDE_DIRS ${MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_LIBRARY_INCLUDE_DIRS} LINK_LIBRARIES arrow_shared ) -# Use C++17 +# Use C++20 target_compile_features(${MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_LIBRARY_NAME} PRIVATE cxx_std_20) target_compile_definitions(${MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_LIBRARY_NAME} PRIVATE ARROW_MATLAB_EXPORTING) diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index ed4f394362a..d392d099719 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -1127,8 +1127,9 @@ class TypedPandasWriter : public PandasWriter { Status CheckTypeExact(const DataType& type, Type::type expected) { if (type.id() != expected) { - // TODO(wesm): stringify NumPy / pandas type - return Status::NotImplemented("Cannot write Arrow data of type ", type.ToString()); + return Status::NotImplemented("Cannot write Arrow data of type ", type.ToString(), + " to pandas block with NumPy type ", + GetNumPyTypeName(NPY_TYPE)); } return Status::OK(); } diff --git a/python/pyarrow/src/arrow/python/python_test.cc b/python/pyarrow/src/arrow/python/python_test.cc index f2f9c4791db..85cc6c9a969 100644 --- a/python/pyarrow/src/arrow/python/python_test.cc +++ b/python/pyarrow/src/arrow/python/python_test.cc @@ -32,6 +32,7 @@ #include "arrow/python/decimal.h" #include "arrow/python/helpers.h" #include "arrow/python/numpy_convert.h" +#include "arrow/python/numpy_internal.h" #include "arrow/python/numpy_interop.h" #include "arrow/python/python_test.h" #include "arrow/python/python_to_arrow.h" @@ -847,6 +848,21 @@ Status TestUpdateWithNaN() { return Status::OK(); } +Status TestGetNumPyTypeName() { + ASSERT_EQ(GetNumPyTypeName(NPY_BOOL), "bool"); + ASSERT_EQ(GetNumPyTypeName(NPY_INT8), "int8"); + ASSERT_EQ(GetNumPyTypeName(NPY_INT16), "int16"); + ASSERT_EQ(GetNumPyTypeName(NPY_INT32), "int32"); + ASSERT_EQ(GetNumPyTypeName(NPY_INT64), "int64"); + ASSERT_EQ(GetNumPyTypeName(NPY_UINT8), "uint8"); + ASSERT_EQ(GetNumPyTypeName(NPY_UINT16), "uint16"); + ASSERT_EQ(GetNumPyTypeName(NPY_UINT32), "uint32"); + ASSERT_EQ(GetNumPyTypeName(NPY_UINT64), "uint64"); + ASSERT_EQ(GetNumPyTypeName(NPY_FLOAT32), "float32"); + ASSERT_EQ(GetNumPyTypeName(NPY_FLOAT64), "float64"); + return Status::OK(); +} + } // namespace std::vector<TestCase> GetCppTestCases() { @@ -886,6 +902,7 @@ std::vector<TestCase> GetCppTestCases() { {"test_mixed_precision_and_scale_sequence_convert", TestMixedPrecisionAndScaleSequenceConvert}, {"test_simple_inference", TestSimpleInference}, {"test_update_with_nan", TestUpdateWithNaN}, + {"test_get_numpy_type_name", TestGetNumPyTypeName}, }; } diff --git a/ruby/red-arrow-format/lib/arrow-format.rb b/ruby/red-arrow-format/lib/arrow-format.rb index aea210bfb18..2c8ecbf55c7 100644 --- a/ruby/red-arrow-format/lib/arrow-format.rb +++ b/ruby/red-arrow-format/lib/arrow-format.rb @@ -16,4 +16,5 @@ # under the License.
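# (Annotation, not part of the patch.) The hunk below registers the new
# streaming reader alongside the existing file reader. Intended usage,
# mirroring FileReader's Enumerable interface; the file name is illustrative
# only:
#
#   require "arrow-format"
#
#   File.open("data.arrows", "rb") do |input|
#     ArrowFormat::StreamingReader.new(input).each do |record_batch|
#       p record_batch.to_h
#     end
#   end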
require_relative "arrow-format/file-reader" +require_relative "arrow-format/streaming-reader" require_relative "arrow-format/version" diff --git a/ruby/red-arrow-format/lib/arrow-format/array.rb b/ruby/red-arrow-format/lib/arrow-format/array.rb index 4f1f6f5f92a..ac96038f194 100644 --- a/ruby/red-arrow-format/lib/arrow-format/array.rb +++ b/ruby/red-arrow-format/lib/arrow-format/array.rb @@ -189,6 +189,45 @@ def to_a end end + class IntervalArray < TemporalArray + end + + class YearMonthIntervalArray < IntervalArray + def to_a + apply_validity(@values_buffer.values(:s32, 0, @size)) + end + end + + class DayTimeIntervalArray < IntervalArray + def to_a + values = @values_buffer. + each(:s32, 0, @size * 2). + each_slice(2). + collect do |(_, day), (_, time)| + [day, time] + end + apply_validity(values) + end + end + + class MonthDayNanoIntervalArray < IntervalArray + def to_a + buffer_types = [:s32, :s32, :s64] + value_size = IO::Buffer.size_of(buffer_types) + values = @size.times.collect do |i| + offset = value_size * i + @values_buffer.get_values(buffer_types, offset) + end + apply_validity(values) + end + end + + class DurationArray < TemporalArray + def to_a + apply_validity(@values_buffer.values(:s64, 0, @size)) + end + end + class VariableSizeBinaryLayoutArray < Array def initialize(type, size, validity_buffer, offsets_buffer, values_buffer) super(type, size, validity_buffer) diff --git a/ruby/red-arrow-format/lib/arrow-format/error.rb b/ruby/red-arrow-format/lib/arrow-format/error.rb index 39b0b8af156..d73c4082beb 100644 --- a/ruby/red-arrow-format/lib/arrow-format/error.rb +++ b/ruby/red-arrow-format/lib/arrow-format/error.rb @@ -19,6 +19,9 @@ class Error < StandardError end class ReadError < Error + end + + class FileReadError < ReadError attr_reader :buffer def initialize(buffer, message) @buffer = buffer diff --git a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb index 5a769743c6d..bf50bfd1cd3 100644 --- a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb +++ b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb @@ -15,54 +15,27 @@ # specific language governing permissions and limitations # under the License. 
-require_relative "array" -require_relative "error" -require_relative "field" -require_relative "record-batch" -require_relative "schema" -require_relative "type" +require_relative "streaming-reader" -require_relative "org/apache/arrow/flatbuf/binary" -require_relative "org/apache/arrow/flatbuf/bool" -require_relative "org/apache/arrow/flatbuf/date" -require_relative "org/apache/arrow/flatbuf/date_unit" -require_relative "org/apache/arrow/flatbuf/fixed_size_binary" -require_relative "org/apache/arrow/flatbuf/floating_point" +require_relative "org/apache/arrow/flatbuf/block" require_relative "org/apache/arrow/flatbuf/footer" -require_relative "org/apache/arrow/flatbuf/int" -require_relative "org/apache/arrow/flatbuf/large_binary" -require_relative "org/apache/arrow/flatbuf/large_list" -require_relative "org/apache/arrow/flatbuf/large_utf8" -require_relative "org/apache/arrow/flatbuf/list" -require_relative "org/apache/arrow/flatbuf/map" -require_relative "org/apache/arrow/flatbuf/message" -require_relative "org/apache/arrow/flatbuf/null" -require_relative "org/apache/arrow/flatbuf/precision" -require_relative "org/apache/arrow/flatbuf/schema" -require_relative "org/apache/arrow/flatbuf/struct_" -require_relative "org/apache/arrow/flatbuf/time" -require_relative "org/apache/arrow/flatbuf/timestamp" -require_relative "org/apache/arrow/flatbuf/time_unit" -require_relative "org/apache/arrow/flatbuf/union" -require_relative "org/apache/arrow/flatbuf/union_mode" -require_relative "org/apache/arrow/flatbuf/utf8" module ArrowFormat class FileReader include Enumerable + include Readable - MAGIC = "ARROW1".b + MAGIC = "ARROW1".b.freeze MAGIC_BUFFER = IO::Buffer.for(MAGIC) START_MARKER_SIZE = MAGIC_BUFFER.size END_MARKER_SIZE = MAGIC_BUFFER.size - CONTINUATION = "\xFF\xFF\xFF\xFF".b - CONTINUATION_BUFFER = IO::Buffer.for(CONTINUATION) # # STREAMING_FORMAT_START_OFFSET = 8 - INT32_SIZE = 4 - FOOTER_SIZE_SIZE = INT32_SIZE - METADATA_SIZE_SIZE = INT32_SIZE + CONTINUATION_BUFFER = + IO::Buffer.for(MessagePullReader::CONTINUATION_STRING) + FOOTER_SIZE_FORMAT = :s32 + FOOTER_SIZE_SIZE = IO::Buffer.size_of(FOOTER_SIZE_FORMAT) def initialize(input) case input @@ -76,45 +49,75 @@ def initialize(input) validate @footer = read_footer + @record_batches = @footer.record_batches + @schema = read_schema(@footer.schema) end - def each - offset = STREAMING_FORMAT_START_OFFSET - schema = nil - continuation_size = CONTINUATION_BUFFER.size - # streaming format - loop do - continuation = @buffer.slice(offset, continuation_size) - unless continuation == CONTINUATION_BUFFER - raise ReadError.new(@buffer, "No valid continuation") - end - offset += continuation_size + def n_record_batches + @record_batches.size + end - metadata_size = @buffer.get_value(:u32, offset) - offset += METADATA_SIZE_SIZE - break if metadata_size.zero? 
+ def read(i) + block = @record_batches[i] - metadata_data = @buffer.slice(offset, metadata_size) - offset += metadata_size - metadata = Org::Apache::Arrow::Flatbuf::Message.new(metadata_data) + offset = block.offset - body = @buffer.slice(offset, metadata.body_length) - header = metadata.header - case header - when Org::Apache::Arrow::Flatbuf::Schema - schema = read_schema(header) - when Org::Apache::Arrow::Flatbuf::RecordBatch - n_rows = header.length - columns = [] - nodes = header.nodes - buffers = header.buffers - schema.fields.each do |field| - columns << read_column(field, nodes, buffers, body) - end - yield(RecordBatch.new(schema, n_rows, columns)) - end + # If we can report proper error information, we can use + # MessagePullReader here. + # + # message_pull_reader = MessagePullReader.new do |message, body| + # return read_record_batch(message.header, @schema, body) + # end + # chunk = @buffer.slice(offset, + # MessagePullReader::CONTINUATION_SIZE + + # MessagePullReader::METADATA_LENGTH_SIZE + + # block.meta_data_length + + # block.body_length) + # message_pull_reader.consume(chunk) - offset += metadata.body_length + continuation_size = CONTINUATION_BUFFER.size + continuation = @buffer.slice(offset, continuation_size) + unless continuation == CONTINUATION_BUFFER + raise FileReadError.new(@buffer, + "Invalid continuation: #{i}: " + + continuation.inspect) + end + offset += continuation_size + + metadata_length_type = MessagePullReader::METADATA_LENGTH_TYPE + metadata_length_size = MessagePullReader::METADATA_LENGTH_SIZE + metadata_length = @buffer.get_value(metadata_length_type, offset) + expected_metadata_length = + block.meta_data_length - + continuation_size - + metadata_length_size + unless metadata_length == expected_metadata_length + raise FileReadError.new(@buffer, + "Invalid metadata length #{i}: " + + "expected:#{expected_metadata_length} " + + "actual:#{metadata_length}") + end + offset += metadata_length_size + + metadata = @buffer.slice(offset, metadata_length) + fb_message = Org::Apache::Arrow::Flatbuf::Message.new(metadata) + fb_header = fb_message.header + unless fb_header.is_a?(Org::Apache::Arrow::Flatbuf::RecordBatch) + raise FileReadError.new(@buffer, + "Not a record batch message: #{i}: " + + fb_header.class.name) + end + offset += metadata_length + + body = @buffer.slice(offset, block.body_length) + read_record_batch(fb_header, @schema, body) + end + + def each + return to_enum(__method__) {n_record_batches} unless block_given?
+ + @record_batches.size.times do |i| + yield(read(i)) end end @@ -124,192 +127,28 @@ def validate FOOTER_SIZE_SIZE + END_MARKER_SIZE if @buffer.size < minimum_size - raise ReadError.new(@buffer, - "Input must be larger than or equal to " + - "#{minimum_size}: #{@buffer.size}") + raise FileReadError.new(@buffer, + "Input must be larger than or equal to " + + "#{minimum_size}: #{@buffer.size}") end start_marker = @buffer.slice(0, START_MARKER_SIZE) if start_marker != MAGIC_BUFFER - raise ReadError.new(@buffer, "No start marker") + raise FileReadError.new(@buffer, "No start marker") end - end_marker = @buffer.slice(@buffer.size - END_MARKER_SIZE, END_MARKER_SIZE) + end_marker = @buffer.slice(@buffer.size - END_MARKER_SIZE, + END_MARKER_SIZE) if end_marker != MAGIC_BUFFER - raise ReadError.new(@buffer, "No end marker") + raise FileReadError.new(@buffer, "No end marker") end end def read_footer footer_size_offset = @buffer.size - END_MARKER_SIZE - FOOTER_SIZE_SIZE - footer_size = @buffer.get_value(:u32, footer_size_offset) - footer_data = @buffer.slice(footer_size_offset - footer_size, footer_size) + footer_size = @buffer.get_value(FOOTER_SIZE_FORMAT, footer_size_offset) + footer_data = @buffer.slice(footer_size_offset - footer_size, + footer_size) Org::Apache::Arrow::Flatbuf::Footer.new(footer_data) end - - def read_field(fb_field) - fb_type = fb_field.type - case fb_type - when Org::Apache::Arrow::Flatbuf::Null - type = NullType.singleton - when Org::Apache::Arrow::Flatbuf::Bool - type = BooleanType.singleton - when Org::Apache::Arrow::Flatbuf::Int - case fb_type.bit_width - when 8 - if fb_type.signed? - type = Int8Type.singleton - else - type = UInt8Type.singleton - end - when 16 - if fb_type.signed? - type = Int16Type.singleton - else - type = UInt16Type.singleton - end - when 32 - if fb_type.signed? - type = Int32Type.singleton - else - type = UInt32Type.singleton - end - when 64 - if fb_type.signed? 
- type = Int64Type.singleton - else - type = UInt64Type.singleton - end - end - when Org::Apache::Arrow::Flatbuf::FloatingPoint - case fb_type.precision - when Org::Apache::Arrow::Flatbuf::Precision::SINGLE - type = Float32Type.singleton - when Org::Apache::Arrow::Flatbuf::Precision::DOUBLE - type = Float64Type.singleton - end - when Org::Apache::Arrow::Flatbuf::Date - case fb_type.unit - when Org::Apache::Arrow::Flatbuf::DateUnit::DAY - type = Date32Type.singleton - when Org::Apache::Arrow::Flatbuf::DateUnit::MILLISECOND - type = Date64Type.singleton - end - when Org::Apache::Arrow::Flatbuf::Time - case fb_type.bit_width - when 32 - case fb_type.unit - when Org::Apache::Arrow::Flatbuf::TimeUnit::SECOND - type = Time32Type.new(:second) - when Org::Apache::Arrow::Flatbuf::TimeUnit::MILLISECOND - type = Time32Type.new(:millisecond) - end - when 64 - case fb_type.unit - when Org::Apache::Arrow::Flatbuf::TimeUnit::MICROSECOND - type = Time64Type.new(:microsecond) - when Org::Apache::Arrow::Flatbuf::TimeUnit::NANOSECOND - type = Time64Type.new(:nanosecond) - end - end - when Org::Apache::Arrow::Flatbuf::Timestamp - unit = fb_type.unit.name.downcase.to_sym - type = TimestampType.new(unit, fb_type.timezone) - when Org::Apache::Arrow::Flatbuf::List - type = ListType.new(read_field(fb_field.children[0])) - when Org::Apache::Arrow::Flatbuf::LargeList - type = LargeListType.new(read_field(fb_field.children[0])) - when Org::Apache::Arrow::Flatbuf::Struct - children = fb_field.children.collect {|child| read_field(child)} - type = StructType.new(children) - when Org::Apache::Arrow::Flatbuf::Union - children = fb_field.children.collect {|child| read_field(child)} - type_ids = fb_type.type_ids - case fb_type.mode - when Org::Apache::Arrow::Flatbuf::UnionMode::DENSE - type = DenseUnionType.new(children, type_ids) - when Org::Apache::Arrow::Flatbuf::UnionMode::SPARSE - type = SparseUnionType.new(children, type_ids) - end - when Org::Apache::Arrow::Flatbuf::Map - type = MapType.new(read_field(fb_field.children[0])) - when Org::Apache::Arrow::Flatbuf::Binary - type = BinaryType.singleton - when Org::Apache::Arrow::Flatbuf::LargeBinary - type = LargeBinaryType.singleton - when Org::Apache::Arrow::Flatbuf::Utf8 - type = UTF8Type.singleton - when Org::Apache::Arrow::Flatbuf::LargeUtf8 - type = LargeUTF8Type.singleton - when Org::Apache::Arrow::Flatbuf::FixedSizeBinary - type = FixedSizeBinaryType.new(fb_type.byte_width) - end - Field.new(fb_field.name, type, fb_field.nullable?) - end - - def read_schema(fb_schema) - fields = fb_schema.fields.collect do |fb_field| - read_field(fb_field) - end - Schema.new(fields) - end - - def read_column(field, nodes, buffers, body) - node = nodes.shift - length = node.length - - return field.type.build_array(length) if field.type.is_a?(NullType) - - validity_buffer = buffers.shift - if validity_buffer.length.zero? 
- validity = nil - else - validity = body.slice(validity_buffer.offset, validity_buffer.length) - end - - case field.type - when BooleanType, - NumberType, - TemporalType - values_buffer = buffers.shift - values = body.slice(values_buffer.offset, values_buffer.length) - field.type.build_array(length, validity, values) - when VariableSizeBinaryType - offsets_buffer = buffers.shift - values_buffer = buffers.shift - offsets = body.slice(offsets_buffer.offset, offsets_buffer.length) - values = body.slice(values_buffer.offset, values_buffer.length) - field.type.build_array(length, validity, offsets, values) - when FixedSizeBinaryType - values_buffer = buffers.shift - values = body.slice(values_buffer.offset, values_buffer.length) - field.type.build_array(length, validity, values) - when VariableSizeListType - offsets_buffer = buffers.shift - offsets = body.slice(offsets_buffer.offset, offsets_buffer.length) - child = read_column(field.type.child, nodes, buffers, body) - field.type.build_array(length, validity, offsets, child) - when StructType - children = field.type.children.collect do |child| - read_column(child, nodes, buffers, body) - end - field.type.build_array(length, validity, children) - when DenseUnionType - # dense union type doesn't have validity. - types = validity - offsets_buffer = buffers.shift - offsets = body.slice(offsets_buffer.offset, offsets_buffer.length) - children = field.type.children.collect do |child| - read_column(child, nodes, buffers, body) - end - field.type.build_array(length, types, offsets, children) - when SparseUnionType - # sparse union type doesn't have validity. - types = validity - children = field.type.children.collect do |child| - read_column(child, nodes, buffers, body) - end - field.type.build_array(length, types, children) - end - end end end diff --git a/ruby/red-arrow-format/lib/arrow-format/readable.rb b/ruby/red-arrow-format/lib/arrow-format/readable.rb new file mode 100644 index 00000000000..2d64d5387ff --- /dev/null +++ b/ruby/red-arrow-format/lib/arrow-format/readable.rb @@ -0,0 +1,242 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
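# (Annotation, not part of the patch.) Background for the Readable module
# that follows: a RecordBatch message flattens its field nodes and buffers in
# depth-first (pre-order) schema order, which is why the read_column method
# defined below can simply shift one node and that layout's buffers off
# shared queues as it recurses into children. For a single list<int8> column
# the queues arrive roughly as:
#
#   nodes:   [list node, int8 child node]
#   buffers: [list validity, list offsets, child validity, child values]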
+ +require_relative "array" +require_relative "field" +require_relative "record-batch" +require_relative "schema" +require_relative "type" + +require_relative "org/apache/arrow/flatbuf/binary" +require_relative "org/apache/arrow/flatbuf/bool" +require_relative "org/apache/arrow/flatbuf/date" +require_relative "org/apache/arrow/flatbuf/date_unit" +require_relative "org/apache/arrow/flatbuf/duration" +require_relative "org/apache/arrow/flatbuf/fixed_size_binary" +require_relative "org/apache/arrow/flatbuf/floating_point" +require_relative "org/apache/arrow/flatbuf/int" +require_relative "org/apache/arrow/flatbuf/interval" +require_relative "org/apache/arrow/flatbuf/interval_unit" +require_relative "org/apache/arrow/flatbuf/large_binary" +require_relative "org/apache/arrow/flatbuf/large_list" +require_relative "org/apache/arrow/flatbuf/large_utf8" +require_relative "org/apache/arrow/flatbuf/list" +require_relative "org/apache/arrow/flatbuf/map" +require_relative "org/apache/arrow/flatbuf/message" +require_relative "org/apache/arrow/flatbuf/null" +require_relative "org/apache/arrow/flatbuf/precision" +require_relative "org/apache/arrow/flatbuf/schema" +require_relative "org/apache/arrow/flatbuf/struct_" +require_relative "org/apache/arrow/flatbuf/time" +require_relative "org/apache/arrow/flatbuf/timestamp" +require_relative "org/apache/arrow/flatbuf/time_unit" +require_relative "org/apache/arrow/flatbuf/union" +require_relative "org/apache/arrow/flatbuf/union_mode" +require_relative "org/apache/arrow/flatbuf/utf8" + +module ArrowFormat + module Readable + private + def read_schema(fb_schema) + fields = fb_schema.fields.collect do |fb_field| + read_field(fb_field) + end + Schema.new(fields) + end + + def read_field(fb_field) + fb_type = fb_field.type + case fb_type + when Org::Apache::Arrow::Flatbuf::Null + type = NullType.singleton + when Org::Apache::Arrow::Flatbuf::Bool + type = BooleanType.singleton + when Org::Apache::Arrow::Flatbuf::Int + case fb_type.bit_width + when 8 + if fb_type.signed? + type = Int8Type.singleton + else + type = UInt8Type.singleton + end + when 16 + if fb_type.signed? + type = Int16Type.singleton + else + type = UInt16Type.singleton + end + when 32 + if fb_type.signed? + type = Int32Type.singleton + else + type = UInt32Type.singleton + end + when 64 + if fb_type.signed? 
+ type = Int64Type.singleton + else + type = UInt64Type.singleton + end + end + when Org::Apache::Arrow::Flatbuf::FloatingPoint + case fb_type.precision + when Org::Apache::Arrow::Flatbuf::Precision::SINGLE + type = Float32Type.singleton + when Org::Apache::Arrow::Flatbuf::Precision::DOUBLE + type = Float64Type.singleton + end + when Org::Apache::Arrow::Flatbuf::Date + case fb_type.unit + when Org::Apache::Arrow::Flatbuf::DateUnit::DAY + type = Date32Type.singleton + when Org::Apache::Arrow::Flatbuf::DateUnit::MILLISECOND + type = Date64Type.singleton + end + when Org::Apache::Arrow::Flatbuf::Time + case fb_type.bit_width + when 32 + case fb_type.unit + when Org::Apache::Arrow::Flatbuf::TimeUnit::SECOND + type = Time32Type.new(:second) + when Org::Apache::Arrow::Flatbuf::TimeUnit::MILLISECOND + type = Time32Type.new(:millisecond) + end + when 64 + case fb_type.unit + when Org::Apache::Arrow::Flatbuf::TimeUnit::MICROSECOND + type = Time64Type.new(:microsecond) + when Org::Apache::Arrow::Flatbuf::TimeUnit::NANOSECOND + type = Time64Type.new(:nanosecond) + end + end + when Org::Apache::Arrow::Flatbuf::Timestamp + unit = fb_type.unit.name.downcase.to_sym + type = TimestampType.new(unit, fb_type.timezone) + when Org::Apache::Arrow::Flatbuf::Interval + case fb_type.unit + when Org::Apache::Arrow::Flatbuf::IntervalUnit::YEAR_MONTH + type = YearMonthIntervalType.new + when Org::Apache::Arrow::Flatbuf::IntervalUnit::DAY_TIME + type = DayTimeIntervalType.new + when Org::Apache::Arrow::Flatbuf::IntervalUnit::MONTH_DAY_NANO + type = MonthDayNanoIntervalType.new + end + when Org::Apache::Arrow::Flatbuf::Duration + unit = fb_type.unit.name.downcase.to_sym + type = DurationType.new(unit) + when Org::Apache::Arrow::Flatbuf::List + type = ListType.new(read_field(fb_field.children[0])) + when Org::Apache::Arrow::Flatbuf::LargeList + type = LargeListType.new(read_field(fb_field.children[0])) + when Org::Apache::Arrow::Flatbuf::Struct + children = fb_field.children.collect {|child| read_field(child)} + type = StructType.new(children) + when Org::Apache::Arrow::Flatbuf::Union + children = fb_field.children.collect {|child| read_field(child)} + type_ids = fb_type.type_ids + case fb_type.mode + when Org::Apache::Arrow::Flatbuf::UnionMode::DENSE + type = DenseUnionType.new(children, type_ids) + when Org::Apache::Arrow::Flatbuf::UnionMode::SPARSE + type = SparseUnionType.new(children, type_ids) + end + when Org::Apache::Arrow::Flatbuf::Map + type = MapType.new(read_field(fb_field.children[0])) + when Org::Apache::Arrow::Flatbuf::Binary + type = BinaryType.singleton + when Org::Apache::Arrow::Flatbuf::LargeBinary + type = LargeBinaryType.singleton + when Org::Apache::Arrow::Flatbuf::Utf8 + type = UTF8Type.singleton + when Org::Apache::Arrow::Flatbuf::LargeUtf8 + type = LargeUTF8Type.singleton + when Org::Apache::Arrow::Flatbuf::FixedSizeBinary + type = FixedSizeBinaryType.new(fb_type.byte_width) + end + Field.new(fb_field.name, type, fb_field.nullable?) + end + + def read_record_batch(fb_record_batch, schema, body) + n_rows = fb_record_batch.length + nodes = fb_record_batch.nodes + buffers = fb_record_batch.buffers + columns = @schema.fields.collect do |field| + read_column(field, nodes, buffers, body) + end + RecordBatch.new(schema, n_rows, columns) + end + + def read_column(field, nodes, buffers, body) + node = nodes.shift + length = node.length + + return field.type.build_array(length) if field.type.is_a?(NullType) + + validity_buffer = buffers.shift + if validity_buffer.length.zero? 
+ validity = nil + else + validity = body.slice(validity_buffer.offset, validity_buffer.length) + end + + case field.type + when BooleanType, + NumberType, + TemporalType + values_buffer = buffers.shift + values = body.slice(values_buffer.offset, values_buffer.length) + field.type.build_array(length, validity, values) + when VariableSizeBinaryType + offsets_buffer = buffers.shift + values_buffer = buffers.shift + offsets = body.slice(offsets_buffer.offset, offsets_buffer.length) + values = body.slice(values_buffer.offset, values_buffer.length) + field.type.build_array(length, validity, offsets, values) + when FixedSizeBinaryType + values_buffer = buffers.shift + values = body.slice(values_buffer.offset, values_buffer.length) + field.type.build_array(length, validity, values) + when VariableSizeListType + offsets_buffer = buffers.shift + offsets = body.slice(offsets_buffer.offset, offsets_buffer.length) + child = read_column(field.type.child, nodes, buffers, body) + field.type.build_array(length, validity, offsets, child) + when StructType + children = field.type.children.collect do |child| + read_column(child, nodes, buffers, body) + end + field.type.build_array(length, validity, children) + when DenseUnionType + # dense union type doesn't have validity. + types = validity + offsets_buffer = buffers.shift + offsets = body.slice(offsets_buffer.offset, offsets_buffer.length) + children = field.type.children.collect do |child| + read_column(child, nodes, buffers, body) + end + field.type.build_array(length, types, offsets, children) + when SparseUnionType + # sparse union type doesn't have validity. + types = validity + children = field.type.children.collect do |child| + read_column(child, nodes, buffers, body) + end + field.type.build_array(length, types, children) + end + end + end +end diff --git a/ruby/red-arrow-format/lib/arrow-format/streaming-pull-reader.rb b/ruby/red-arrow-format/lib/arrow-format/streaming-pull-reader.rb new file mode 100644 index 00000000000..ae231fccbc6 --- /dev/null +++ b/ruby/red-arrow-format/lib/arrow-format/streaming-pull-reader.rb @@ -0,0 +1,200 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
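# (Annotation, not part of the patch.) MessagePullReader below consumes the
# Arrow IPC encapsulated message framing: a 0xFFFFFFFF continuation marker,
# an s32 metadata length (zero means end of stream), the Flatbuffers message
# metadata, then body_length bytes of body. A minimal driving loop, mirroring
# StreamingReader#each defined later in this patch ("input" is any IO-like
# object and illustrative only):
#
#   reader = ArrowFormat::MessagePullReader.new do |message, body|
#     p [message.header.class, body&.size]
#   end
#   until reader.eos?
#     chunk = input.read(reader.next_required_size)
#     break if chunk.nil?
#     reader.consume(IO::Buffer.for(chunk))
#   end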
+ +require_relative "array" +require_relative "error" +require_relative "field" +require_relative "readable" +require_relative "record-batch" +require_relative "schema" +require_relative "type" + +module ArrowFormat + class MessagePullReader + CONTINUATION_TYPE = :s32 + CONTINUATION_SIZE = IO::Buffer.size_of(CONTINUATION_TYPE) + CONTINUATION_STRING = "\xFF\xFF\xFF\xFF".b.freeze + CONTINUATION_INT32 = -1 + METADATA_LENGTH_TYPE = :s32 + METADATA_LENGTH_SIZE = IO::Buffer.size_of(METADATA_LENGTH_TYPE) + + def initialize(&on_read) + @on_read = on_read + @buffer = IO::Buffer.new(0) + @metadata_length = nil + @body_length = nil + @state = :initial + end + + def next_required_size + case @state + when :initial + CONTINUATION_SIZE + when :metadata_length + METADATA_LENGTH_SIZE + when :metadata + @metadata_length + when :body + @body_length + when :eos + 0 + end + end + + def eos? + @state == :eos + end + + def consume(chunk) + return if eos? + + if @buffer.size.zero? + target = chunk + else + # Append the new chunk after any unconsumed bytes from the + # previous call; copying at offset 0 would overwrite them. + offset = @buffer.size + @buffer.resize(offset + chunk.size) + @buffer.copy(chunk, offset) + target = @buffer + end + + loop do + next_size = next_required_size + break if next_size.zero? + + if target.size < next_size + @buffer.resize(target.size) if @buffer.size < target.size + @buffer.copy(target) + @buffer.resize(target.size) + return + end + + case @state + when :initial + consume_initial(target) + when :metadata_length + consume_metadata_length(target) + when :metadata + consume_metadata(target) + when :body + consume_body(target) + end + if target.size == next_size + # All buffered bytes are consumed; drop them so that the + # next call starts from an empty buffer. + @buffer.resize(0) if target.equal?(@buffer) + break + end + + target = target.slice(next_size) + end + end + + private + def consume_initial(target) + continuation = target.get_value(CONTINUATION_TYPE, 0) + unless continuation == CONTINUATION_INT32 + raise ReadError.new("Invalid continuation token: " + + continuation.inspect) + end + @state = :metadata_length + end + + def consume_metadata_length(target) + length = target.get_value(METADATA_LENGTH_TYPE, 0) + if length < 0 + raise ReadError.new("Negative metadata length: " + + length.inspect) + end + if length == 0 + @state = :eos + else + @metadata_length = length + @state = :metadata + end + end + + def consume_metadata(target) + metadata_buffer = target.slice(0, @metadata_length) + @message = Org::Apache::Arrow::Flatbuf::Message.new(metadata_buffer) + @body_length = @message.body_length + if @body_length < 0 + raise ReadError.new("Negative body length: " + + @body_length.inspect) + end + @state = :body + consume_body if @body_length.zero? + end + + def consume_body(target=nil) + body = target&.slice(0, @body_length) + @on_read.call(@message, body) + @state = :initial + end + end + + class StreamingPullReader + include Readable + + attr_reader :schema + def initialize(&on_read) + @on_read = on_read + @message_pull_reader = MessagePullReader.new do |message, body| + process_message(message, body) + end + @state = :schema + @schema = nil + end + + def next_required_size + @message_pull_reader.next_required_size + end + + def eos? + @message_pull_reader.eos?
+ end + + def consume(chunk) + @message_pull_reader.consume(chunk) + end + + private + def process_message(message, body) + case @state + when :schema + process_schema_message(message, body) + when :record_batch + process_record_batch_message(message, body) + end + end + + def process_schema_message(message, body) + header = message.header + unless header.is_a?(Org::Apache::Arrow::Flatbuf::Schema) + raise ReadError.new("Not a schema message: " + + header.inspect) + end + + @schema = read_schema(header) + # TODO: initial dictionaries support + @state = :record_batch + end + + def process_record_batch_message(message, body) + header = message.header + unless header.is_a?(Org::Apache::Arrow::Flatbuf::RecordBatch) + raise ReadError.new("Not a record batch message: " + + header.inspect) + end + + @on_read.call(read_record_batch(header, @schema, body)) + end + end +end diff --git a/ruby/red-arrow-format/lib/arrow-format/streaming-reader.rb b/ruby/red-arrow-format/lib/arrow-format/streaming-reader.rb new file mode 100644 index 00000000000..f11972c67a2 --- /dev/null +++ b/ruby/red-arrow-format/lib/arrow-format/streaming-reader.rb @@ -0,0 +1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +require_relative "streaming-pull-reader" + +module ArrowFormat + class StreamingReader + include Enumerable + + attr_reader :schema + def initialize(input) + @input = input + @schema = nil + end + + def each + return to_enum(__method__) unless block_given? + + reader = StreamingPullReader.new do |record_batch| + @schema ||= reader.schema + yield(record_batch) + end + + buffer = "".b + loop do + next_size = reader.next_required_size + break if next_size.zero? + + next_chunk = @input.read(next_size, buffer) + break if next_chunk.nil? 
+ + reader.consume(IO::Buffer.for(next_chunk)) + end + end + end +end diff --git a/ruby/red-arrow-format/lib/arrow-format/type.rb b/ruby/red-arrow-format/lib/arrow-format/type.rb index 7627bf53ff3..c6679660122 100644 --- a/ruby/red-arrow-format/lib/arrow-format/type.rb +++ b/ruby/red-arrow-format/lib/arrow-format/type.rb @@ -315,6 +315,54 @@ def build_array(size, validity_buffer, values_buffer) end end + class IntervalType < TemporalType + end + + class YearMonthIntervalType < IntervalType + def initialize + super("YearMonthInterval") + end + + def build_array(size, validity_buffer, values_buffer) + YearMonthIntervalArray.new(self, size, validity_buffer, values_buffer) + end + end + + class DayTimeIntervalType < IntervalType + def initialize + super("DayTimeInterval") + end + + def build_array(size, validity_buffer, values_buffer) + DayTimeIntervalArray.new(self, size, validity_buffer, values_buffer) + end + end + + class MonthDayNanoIntervalType < IntervalType + def initialize + super("MonthDayNanoInterval") + end + + def build_array(size, validity_buffer, values_buffer) + MonthDayNanoIntervalArray.new(self, + size, + validity_buffer, + values_buffer) + end + end + + class DurationType < TemporalType + attr_reader :unit + def initialize(unit) + super("Duration") + @unit = unit + end + + def build_array(size, validity_buffer, values_buffer) + DurationArray.new(self, size, validity_buffer, values_buffer) + end + end + class VariableSizeBinaryType < Type end diff --git a/ruby/red-arrow-format/test/test-file-reader.rb b/ruby/red-arrow-format/test/test-file-reader.rb deleted file mode 100644 index 309720b989f..00000000000 --- a/ruby/red-arrow-format/test/test-file-reader.rb +++ /dev/null @@ -1,665 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -class TestFileReader < Test::Unit::TestCase - def setup - Dir.mktmpdir do |tmp_dir| - table = Arrow::Table.new(value: build_array) - @path = File.join(tmp_dir, "data.arrow") - table.save(@path) - File.open(@path, "rb") do |input| - @reader = ArrowFormat::FileReader.new(input) - yield - @reader = nil - end - GC.start - end - end - - def read - @reader.to_a.collect do |record_batch| - record_batch.to_h.tap do |hash| - hash.each do |key, value| - hash[key] = value.to_a - end - end - end - end - - def type - @type ||= @reader.first.schema.fields[0].type - end - - sub_test_case("Null") do - def build_array - Arrow::NullArray.new(3) - end - - def test_read - assert_equal([{"value" => [nil, nil, nil]}], - read) - end - end - - sub_test_case("Boolean") do - def build_array - Arrow::BooleanArray.new([true, nil, false]) - end - - def test_read - assert_equal([{"value" => [true, nil, false]}], - read) - end - end - - sub_test_case("Int8") do - def build_array - Arrow::Int8Array.new([-128, nil, 127]) - end - - def test_read - assert_equal([{"value" => [-128, nil, 127]}], - read) - end - end - - sub_test_case("UInt8") do - def build_array - Arrow::UInt8Array.new([0, nil, 255]) - end - - def test_read - assert_equal([{"value" => [0, nil, 255]}], - read) - end - end - - sub_test_case("Int16") do - def build_array - Arrow::Int16Array.new([-32768, nil, 32767]) - end - - def test_read - assert_equal([{"value" => [-32768, nil, 32767]}], - read) - end - end - - sub_test_case("UInt16") do - def build_array - Arrow::UInt16Array.new([0, nil, 65535]) - end - - def test_read - assert_equal([{"value" => [0, nil, 65535]}], - read) - end - end - - sub_test_case("Int32") do - def build_array - Arrow::Int32Array.new([-2147483648, nil, 2147483647]) - end - - def test_read - assert_equal([{"value" => [-2147483648, nil, 2147483647]}], - read) - end - end - - sub_test_case("UInt32") do - def build_array - Arrow::UInt32Array.new([0, nil, 4294967295]) - end - - def test_read - assert_equal([{"value" => [0, nil, 4294967295]}], - read) - end - end - - sub_test_case("Int64") do - def build_array - Arrow::Int64Array.new([ - -9223372036854775808, - nil, - 9223372036854775807 - ]) - end - - def test_read - assert_equal([ - { - "value" => [ - -9223372036854775808, - nil, - 9223372036854775807 - ] - } - ], - read) - end - end - - sub_test_case("UInt64") do - def build_array - Arrow::UInt64Array.new([0, nil, 18446744073709551615]) - end - - def test_read - assert_equal([{"value" => [0, nil, 18446744073709551615]}], - read) - end - end - - sub_test_case("Float32") do - def build_array - Arrow::FloatArray.new([-0.5, nil, 0.5]) - end - - def test_read - assert_equal([{"value" => [-0.5, nil, 0.5]}], - read) - end - end - - sub_test_case("Float64") do - def build_array - Arrow::DoubleArray.new([-0.5, nil, 0.5]) - end - - def test_read - assert_equal([{"value" => [-0.5, nil, 0.5]}], - read) - end - end - - sub_test_case("Date32") do - def setup(&block) - @date_2017_08_28 = 17406 - @date_2025_12_09 = 20431 - super(&block) - end - - def build_array - Arrow::Date32Array.new([@date_2017_08_28, nil, @date_2025_12_09]) - end - - def test_read - assert_equal([{"value" => [@date_2017_08_28, nil, @date_2025_12_09]}], - read) - end - end - - sub_test_case("Date64") do - def setup(&block) - @date_2017_08_28_00_00_00 = 1503878400000 - @date_2025_12_09_00_00_00 = 1765324800000 - super(&block) - end - - def build_array - Arrow::Date64Array.new([ - @date_2017_08_28_00_00_00, - nil, - @date_2025_12_09_00_00_00, - ]) - end - - def test_read - 
assert_equal([ - { - "value" => [ - @date_2017_08_28_00_00_00, - nil, - @date_2025_12_09_00_00_00, - ], - }, - ], - read) - end - end - - sub_test_case("Time32(:second)") do - def setup(&block) - @time_00_00_10 = 10 - @time_00_01_10 = 60 + 10 - super(&block) - end - - def build_array - Arrow::Time32Array.new(:second, [@time_00_00_10, nil, @time_00_01_10]) - end - - def test_read - assert_equal([{"value" => [@time_00_00_10, nil, @time_00_01_10]}], - read) - end - - def test_type - assert_equal(:second, type.unit) - end - end - - sub_test_case("Time32(:millisecond)") do - def setup(&block) - @time_00_00_10_000 = 10 * 1000 - @time_00_01_10_000 = (60 + 10) * 1000 - super(&block) - end - - def build_array - Arrow::Time32Array.new(:milli, - [@time_00_00_10_000, nil, @time_00_01_10_000]) - end - - def test_read - assert_equal([{"value" => [@time_00_00_10_000, nil, @time_00_01_10_000]}], - read) - end - - def test_type - assert_equal(:millisecond, type.unit) - end - end - - sub_test_case("Time64(:microsecond)") do - def setup(&block) - @time_00_00_10_000_000 = 10 * 1_000_000 - @time_00_01_10_000_000 = (60 + 10) * 1_000_000 - super(&block) - end - - def build_array - Arrow::Time64Array.new(:micro, - [ - @time_00_00_10_000_000, - nil, - @time_00_01_10_000_000, - ]) - end - - def test_read - assert_equal([ - { - "value" => [ - @time_00_00_10_000_000, - nil, - @time_00_01_10_000_000, - ], - }, - ], - read) - end - - def test_type - assert_equal(:microsecond, type.unit) - end - end - - sub_test_case("Time64(:nanosecond)") do - def setup(&block) - @time_00_00_10_000_000_000 = 10 * 1_000_000_000 - @time_00_01_10_000_000_000 = (60 + 10) * 1_000_000_000 - super(&block) - end - - def build_array - Arrow::Time64Array.new(:nano, - [ - @time_00_00_10_000_000_000, - nil, - @time_00_01_10_000_000_000, - ]) - end - - def test_read - assert_equal([ - { - "value" => [ - @time_00_00_10_000_000_000, - nil, - @time_00_01_10_000_000_000, - ], - }, - ], - read) - end - - def test_type - assert_equal(:nanosecond, type.unit) - end - end - - sub_test_case("Timestamp(:second)") do - def setup(&block) - @timestamp_2019_11_18_00_09_11 = 1574003351 - @timestamp_2025_12_16_05_33_58 = 1765863238 - super(&block) - end - - def build_array - Arrow::TimestampArray.new(:second, - [ - @timestamp_2019_11_18_00_09_11, - nil, - @timestamp_2025_12_16_05_33_58, - ]) - end - - def test_read - assert_equal([ - { - "value" => [ - @timestamp_2019_11_18_00_09_11, - nil, - @timestamp_2025_12_16_05_33_58, - ], - }, - ], - read) - end - end - - sub_test_case("Timestamp(:millisecond)") do - def setup(&block) - @timestamp_2019_11_18_00_09_11 = 1574003351 * 1_000 - @timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000 - super(&block) - end - - def build_array - Arrow::TimestampArray.new(:milli, - [ - @timestamp_2019_11_18_00_09_11, - nil, - @timestamp_2025_12_16_05_33_58, - ]) - end - - def test_read - assert_equal([ - { - "value" => [ - @timestamp_2019_11_18_00_09_11, - nil, - @timestamp_2025_12_16_05_33_58, - ], - }, - ], - read) - end - end - - sub_test_case("Timestamp(:microsecond)") do - def setup(&block) - @timestamp_2019_11_18_00_09_11 = 1574003351 * 1_000_000 - @timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000_000 - super(&block) - end - - def build_array - Arrow::TimestampArray.new(:micro, - [ - @timestamp_2019_11_18_00_09_11, - nil, - @timestamp_2025_12_16_05_33_58, - ]) - end - - def test_read - assert_equal([ - { - "value" => [ - @timestamp_2019_11_18_00_09_11, - nil, - @timestamp_2025_12_16_05_33_58, - ], - }, - ], - read) - end - end - 
- sub_test_case("Timestamp(:nanosecond)") do - def setup(&block) - @timestamp_2019_11_18_00_09_11 = 1574003351 * 1_000_000_000 - @timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000_000_000 - super(&block) - end - - def build_array - Arrow::TimestampArray.new(:nano, - [ - @timestamp_2019_11_18_00_09_11, - nil, - @timestamp_2025_12_16_05_33_58, - ]) - end - - def test_read - assert_equal([ - { - "value" => [ - @timestamp_2019_11_18_00_09_11, - nil, - @timestamp_2025_12_16_05_33_58, - ], - }, - ], - read) - end - end - - sub_test_case("Timestamp(timezone)") do - def setup(&block) - @timezone = "UTC" - @timestamp_2019_11_18_00_09_11 = 1574003351 - @timestamp_2025_12_16_05_33_58 = 1765863238 - super(&block) - end - - def build_array - data_type = Arrow::TimestampDataType.new(:second, @timezone) - Arrow::TimestampArray.new(data_type, - [ - @timestamp_2019_11_18_00_09_11, - nil, - @timestamp_2025_12_16_05_33_58, - ]) - end - - def test_type - assert_equal([:second, @timezone], - [type.unit, type.timezone]) - end - end - - sub_test_case("Binary") do - def build_array - Arrow::BinaryArray.new(["Hello".b, nil, "World".b]) - end - - def test_read - assert_equal([{"value" => ["Hello".b, nil, "World".b]}], - read) - end - end - - sub_test_case("LargeBinary") do - def build_array - Arrow::LargeBinaryArray.new(["Hello".b, nil, "World".b]) - end - - def test_read - assert_equal([{"value" => ["Hello".b, nil, "World".b]}], - read) - end - end - - sub_test_case("UTF8") do - def build_array - Arrow::StringArray.new(["Hello", nil, "World"]) - end - - def test_read - assert_equal([{"value" => ["Hello", nil, "World"]}], - read) - end - end - - sub_test_case("LargeUTF8") do - def build_array - Arrow::LargeStringArray.new(["Hello", nil, "World"]) - end - - def test_read - assert_equal([{"value" => ["Hello", nil, "World"]}], - read) - end - end - - sub_test_case("FixedSizeBinary") do - def build_array - data_type = Arrow::FixedSizeBinaryDataType.new(4) - Arrow::FixedSizeBinaryArray.new(data_type, ["0124".b, nil, "abcd".b]) - end - - def test_read - assert_equal([{"value" => ["0124".b, nil, "abcd".b]}], - read) - end - end - - sub_test_case("List") do - def build_array - data_type = Arrow::ListDataType.new(name: "count", type: :int8) - Arrow::ListArray.new(data_type, [[-128, 127], nil, [-1, 0, 1]]) - end - - def test_read - assert_equal([{"value" => [[-128, 127], nil, [-1, 0, 1]]}], - read) - end - end - - sub_test_case("LargeList") do - def build_array - data_type = Arrow::LargeListDataType.new(name: "count", type: :int8) - Arrow::LargeListArray.new(data_type, [[-128, 127], nil, [-1, 0, 1]]) - end - - def test_read - assert_equal([{"value" => [[-128, 127], nil, [-1, 0, 1]]}], - read) - end - end - - sub_test_case("Struct") do - def build_array - data_type = Arrow::StructDataType.new(count: :int8, - visible: :boolean) - Arrow::StructArray.new(data_type, [[-128, nil], nil, [nil, true]]) - end - - def test_read - assert_equal([ - { - "value" => [ - [-128, nil], - nil, - [nil, true], - ], - }, - ], - read) - end - end - - sub_test_case("DenseUnion") do - def build_array - fields = [ - Arrow::Field.new("number", :int8), - Arrow::Field.new("text", :string), - ] - type_ids = [11, 13] - data_type = Arrow::DenseUnionDataType.new(fields, type_ids) - types = Arrow::Int8Array.new([11, 13, 11, 13, 13]) - value_offsets = Arrow::Int32Array.new([0, 0, 1, 1, 2]) - children = [ - Arrow::Int8Array.new([1, nil]), - Arrow::StringArray.new(["a", "b", "c"]) - ] - Arrow::DenseUnionArray.new(data_type, - types, - value_offsets, - children) 
- end - - def test_read - assert_equal([{"value" => [1, "a", nil, "b", "c"]}], - read) - end - end - - sub_test_case("SparseUnion") do - def build_array - fields = [ - Arrow::Field.new("number", :int8), - Arrow::Field.new("text", :string), - ] - type_ids = [11, 13] - data_type = Arrow::SparseUnionDataType.new(fields, type_ids) - types = Arrow::Int8Array.new([11, 13, 11, 13, 11]) - children = [ - Arrow::Int8Array.new([1, nil, nil, nil, 5]), - Arrow::StringArray.new([nil, "b", nil, "d", nil]) - ] - Arrow::SparseUnionArray.new(data_type, types, children) - end - - def test_read - assert_equal([{"value" => [1, "b", nil, "d", 5]}], - read) - end - end - - sub_test_case("Map") do - def build_array - data_type = Arrow::MapDataType.new(:string, :int8) - Arrow::MapArray.new(data_type, - [ - {"a" => -128, "b" => 127}, - nil, - {"c" => nil}, - ]) - end - - def test_read - assert_equal([ - { - "value" => [ - {"a" => -128, "b" => 127}, - nil, - {"c" => nil}, - ], - }, - ], - read) - end - end -end diff --git a/ruby/red-arrow-format/test/test-reader.rb b/ruby/red-arrow-format/test/test-reader.rb new file mode 100644 index 00000000000..8095adfd50f --- /dev/null +++ b/ruby/red-arrow-format/test/test-reader.rb @@ -0,0 +1,872 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
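# (Annotation, not part of the patch.) The ReaderTests module below packages
# the former FileReader-only cases as shared sub_test_cases via an included
# hook; a test class only has to mix it in and provide the setup/read
# helpers. A hypothetical consumer (class name illustrative):
#
#   class TestFileReader < Test::Unit::TestCase
#     include ReaderTests
#     # setup is expected to build @reader from build_array, and read to
#     # collect the record batches, as the deleted test-file-reader.rb did.
#   end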
+ +module ReaderTests + class << self + def included(base) + base.class_eval do + sub_test_case("Null") do + def build_array + Arrow::NullArray.new(3) + end + + def test_read + assert_equal([{"value" => [nil, nil, nil]}], + read) + end + end + + sub_test_case("Boolean") do + def build_array + Arrow::BooleanArray.new([true, nil, false]) + end + + def test_read + assert_equal([{"value" => [true, nil, false]}], + read) + end + end + + sub_test_case("Int8") do + def build_array + Arrow::Int8Array.new([-128, nil, 127]) + end + + def test_read + assert_equal([{"value" => [-128, nil, 127]}], + read) + end + end + + sub_test_case("UInt8") do + def build_array + Arrow::UInt8Array.new([0, nil, 255]) + end + + def test_read + assert_equal([{"value" => [0, nil, 255]}], + read) + end + end + + sub_test_case("Int16") do + def build_array + Arrow::Int16Array.new([-32768, nil, 32767]) + end + + def test_read + assert_equal([{"value" => [-32768, nil, 32767]}], + read) + end + end + + sub_test_case("UInt16") do + def build_array + Arrow::UInt16Array.new([0, nil, 65535]) + end + + def test_read + assert_equal([{"value" => [0, nil, 65535]}], + read) + end + end + + sub_test_case("Int32") do + def build_array + Arrow::Int32Array.new([-2147483648, nil, 2147483647]) + end + + def test_read + assert_equal([{"value" => [-2147483648, nil, 2147483647]}], + read) + end + end + + sub_test_case("UInt32") do + def build_array + Arrow::UInt32Array.new([0, nil, 4294967295]) + end + + def test_read + assert_equal([{"value" => [0, nil, 4294967295]}], + read) + end + end + + sub_test_case("Int64") do + def build_array + Arrow::Int64Array.new([ + -9223372036854775808, + nil, + 9223372036854775807 + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + -9223372036854775808, + nil, + 9223372036854775807 + ] + } + ], + read) + end + end + + sub_test_case("UInt64") do + def build_array + Arrow::UInt64Array.new([0, nil, 18446744073709551615]) + end + + def test_read + assert_equal([{"value" => [0, nil, 18446744073709551615]}], + read) + end + end + + sub_test_case("Float32") do + def build_array + Arrow::FloatArray.new([-0.5, nil, 0.5]) + end + + def test_read + assert_equal([{"value" => [-0.5, nil, 0.5]}], + read) + end + end + + sub_test_case("Float64") do + def build_array + Arrow::DoubleArray.new([-0.5, nil, 0.5]) + end + + def test_read + assert_equal([{"value" => [-0.5, nil, 0.5]}], + read) + end + end + + sub_test_case("Date32") do + def setup(&block) + @date_2017_08_28 = 17406 + @date_2025_12_09 = 20431 + super(&block) + end + + def build_array + Arrow::Date32Array.new([@date_2017_08_28, nil, @date_2025_12_09]) + end + + def test_read + assert_equal([ + { + "value" => [ + @date_2017_08_28, + nil, + @date_2025_12_09, + ], + }, + ], + read) + end + end + + sub_test_case("Date64") do + def setup(&block) + @date_2017_08_28_00_00_00 = 1503878400000 + @date_2025_12_09_00_00_00 = 1765324800000 + super(&block) + end + + def build_array + Arrow::Date64Array.new([ + @date_2017_08_28_00_00_00, + nil, + @date_2025_12_09_00_00_00, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + @date_2017_08_28_00_00_00, + nil, + @date_2025_12_09_00_00_00, + ], + }, + ], + read) + end + end + + sub_test_case("Time32(:second)") do + def setup(&block) + @time_00_00_10 = 10 + @time_00_01_10 = 60 + 10 + super(&block) + end + + def build_array + Arrow::Time32Array.new(:second, [@time_00_00_10, nil, @time_00_01_10]) + end + + def test_read + assert_equal([ + { + "value" => [ + @time_00_00_10, + nil, + @time_00_01_10, + ], + }, + 
], + read) + end + + def test_type + assert_equal(:second, type.unit) + end + end + + sub_test_case("Time32(:millisecond)") do + def setup(&block) + @time_00_00_10_000 = 10 * 1000 + @time_00_01_10_000 = (60 + 10) * 1000 + super(&block) + end + + def build_array + Arrow::Time32Array.new(:milli, + [ + @time_00_00_10_000, + nil, + @time_00_01_10_000, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + @time_00_00_10_000, + nil, + @time_00_01_10_000, + ], + }, + ], + read) + end + + def test_type + assert_equal(:millisecond, type.unit) + end + end + + sub_test_case("Time64(:microsecond)") do + def setup(&block) + @time_00_00_10_000_000 = 10 * 1_000_000 + @time_00_01_10_000_000 = (60 + 10) * 1_000_000 + super(&block) + end + + def build_array + Arrow::Time64Array.new(:micro, + [ + @time_00_00_10_000_000, + nil, + @time_00_01_10_000_000, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + @time_00_00_10_000_000, + nil, + @time_00_01_10_000_000, + ], + }, + ], + read) + end + + def test_type + assert_equal(:microsecond, type.unit) + end + end + + sub_test_case("Time64(:nanosecond)") do + def setup(&block) + @time_00_00_10_000_000_000 = 10 * 1_000_000_000 + @time_00_01_10_000_000_000 = (60 + 10) * 1_000_000_000 + super(&block) + end + + def build_array + Arrow::Time64Array.new(:nano, + [ + @time_00_00_10_000_000_000, + nil, + @time_00_01_10_000_000_000, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + @time_00_00_10_000_000_000, + nil, + @time_00_01_10_000_000_000, + ], + }, + ], + read) + end + + def test_type + assert_equal(:nanosecond, type.unit) + end + end + + sub_test_case("Timestamp(:second)") do + def setup(&block) + @timestamp_2019_11_18_00_09_11 = 1574003351 + @timestamp_2025_12_16_05_33_58 = 1765863238 + super(&block) + end + + def build_array + Arrow::TimestampArray.new(:second, + [ + @timestamp_2019_11_18_00_09_11, + nil, + @timestamp_2025_12_16_05_33_58, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + @timestamp_2019_11_18_00_09_11, + nil, + @timestamp_2025_12_16_05_33_58, + ], + }, + ], + read) + end + end + + sub_test_case("Timestamp(:millisecond)") do + def setup(&block) + @timestamp_2019_11_18_00_09_11 = 1574003351 * 1_000 + @timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000 + super(&block) + end + + def build_array + Arrow::TimestampArray.new(:milli, + [ + @timestamp_2019_11_18_00_09_11, + nil, + @timestamp_2025_12_16_05_33_58, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + @timestamp_2019_11_18_00_09_11, + nil, + @timestamp_2025_12_16_05_33_58, + ], + }, + ], + read) + end + end + + sub_test_case("Timestamp(:microsecond)") do + def setup(&block) + @timestamp_2019_11_18_00_09_11 = 1574003351 * 1_000_000 + @timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000_000 + super(&block) + end + + def build_array + Arrow::TimestampArray.new(:micro, + [ + @timestamp_2019_11_18_00_09_11, + nil, + @timestamp_2025_12_16_05_33_58, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + @timestamp_2019_11_18_00_09_11, + nil, + @timestamp_2025_12_16_05_33_58, + ], + }, + ], + read) + end + end + + sub_test_case("Timestamp(:nanosecond)") do + def setup(&block) + @timestamp_2019_11_18_00_09_11 = 1574003351 * 1_000_000_000 + @timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000_000_000 + super(&block) + end + + def build_array + Arrow::TimestampArray.new(:nano, + [ + @timestamp_2019_11_18_00_09_11, + nil, + @timestamp_2025_12_16_05_33_58, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + 
@timestamp_2019_11_18_00_09_11, + nil, + @timestamp_2025_12_16_05_33_58, + ], + }, + ], + read) + end + end + + sub_test_case("Timestamp(timezone)") do + def setup(&block) + @timezone = "UTC" + @timestamp_2019_11_18_00_09_11 = 1574003351 + @timestamp_2025_12_16_05_33_58 = 1765863238 + super(&block) + end + + def build_array + data_type = Arrow::TimestampDataType.new(:second, @timezone) + Arrow::TimestampArray.new(data_type, + [ + @timestamp_2019_11_18_00_09_11, + nil, + @timestamp_2025_12_16_05_33_58, + ]) + end + + def test_type + assert_equal([:second, @timezone], + [type.unit, type.timezone]) + end + end + + sub_test_case("YearMonthInterval") do + def build_array + Arrow::MonthIntervalArray.new([0, nil, 100]) + end + + def test_read + assert_equal([{"value" => [0, nil, 100]}], + read) + end + end + + sub_test_case("DayTimeInterval") do + def build_array + Arrow::DayTimeIntervalArray.new([ + {day: 1, millisecond: 100}, + nil, + {day: 3, millisecond: 300}, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + [1, 100], + nil, + [3, 300], + ], + }, + ], + read) + end + end + + sub_test_case("MonthDayNanoInterval") do + def build_array + Arrow::MonthDayNanoIntervalArray.new([ + { + month: 1, + day: 1, + nanosecond: 100, + }, + nil, + { + month: 3, + day: 3, + nanosecond: 300, + }, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + [1, 1, 100], + nil, + [3, 3, 300], + ], + }, + ], + read) + end + end + + sub_test_case("Duration(:second)") do + def build_array + Arrow::DurationArray.new(:second, [0, nil, 100]) + end + + def test_read + assert_equal([{"value" => [0, nil, 100]}], + read) + end + + def test_type + assert_equal(:second, type.unit) + end + end + + sub_test_case("Duration(:millisecond)") do + def build_array + Arrow::DurationArray.new(:milli, [0, nil, 100_000]) + end + + def test_read + assert_equal([{"value" => [0, nil, 100_000]}], + read) + end + + def test_type + assert_equal(:millisecond, type.unit) + end + end + + sub_test_case("Duration(:microsecond)") do + def build_array + Arrow::DurationArray.new(:micro, [0, nil, 100_000_000]) + end + + def test_read + assert_equal([{"value" => [0, nil, 100_000_000]}], + read) + end + + def test_type + assert_equal(:microsecond, type.unit) + end + end + + sub_test_case("Duration(:nanosecond)") do + def build_array + Arrow::DurationArray.new(:nano, [0, nil, 100_000_000_000]) + end + + def test_read + assert_equal([{"value" => [0, nil, 100_000_000_000]}], + read) + end + + def test_type + assert_equal(:nanosecond, type.unit) + end + end + + sub_test_case("Binary") do + def build_array + Arrow::BinaryArray.new(["Hello".b, nil, "World".b]) + end + + def test_read + assert_equal([{"value" => ["Hello".b, nil, "World".b]}], + read) + end + end + + sub_test_case("LargeBinary") do + def build_array + Arrow::LargeBinaryArray.new(["Hello".b, nil, "World".b]) + end + + def test_read + assert_equal([{"value" => ["Hello".b, nil, "World".b]}], + read) + end + end + + sub_test_case("UTF8") do + def build_array + Arrow::StringArray.new(["Hello", nil, "World"]) + end + + def test_read + assert_equal([{"value" => ["Hello", nil, "World"]}], + read) + end + end + + sub_test_case("LargeUTF8") do + def build_array + Arrow::LargeStringArray.new(["Hello", nil, "World"]) + end + + def test_read + assert_equal([{"value" => ["Hello", nil, "World"]}], + read) + end + end + + sub_test_case("FixedSizeBinary") do + def build_array + data_type = Arrow::FixedSizeBinaryDataType.new(4) + Arrow::FixedSizeBinaryArray.new(data_type, + ["0124".b, 
nil, "abcd".b]) + end + + def test_read + assert_equal([{"value" => ["0124".b, nil, "abcd".b]}], + read) + end + end + + sub_test_case("List") do + def build_array + data_type = Arrow::ListDataType.new(name: "count", type: :int8) + Arrow::ListArray.new(data_type, [[-128, 127], nil, [-1, 0, 1]]) + end + + def test_read + assert_equal([{"value" => [[-128, 127], nil, [-1, 0, 1]]}], + read) + end + end + + sub_test_case("LargeList") do + def build_array + data_type = Arrow::LargeListDataType.new(name: "count", + type: :int8) + Arrow::LargeListArray.new(data_type, + [[-128, 127], nil, [-1, 0, 1]]) + end + + def test_read + assert_equal([ + { + "value" => [ + [-128, 127], + nil, + [-1, 0, 1], + ], + }, + ], + read) + end + end + + sub_test_case("Struct") do + def build_array + data_type = Arrow::StructDataType.new(count: :int8, + visible: :boolean) + Arrow::StructArray.new(data_type, + [[-128, nil], nil, [nil, true]]) + end + + def test_read + assert_equal([ + { + "value" => [ + [-128, nil], + nil, + [nil, true], + ], + }, + ], + read) + end + end + + sub_test_case("DenseUnion") do + def build_array + fields = [ + Arrow::Field.new("number", :int8), + Arrow::Field.new("text", :string), + ] + type_ids = [11, 13] + data_type = Arrow::DenseUnionDataType.new(fields, type_ids) + types = Arrow::Int8Array.new([11, 13, 11, 13, 13]) + value_offsets = Arrow::Int32Array.new([0, 0, 1, 1, 2]) + children = [ + Arrow::Int8Array.new([1, nil]), + Arrow::StringArray.new(["a", "b", "c"]) + ] + Arrow::DenseUnionArray.new(data_type, + types, + value_offsets, + children) + end + + def test_read + assert_equal([{"value" => [1, "a", nil, "b", "c"]}], + read) + end + end + + sub_test_case("SparseUnion") do + def build_array + fields = [ + Arrow::Field.new("number", :int8), + Arrow::Field.new("text", :string), + ] + type_ids = [11, 13] + data_type = Arrow::SparseUnionDataType.new(fields, type_ids) + types = Arrow::Int8Array.new([11, 13, 11, 13, 11]) + children = [ + Arrow::Int8Array.new([1, nil, nil, nil, 5]), + Arrow::StringArray.new([nil, "b", nil, "d", nil]) + ] + Arrow::SparseUnionArray.new(data_type, types, children) + end + + def test_read + assert_equal([{"value" => [1, "b", nil, "d", 5]}], + read) + end + end + + sub_test_case("Map") do + def build_array + data_type = Arrow::MapDataType.new(:string, :int8) + Arrow::MapArray.new(data_type, + [ + {"a" => -128, "b" => 127}, + nil, + {"c" => nil}, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + {"a" => -128, "b" => 127}, + nil, + {"c" => nil}, + ], + }, + ], + read) + end + end + end + end + end +end + +class TestFileReader < Test::Unit::TestCase + include ReaderTests + + def setup + Dir.mktmpdir do |tmp_dir| + table = Arrow::Table.new(value: build_array) + @path = File.join(tmp_dir, "data.arrow") + table.save(@path) + File.open(@path, "rb") do |input| + @reader = ArrowFormat::FileReader.new(input) + yield + @reader = nil + end + GC.start + end + end + + def read + @reader.to_a.collect do |record_batch| + record_batch.to_h.tap do |hash| + hash.each do |key, value| + hash[key] = value.to_a + end + end + end + end + + def type + @type ||= @reader.first.schema.fields[0].type + end +end + +class TestStreamingReader < Test::Unit::TestCase + include ReaderTests + + def setup + Dir.mktmpdir do |tmp_dir| + table = Arrow::Table.new(value: build_array) + @path = File.join(tmp_dir, "data.arrows") + table.save(@path) + File.open(@path, "rb") do |input| + @reader = ArrowFormat::StreamingReader.new(input) + yield + @reader = nil + end + GC.start + end + end + + 
def read + @reader.to_a.collect do |record_batch| + record_batch.to_h.tap do |hash| + hash.each do |key, value| + hash[key] = value.to_a + end + end + end + end + + def type + @type ||= @reader.first.schema.fields[0].type + end +end diff --git a/ruby/red-arrow/lib/arrow/array-builder.rb b/ruby/red-arrow/lib/arrow/array-builder.rb index 876fd71120b..2ccf50f3c1b 100644 --- a/ruby/red-arrow/lib/arrow/array-builder.rb +++ b/ruby/red-arrow/lib/arrow/array-builder.rb @@ -155,12 +155,14 @@ def detect_builder_info(value, builder_info) sub_builder_info = detect_builder_info(sub_value, sub_builder_info) break if sub_builder_info and sub_builder_info[:detected] end - if sub_builder_info and sub_builder_info[:detected] - sub_value_data_type = sub_builder_info[:builder].value_data_type + if sub_builder_info + sub_builder = sub_builder_info[:builder] + return builder_info unless sub_builder + sub_value_data_type = sub_builder.value_data_type field = Field.new("item", sub_value_data_type) { builder: ListArrayBuilder.new(ListDataType.new(field)), - detected: true, + detected: sub_builder_info[:detected], } else builder_info diff --git a/ruby/red-arrow/lib/arrow/duration-array-builder.rb b/ruby/red-arrow/lib/arrow/duration-array-builder.rb new file mode 100644 index 00000000000..5b1a1b283cd --- /dev/null +++ b/ruby/red-arrow/lib/arrow/duration-array-builder.rb @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module Arrow + class DurationArrayBuilder + class << self + def build(data_type, values) + builder = new(data_type) + builder.build(values) + end + end + end +end diff --git a/ruby/red-arrow/lib/arrow/duration-array.rb b/ruby/red-arrow/lib/arrow/duration-array.rb new file mode 100644 index 00000000000..06962bde8af --- /dev/null +++ b/ruby/red-arrow/lib/arrow/duration-array.rb @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
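+# Adds Arrow::DurationArray#unit, which returns the time unit of the
+# underlying duration data type (memoized on first access).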
+ +module Arrow + class DurationArray + def unit + @unit ||= value_data_type.unit + end + end +end diff --git a/ruby/red-arrow/lib/arrow/duration-data-type.rb b/ruby/red-arrow/lib/arrow/duration-data-type.rb new file mode 100644 index 00000000000..bb2e1f2f870 --- /dev/null +++ b/ruby/red-arrow/lib/arrow/duration-data-type.rb @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module Arrow + class DurationDataType + class << self + # @api private + def try_convert(value) + case value + when Symbol, Arrow::TimeUnit + new(value) + else + super + end + end + end + end +end diff --git a/ruby/red-arrow/lib/arrow/fixed-size-list-array-builder.rb b/ruby/red-arrow/lib/arrow/fixed-size-list-array-builder.rb new file mode 100644 index 00000000000..84c6f13dca0 --- /dev/null +++ b/ruby/red-arrow/lib/arrow/fixed-size-list-array-builder.rb @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
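+# Adds a .build(data_type, values) shortcut to
+# Arrow::FixedSizeListArrayBuilder and, via ListValuesAppendable, support
+# for building fixed size list arrays from nested Ruby arrays.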
+
+module Arrow
+  class FixedSizeListArrayBuilder
+    class << self
+      def build(data_type, values)
+        builder = new(data_type)
+        builder.build(values)
+      end
+    end
+
+    prepend ListValuesAppendable
+  end
+end
diff --git a/ruby/red-arrow/lib/arrow/libraries.rb b/ruby/red-arrow/lib/arrow/libraries.rb
index 007d541423c..3e1b4eb3f8a 100644
--- a/ruby/red-arrow/lib/arrow/libraries.rb
+++ b/ruby/red-arrow/lib/arrow/libraries.rb
@@ -59,6 +59,9 @@
 require_relative "dense-union-data-type"
 require_relative "dictionary-array"
 require_relative "dictionary-data-type"
+require_relative "duration-array"
+require_relative "duration-array-builder"
+require_relative "duration-data-type"
 require_relative "equal-options"
 require_relative "expression"
 require_relative "field"
@@ -66,6 +69,7 @@
 require_relative "file-system"
 require_relative "fixed-size-binary-array"
 require_relative "fixed-size-binary-array-builder"
+require_relative "fixed-size-list-array-builder"
 require_relative "function"
 require_relative "group"
 require_relative "half-float"
diff --git a/ruby/red-arrow/test/test-array-builder.rb b/ruby/red-arrow/test/test-array-builder.rb
index fb48aba8a42..7a2d42e54b3 100644
--- a/ruby/red-arrow/test/test-array-builder.rb
+++ b/ruby/red-arrow/test/test-array-builder.rb
@@ -146,6 +146,46 @@ def assert_build(builder_class, raw_array)
                      ["Apache Arrow"],
                    ])
     end
+
+    test("lists") do
+      values = [
+        [0, 1, 2],
+        [3, 4],
+      ]
+      array = Arrow::Array.new(values)
+      data_type = Arrow::ListDataType.new(Arrow::UInt8DataType.new)
+      assert_equal({
+                     data_type: data_type,
+                     values: [
+                       [0, 1, 2],
+                       [3, 4],
+                     ],
+                   },
+                   {
+                     data_type: array.value_data_type,
+                     values: array.to_a,
+                   })
+    end
+
+    test("lists with negative values") do
+      values = [
+        [0, -1, 2],
+        [3, 4],
+      ]
+      array = Arrow::Array.new(values)
+      data_type = Arrow::ListDataType.new(Arrow::Int8DataType.new)
+      assert_equal({
+                     data_type: data_type,
+                     values: [
+                       [0, -1, 2],
+                       [3, 4],
+                     ],
+                   },
+                   {
+                     data_type: array.value_data_type,
+                     values: array.to_a,
+                   })
+    end
   end
 
   sub_test_case("specific builder") do
diff --git a/ruby/red-arrow/test/test-duration-array.rb b/ruby/red-arrow/test/test-duration-array.rb
new file mode 100644
index 00000000000..86d0244c2be
--- /dev/null
+++ b/ruby/red-arrow/test/test-duration-array.rb
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+ +class DurationArrayTest < Test::Unit::TestCase + test("#[]") do + array = Arrow::DurationArray.new(:micro, [29]) + assert_equal(29, array[0]) + end +end diff --git a/ruby/red-arrow/test/test-duration-data-type.rb b/ruby/red-arrow/test/test-duration-data-type.rb new file mode 100644 index 00000000000..80d6ddca734 --- /dev/null +++ b/ruby/red-arrow/test/test-duration-data-type.rb @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class DurationDataTypeTest < Test::Unit::TestCase + sub_test_case(".new") do + test("Arrow::TimeUnit") do + assert_equal("duration[ms]", + Arrow::DurationDataType.new(Arrow::TimeUnit::MILLI).to_s) + end + + test("Symbol") do + assert_equal("duration[ms]", + Arrow::DurationDataType.new(:milli).to_s) + end + + test("unit: Arrow::TimeUnit") do + data_type = Arrow::DurationDataType.new(unit: Arrow::TimeUnit::MILLI) + assert_equal("duration[ms]", + data_type.to_s) + end + + test("unit: Symbol") do + data_type = Arrow::DurationDataType.new(unit: :milli) + assert_equal("duration[ms]", + data_type.to_s) + end + end +end diff --git a/ruby/red-arrow/test/test-fixed-size-list-array.rb b/ruby/red-arrow/test/test-fixed-size-list-array.rb new file mode 100644 index 00000000000..0436c56dcfd --- /dev/null +++ b/ruby/red-arrow/test/test-fixed-size-list-array.rb @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class FixedSizeListArrayTest < Test::Unit::TestCase + sub_test_case(".new") do + test("build") do + data_type = [:fixed_size_list, :int8, 2] + values = [ + [1, 2], + [3, 4], + nil, + ] + array = Arrow::FixedSizeListArray.new(data_type, values) + assert_equal(values, array.map { |v| v&.to_a }) + end + end +end
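
A minimal usage sketch of the Ruby APIs added above (illustrative only: the
require names and the output path are assumptions, not part of the patch):

    require "arrow"
    require "arrow-format"

    # Duration arrays expose the time unit of their data type.
    durations = Arrow::DurationArray.new(:second, [0, nil, 100])
    durations.unit                            # => the second time unit

    # DurationDataType accepts a Symbol or an Arrow::TimeUnit.
    Arrow::DurationDataType.new(:milli).to_s  # => "duration[ms]"

    # Fixed size list arrays can be built from nested Ruby arrays.
    Arrow::FixedSizeListArray.new([:fixed_size_list, :int8, 2],
                                  [[1, 2], [3, 4], nil])

    # Round-trip a one-column table through the new file reader.
    table = Arrow::Table.new(value: durations)
    table.save("data.arrow")
    File.open("data.arrow", "rb") do |input|
      ArrowFormat::FileReader.new(input).each do |record_batch|
        p record_batch.to_h
      end
    end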