From 0ff5c2d8b911aa21eb866043ed063ebec9ff6fb9 Mon Sep 17 00:00:00 2001 From: Jonah Kelman Date: Fri, 19 Dec 2025 13:37:42 -0500 Subject: [PATCH 01/17] Changed Starts/EndsWith to starts/ends_with. --- cpp/src/arrow/acero/tpch_node_test.cc | 4 +--- cpp/src/arrow/compute/expression.cc | 3 +-- .../compute/kernels/scalar_string_ascii.cc | 7 ++---- cpp/src/arrow/dataset/discovery.cc | 3 +-- cpp/src/arrow/dataset/subtree_test.cc | 4 +--- .../engine/substrait/relation_internal.cc | 1 - cpp/src/arrow/filesystem/azurefs.cc | 4 ++-- cpp/src/arrow/filesystem/azurefs_test.cc | 4 ++-- cpp/src/arrow/filesystem/path_util.cc | 5 ++--- cpp/src/arrow/filesystem/s3fs.cc | 3 ++- .../flight/transport/grpc/grpc_client.cc | 22 +++++++++---------- cpp/src/arrow/flight/types.cc | 2 +- cpp/src/arrow/json/chunker_test.cc | 5 ++--- cpp/src/arrow/util/reflection_test.cc | 2 +- cpp/src/arrow/util/string_test.cc | 4 ++-- cpp/src/parquet/arrow/fuzz_internal.cc | 2 +- .../parquet/geospatial/util_json_internal.cc | 4 ++-- 17 files changed, 33 insertions(+), 46 deletions(-) diff --git a/cpp/src/arrow/acero/tpch_node_test.cc b/cpp/src/arrow/acero/tpch_node_test.cc index f484d6c9d52..6321a8ca025 100644 --- a/cpp/src/arrow/acero/tpch_node_test.cc +++ b/cpp/src/arrow/acero/tpch_node_test.cc @@ -38,8 +38,6 @@ namespace arrow { -using arrow::internal::StartsWith; - namespace acero { namespace internal { @@ -100,7 +98,7 @@ void VerifyUniqueKey(std::unordered_set* seen, const Datum& d, int32_t void VerifyStringAndNumber_Single(std::string_view row, std::string_view prefix, const int64_t i, const int32_t* nums, bool verify_padding) { - ASSERT_TRUE(StartsWith(row, prefix)) << row << ", prefix=" << prefix << ", i=" << i; + ASSERT_TRUE(row.starts_with(prefix)) << row << ", prefix=" << prefix << ", i=" << i; const char* num_str = row.data() + prefix.size(); const char* num_str_end = row.data() + row.size(); int64_t num = 0; diff --git a/cpp/src/arrow/compute/expression.cc b/cpp/src/arrow/compute/expression.cc 
index 2563674a59c..93427f75fee 100644 --- a/cpp/src/arrow/compute/expression.cc +++ b/cpp/src/arrow/compute/expression.cc @@ -47,7 +47,6 @@ namespace arrow { using internal::checked_cast; using internal::checked_pointer_cast; -using internal::EndsWith; using internal::ToChars; namespace compute { @@ -180,7 +179,7 @@ std::string Expression::ToString() const { } constexpr std::string_view kleene = "_kleene"; - if (EndsWith(call->function_name, kleene)) { + if (call->function_name.ends_with(kleene)) { auto op = call->function_name.substr(0, call->function_name.size() - kleene.size()); return binary(std::move(op)); } diff --git a/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc b/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc index 06e6f4bb506..0146c0ab8f0 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc @@ -38,9 +38,6 @@ namespace arrow { -using internal::EndsWith; -using internal::StartsWith; - namespace compute { namespace internal { @@ -1291,7 +1288,7 @@ struct PlainStartsWithMatcher { } bool Match(std::string_view current) const { - return StartsWith(current, options_.pattern); + return current.starts_with(options_.pattern); } }; @@ -1309,7 +1306,7 @@ struct PlainEndsWithMatcher { } bool Match(std::string_view current) const { - return EndsWith(current, options_.pattern); + return current.ends_with(options_.pattern); } }; diff --git a/cpp/src/arrow/dataset/discovery.cc b/cpp/src/arrow/dataset/discovery.cc index 5686e50e3cb..f8f36d5eb73 100644 --- a/cpp/src/arrow/dataset/discovery.cc +++ b/cpp/src/arrow/dataset/discovery.cc @@ -35,7 +35,6 @@ namespace arrow { -using internal::StartsWith; namespace dataset { @@ -49,7 +48,7 @@ bool StartsWithAnyOf(const std::string& path, const std::vector& pr auto parts = fs::internal::SplitAbstractPath(path); return std::any_of(parts.cbegin(), parts.cend(), [&](std::string_view part) { return std::any_of(prefixes.cbegin(), prefixes.cend(), - 
[&](std::string_view prefix) { return StartsWith(part, prefix); }); + [&](std::string_view prefix) { return part.starts_with(prefix); }); }); } diff --git a/cpp/src/arrow/dataset/subtree_test.cc b/cpp/src/arrow/dataset/subtree_test.cc index fc13c20ecee..51c7d47acbc 100644 --- a/cpp/src/arrow/dataset/subtree_test.cc +++ b/cpp/src/arrow/dataset/subtree_test.cc @@ -31,8 +31,6 @@ namespace arrow { -using internal::StartsWith; - using compute::field_ref; using compute::literal; @@ -112,7 +110,7 @@ bool IsAncestorOf(std::string_view ancestor, std::string_view descendant) { ancestor = RemoveTrailingSlash(ancestor); if (ancestor == "") return true; descendant = RemoveTrailingSlash(descendant); - if (!StartsWith(descendant, ancestor)) return false; + if (!descendant.starts_with(ancestor)) return false; descendant.remove_prefix(ancestor.size()); if (descendant.empty()) return true; return descendant.front() == '/'; diff --git a/cpp/src/arrow/engine/substrait/relation_internal.cc b/cpp/src/arrow/engine/substrait/relation_internal.cc index 1ea143f9c58..b9e663ed7b1 100644 --- a/cpp/src/arrow/engine/substrait/relation_internal.cc +++ b/cpp/src/arrow/engine/substrait/relation_internal.cc @@ -65,7 +65,6 @@ namespace arrow { using internal::checked_cast; -using internal::StartsWith; using internal::ToChars; using util::UriFromAbsolutePath; diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 0ca18eed518..a3a162616ec 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -70,10 +70,10 @@ void AzureOptions::ExtractFromUriSchemeAndHierPart(const Uri& uri, std::string* out_path) { const auto host = uri.host(); std::string path; - if (arrow::internal::EndsWith(host, blob_storage_authority)) { + if (host.ends_with(blob_storage_authority)) { account_name = host.substr(0, host.size() - blob_storage_authority.size()); path = internal::RemoveLeadingSlash(uri.path()); - } else if (arrow::internal::EndsWith(host, 
dfs_storage_authority)) { + } else if (host.ends_with(dfs_storage_authority)) { account_name = host.substr(0, host.size() - dfs_storage_authority.size()); path = internal::ConcatAbstractPath(uri.username(), uri.path()); } else { diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 43d1c2afb77..a530095d1ca 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -2856,8 +2856,8 @@ std::shared_ptr NormalizerKeyValueMetadata( value = "2023-10-31T08:15:20Z"; } } else if (key == "ETag") { - if (arrow::internal::StartsWith(value, "\"") && - arrow::internal::EndsWith(value, "\"")) { + if (value.starts_with("\"") && + value.ends_with("\"")) { // Valid value value = "\"ETagValue\""; } diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index a48a34135a9..8bf9a5e5d16 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -29,7 +29,6 @@ namespace arrow { -using internal::StartsWith; namespace fs { namespace internal { @@ -236,7 +235,7 @@ bool IsAncestorOf(std::string_view ancestor, std::string_view descendant) { } descendant = RemoveTrailingSlash(descendant); - if (!StartsWith(descendant, ancestor)) { + if (!descendant.starts_with(ancestor)) { // an ancestor path is a prefix of descendant paths return false; } @@ -249,7 +248,7 @@ bool IsAncestorOf(std::string_view ancestor, std::string_view descendant) { } // "/hello/w" is not an ancestor of "/hello/world" - return StartsWith(descendant, std::string{kSep}); + return descendant.starts_with(std::string{kSep}); } std::optional RemoveAncestor(std::string_view ancestor, diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index c6b821f5deb..f75fd970a1e 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -1405,9 +1405,10 @@ bool IsDirectory(std::string_view key, const 
S3Model::HeadObjectResult& result) } // Otherwise, if its content type starts with "application/x-directory", // it's a directory - if (::arrow::internal::StartsWith(result.GetContentType(), kAwsDirectoryContentType)) { + if (result.GetContentType().starts_with(kAwsDirectoryContentType)) { return true; } + // Otherwise, it's a regular file. return false; } diff --git a/cpp/src/arrow/flight/transport/grpc/grpc_client.cc b/cpp/src/arrow/flight/transport/grpc/grpc_client.cc index 6cf3242b070..943408c1ffa 100644 --- a/cpp/src/arrow/flight/transport/grpc/grpc_client.cc +++ b/cpp/src/arrow/flight/transport/grpc/grpc_client.cc @@ -61,8 +61,6 @@ namespace arrow { -using internal::EndsWith; - namespace flight { namespace transport { namespace grpc { @@ -175,25 +173,25 @@ class GrpcClientInterceptorAdapterFactory FlightMethod flight_method = FlightMethod::Invalid; std::string_view method(info->method()); - if (EndsWith(method, "/Handshake")) { + if (method.ends_with("/Handshake")) { flight_method = FlightMethod::Handshake; - } else if (EndsWith(method, "/ListFlights")) { + } else if (method.ends_with("/ListFlights")) { flight_method = FlightMethod::ListFlights; - } else if (EndsWith(method, "/GetFlightInfo")) { + } else if (method.ends_with("/GetFlightInfo")) { flight_method = FlightMethod::GetFlightInfo; - } else if (EndsWith(method, "/PollFlightInfo")) { + } else if (method.ends_with("/PollFlightInfo")) { flight_method = FlightMethod::PollFlightInfo; - } else if (EndsWith(method, "/GetSchema")) { + } else if (method.ends_with("/GetSchema")) { flight_method = FlightMethod::GetSchema; - } else if (EndsWith(method, "/DoGet")) { + } else if (method.ends_with("/DoGet")) { flight_method = FlightMethod::DoGet; - } else if (EndsWith(method, "/DoPut")) { + } else if (method.ends_with("/DoPut")) { flight_method = FlightMethod::DoPut; - } else if (EndsWith(method, "/DoExchange")) { + } else if (method.ends_with("/DoExchange")) { flight_method = FlightMethod::DoExchange; - } else if 
(EndsWith(method, "/DoAction")) { + } else if (method.ends_with("/DoAction")) { flight_method = FlightMethod::DoAction; - } else if (EndsWith(method, "/ListActions")) { + } else if (method.ends_with("/ListActions")) { flight_method = FlightMethod::ListActions; } else { ARROW_LOG(WARNING) << "Unknown Flight method: " << info->method(); diff --git a/cpp/src/arrow/flight/types.cc b/cpp/src/arrow/flight/types.cc index 759b1410bda..8166513d4e3 100644 --- a/cpp/src/arrow/flight/types.cc +++ b/cpp/src/arrow/flight/types.cc @@ -1167,7 +1167,7 @@ std::string TransportStatusDetail::ToString() const { repr += "{\""; repr += key; repr += "\", "; - if (arrow::internal::EndsWith(key, "-bin")) { + if (key.ends_with("-bin")) { repr += arrow::util::base64_encode(value); } else { repr += "\""; diff --git a/cpp/src/arrow/json/chunker_test.cc b/cpp/src/arrow/json/chunker_test.cc index 1c26d52b140..bb32297dc04 100644 --- a/cpp/src/arrow/json/chunker_test.cc +++ b/cpp/src/arrow/json/chunker_test.cc @@ -34,7 +34,6 @@ namespace arrow { -using internal::StartsWith; namespace json { @@ -159,10 +158,10 @@ void AssertStraddledChunking(Chunker& chunker, const std::shared_ptr& bu AssertChunking(chunker, first_half, 1); std::shared_ptr first_whole, partial; ASSERT_OK(chunker.Process(first_half, &first_whole, &partial)); - ASSERT_TRUE(StartsWith(std::string_view(*first_half), std::string_view(*first_whole))); + ASSERT_TRUE(std::string_view(*first_half).starts_with(std::string_view(*first_whole))); std::shared_ptr completion, rest; ASSERT_OK(chunker.ProcessWithPartial(partial, second_half, &completion, &rest)); - ASSERT_TRUE(StartsWith(std::string_view(*second_half), std::string_view(*completion))); + ASSERT_TRUE(std::string_view(*second_half).starts_with(std::string_view(*completion))); std::shared_ptr straddling; ASSERT_OK_AND_ASSIGN(straddling, ConcatenateBuffers({partial, completion})); auto length = ConsumeWholeObject(&straddling); diff --git a/cpp/src/arrow/util/reflection_test.cc 
b/cpp/src/arrow/util/reflection_test.cc index d2d6379bece..2246c8fe7f3 100644 --- a/cpp/src/arrow/util/reflection_test.cc +++ b/cpp/src/arrow/util/reflection_test.cc @@ -83,7 +83,7 @@ struct FromStringImpl { void Fail() { obj_ = std::nullopt; } void Init(std::string_view class_name, std::string_view repr, size_t num_properties) { - if (!StartsWith(repr, class_name)) return Fail(); + if (!repr.starts_with(class_name)) return Fail(); repr = repr.substr(class_name.size()); if (repr.empty()) return Fail(); diff --git a/cpp/src/arrow/util/string_test.cc b/cpp/src/arrow/util/string_test.cc index f222b938d5a..225624c7d53 100644 --- a/cpp/src/arrow/util/string_test.cc +++ b/cpp/src/arrow/util/string_test.cc @@ -260,9 +260,9 @@ TEST(ToChars, FloatingPoint) { // to std::to_string which may make ad hoc formatting choices, so we cannot // really test much about the result. auto result = ToChars(0.0f); - ASSERT_TRUE(StartsWith(result, "0")) << result; + ASSERT_TRUE(result.starts_with("0")) << result; result = ToChars(0.25); - ASSERT_TRUE(StartsWith(result, "0.25")) << result; + ASSERT_TRUE(result.starts_with("0.25")) << result; } } diff --git a/cpp/src/parquet/arrow/fuzz_internal.cc b/cpp/src/parquet/arrow/fuzz_internal.cc index b2e295b435d..0c9d088ee8a 100644 --- a/cpp/src/parquet/arrow/fuzz_internal.cc +++ b/cpp/src/parquet/arrow/fuzz_internal.cc @@ -80,7 +80,7 @@ class FuzzDecryptionKeyRetriever : public DecryptionKeyRetriever { return it->second; } // Is it a key generated by MakeEncryptionKey? 
- if (::arrow::internal::StartsWith(key_id, kInlineKeyPrefix)) { + if (key_id.starts_with(kInlineKeyPrefix)) { return SecureString( ::arrow::util::base64_decode(key_id.substr(kInlineKeyPrefix.length()))); } diff --git a/cpp/src/parquet/geospatial/util_json_internal.cc b/cpp/src/parquet/geospatial/util_json_internal.cc index 0ca88f4c6c2..6278ab8873c 100644 --- a/cpp/src/parquet/geospatial/util_json_internal.cc +++ b/cpp/src/parquet/geospatial/util_json_internal.cc @@ -104,10 +104,10 @@ ::arrow::Result MakeGeoArrowCrsMetadata( // the format and pass on this information to GeoArrow. if (crs.empty()) { return R"("crs": "OGC:CRS84", "crs_type": "authority_code")"; - } else if (::arrow::internal::StartsWith(crs, kSridPrefix)) { + } else if (crs.starts_with(kSridPrefix)) { return R"("crs": ")" + std::string(crs.substr(kSridPrefix.size())) + R"(", "crs_type": "srid")"; - } else if (::arrow::internal::StartsWith(crs, kProjjsonPrefix)) { + } else if (crs.starts_with(kProjjsonPrefix)) { std::string_view metadata_field = crs.substr(kProjjsonPrefix.size()); if (metadata && metadata->Contains(metadata_field)) { ARROW_ASSIGN_OR_RAISE(std::string projjson_value, metadata->Get(metadata_field)); From e8ec20fa1a4de5932af457c5f275652c79217a5b Mon Sep 17 00:00:00 2001 From: Jonah Kelman Date: Sat, 20 Dec 2025 22:26:31 -0500 Subject: [PATCH 02/17] Passes unit and style checks --- cpp/src/arrow/dataset/discovery.cc | 1 - cpp/src/arrow/filesystem/azurefs_test.cc | 3 +-- cpp/src/arrow/filesystem/path_util.cc | 1 - cpp/src/arrow/json/chunker_test.cc | 1 - 4 files changed, 1 insertion(+), 5 deletions(-) diff --git a/cpp/src/arrow/dataset/discovery.cc b/cpp/src/arrow/dataset/discovery.cc index f8f36d5eb73..c6f6e41d1ed 100644 --- a/cpp/src/arrow/dataset/discovery.cc +++ b/cpp/src/arrow/dataset/discovery.cc @@ -35,7 +35,6 @@ namespace arrow { - namespace dataset { namespace { diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 
a530095d1ca..c3af6fb0797 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -2856,8 +2856,7 @@ std::shared_ptr NormalizerKeyValueMetadata( value = "2023-10-31T08:15:20Z"; } } else if (key == "ETag") { - if (value.starts_with("\"") && - value.ends_with("\"")) { + if (value.starts_with("\"") && value.ends_with("\"")) { // Valid value value = "\"ETagValue\""; } diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index 8bf9a5e5d16..dc82afd07e7 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -29,7 +29,6 @@ namespace arrow { - namespace fs { namespace internal { diff --git a/cpp/src/arrow/json/chunker_test.cc b/cpp/src/arrow/json/chunker_test.cc index bb32297dc04..0976e9ba22b 100644 --- a/cpp/src/arrow/json/chunker_test.cc +++ b/cpp/src/arrow/json/chunker_test.cc @@ -34,7 +34,6 @@ namespace arrow { - namespace json { // Use no nested objects and no string literals containing braces in this test. 
From 08ea769d311947b2356ed6705f506c8bae6e8447 Mon Sep 17 00:00:00 2001 From: Jonah Kelman Date: Sat, 20 Dec 2025 22:58:42 -0500 Subject: [PATCH 03/17] Removed unnecessary StartsWith, EndsWith utility functions --- .../engine/simple_extension_type_internal.h | 2 +- cpp/src/arrow/util/string.h | 10 ------- cpp/src/arrow/util/string_test.cc | 28 ------------------- cpp/src/parquet/arrow/schema.cc | 1 - 4 files changed, 1 insertion(+), 40 deletions(-) diff --git a/cpp/src/arrow/engine/simple_extension_type_internal.h b/cpp/src/arrow/engine/simple_extension_type_internal.h index 1867fe50457..5124b6d294c 100644 --- a/cpp/src/arrow/engine/simple_extension_type_internal.h +++ b/cpp/src/arrow/engine/simple_extension_type_internal.h @@ -111,7 +111,7 @@ class SimpleExtensionType : public ExtensionType { void Fail() { params_ = std::nullopt; } void Init(std::string_view class_name, std::string_view repr, size_t num_properties) { - if (!::arrow::internal::StartsWith(repr, class_name)) return Fail(); + if (!repr.starts_with(class_name)) return Fail(); repr = repr.substr(class_name.size()); if (repr.empty()) return Fail(); diff --git a/cpp/src/arrow/util/string.h b/cpp/src/arrow/util/string.h index d39b7a295e7..af8c948f48a 100644 --- a/cpp/src/arrow/util/string.h +++ b/cpp/src/arrow/util/string.h @@ -52,16 +52,6 @@ ARROW_EXPORT Status ParseHexValues(std::string_view hex_string, uint8_t* out); namespace internal { -/// Like std::string_view::starts_with in C++20 -inline bool StartsWith(std::string_view s, std::string_view prefix) { - return s.starts_with(prefix); -} - -/// Like std::string_view::ends_with in C++20 -inline bool EndsWith(std::string_view s, std::string_view suffix) { - return s.ends_with(suffix); -} - /// \brief Split a string with a delimiter ARROW_EXPORT std::vector SplitString(std::string_view v, char delim, diff --git a/cpp/src/arrow/util/string_test.cc b/cpp/src/arrow/util/string_test.cc index 225624c7d53..8988eb9996c 100644 --- 
a/cpp/src/arrow/util/string_test.cc +++ b/cpp/src/arrow/util/string_test.cc @@ -170,34 +170,6 @@ TEST(SplitString, LimitZero) { EXPECT_EQ(parts[2], "c"); } -TEST(StartsWith, Basics) { - std::string empty{}; - std::string abc{"abc"}; - std::string abcdef{"abcdef"}; - std::string def{"def"}; - ASSERT_TRUE(StartsWith(empty, empty)); - ASSERT_TRUE(StartsWith(abc, empty)); - ASSERT_TRUE(StartsWith(abc, abc)); - ASSERT_TRUE(StartsWith(abcdef, abc)); - ASSERT_FALSE(StartsWith(abc, abcdef)); - ASSERT_FALSE(StartsWith(def, abcdef)); - ASSERT_FALSE(StartsWith(abcdef, def)); -} - -TEST(EndsWith, Basics) { - std::string empty{}; - std::string abc{"abc"}; - std::string abcdef{"abcdef"}; - std::string def{"def"}; - ASSERT_TRUE(EndsWith(empty, empty)); - ASSERT_TRUE(EndsWith(abc, empty)); - ASSERT_TRUE(EndsWith(abc, abc)); - ASSERT_TRUE(EndsWith(abcdef, def)); - ASSERT_FALSE(EndsWith(abcdef, abc)); - ASSERT_FALSE(EndsWith(def, abcdef)); - ASSERT_FALSE(EndsWith(abcdef, abc)); -} - TEST(RegexMatch, Basics) { std::regex regex("a+(b*)(c+)d+"); std::string_view b, c; diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 293ae94b94d..266215a8104 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -49,7 +49,6 @@ using arrow::FieldVector; using arrow::KeyValueMetadata; using arrow::Status; using arrow::internal::checked_cast; -using arrow::internal::EndsWith; using arrow::internal::ToChars; using ArrowType = arrow::DataType; From b1cbecdcc0f4a8b02340397f03edcad1f2f731b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Fri, 19 Dec 2025 01:49:35 +0100 Subject: [PATCH 04/17] GH-48555: [C++] Use FetchContent for bundled opentelemetry (#48556) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change As a follow up of requiring a minimum CMake version >= 3.25 we discussed moving our dependencies from ExternalProject to FetchContent. 
This can simplify our third party dependency management. ### What changes are included in this PR? The general change is moving opentelemetry from `ExternalProject` to `FetchContent`. We can clean up previously required installation on nlohmann-json. There has been also a minor unrelated change fixing a warning raised on non-clang compiler when using ` -Wno-unknown-warning-option` ### Are these changes tested? Yes, the changes are tested locally and on CI. ### Are there any user-facing changes? No * GitHub Issue: #48555 Authored-by: Raúl Cumplido Signed-off-by: Sutou Kouhei --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 324 +++++--------------- 1 file changed, 77 insertions(+), 247 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index aa5c426aff3..8a26b46d0bf 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -1883,6 +1883,7 @@ function(build_protobuf) PARENT_SCOPE) fetchcontent_declare(protobuf + ${FC_DECLARE_COMMON_OPTIONS} OVERRIDE_FIND_PACKAGE URL ${PROTOBUF_SOURCE_URL} URL_HASH "SHA256=${ARROW_PROTOBUF_BUILD_SHA256_CHECKSUM}" SOURCE_SUBDIR cmake) @@ -3120,10 +3121,13 @@ function(build_grpc) # Add warning suppression flags for gRPC build. 
if(NOT MSVC) - string(APPEND CMAKE_C_FLAGS - " -Wno-attributes -Wno-format-security -Wno-unknown-warning-option") - string(APPEND CMAKE_CXX_FLAGS - " -Wno-attributes -Wno-format-security -Wno-unknown-warning-option") + string(APPEND CMAKE_C_FLAGS " -Wno-attributes -Wno-format-security") + string(APPEND CMAKE_CXX_FLAGS " -Wno-attributes -Wno-format-security") + endif() + if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL + "Clang") + string(APPEND CMAKE_C_FLAGS " -Wno-unknown-warning-option") + string(APPEND CMAKE_CXX_FLAGS " -Wno-unknown-warning-option") endif() fetchcontent_makeavailable(grpc) @@ -3323,11 +3327,6 @@ function(build_nlohmann_json) set(NLOHMANN_JSON_VENDORED TRUE PARENT_SCOPE) - set(NLOHMANN_JSON_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/nlohmann_json_fc-install") - set(NLOHMANN_JSON_PREFIX - "${NLOHMANN_JSON_PREFIX}" - PARENT_SCOPE) - fetchcontent_declare(nlohmann_json ${FC_DECLARE_COMMON_OPTIONS} OVERRIDE_FIND_PACKAGE URL ${NLOHMANN_JSON_SOURCE_URL} @@ -3341,49 +3340,13 @@ function(build_nlohmann_json) set(JSON_Install ON) fetchcontent_makeavailable(nlohmann_json) - # opentelemetry requires nlohmann_json to be installed to a known location. - # We have to do this in two steps to avoid double installation of nlohmann_json - # when Arrow is installed. - # This custom target ensures nlohmann_json is built before we install. - add_custom_target(nlohmann_json_built DEPENDS nlohmann_json::nlohmann_json) - - # Disable nlohmann_json's install script after it's built to prevent double installation. 
- add_custom_command(OUTPUT "${nlohmann_json_BINARY_DIR}/cmake_install.cmake.saved" - COMMAND ${CMAKE_COMMAND} -E copy_if_different - "${nlohmann_json_BINARY_DIR}/cmake_install.cmake" - "${nlohmann_json_BINARY_DIR}/cmake_install.cmake.saved" - COMMAND ${CMAKE_COMMAND} -E echo - "# nlohmann-json install disabled to prevent double installation with Arrow" - > "${nlohmann_json_BINARY_DIR}/cmake_install.cmake" - DEPENDS nlohmann_json_built - COMMENT "Disabling nlohmann-json install to prevent double installation" - VERBATIM) - - add_custom_target(nlohmann_json_install_disabled ALL - DEPENDS "${nlohmann_json_BINARY_DIR}/cmake_install.cmake.saved") - - # Install nlohmann_json to NLOHMANN_JSON_PREFIX for opentelemetry to find. - add_custom_command(OUTPUT "${NLOHMANN_JSON_PREFIX}/.nlohmann_json_installed" - COMMAND ${CMAKE_COMMAND} -E copy_if_different - "${nlohmann_json_BINARY_DIR}/cmake_install.cmake.saved" - "${nlohmann_json_BINARY_DIR}/cmake_install.cmake.tmp" - COMMAND ${CMAKE_COMMAND} - -DCMAKE_INSTALL_PREFIX=${NLOHMANN_JSON_PREFIX} - -DCMAKE_INSTALL_CONFIG_NAME=$ -P - "${nlohmann_json_BINARY_DIR}/cmake_install.cmake.tmp" || - ${CMAKE_COMMAND} -E true - COMMAND ${CMAKE_COMMAND} -E touch - "${NLOHMANN_JSON_PREFIX}/.nlohmann_json_installed" - DEPENDS nlohmann_json_install_disabled - COMMENT "Installing nlohmann-json to ${NLOHMANN_JSON_PREFIX} for google-cloud-cpp" - VERBATIM) - - # Make nlohmann_json_fc depend on the install completion marker. 
- add_custom_target(nlohmann_json_fc - DEPENDS "${NLOHMANN_JSON_PREFIX}/.nlohmann_json_installed") + if(CMAKE_VERSION VERSION_LESS 3.28) + set_property(DIRECTORY ${nlohmann_json_SOURCE_DIR} PROPERTY EXCLUDE_FROM_ALL TRUE) + endif() list(POP_BACK CMAKE_MESSAGE_INDENT) endfunction() + if(ARROW_WITH_NLOHMANN_JSON) resolve_dependency(nlohmann_json) get_target_property(nlohmann_json_INCLUDE_DIR nlohmann_json::nlohmann_json @@ -3726,220 +3689,87 @@ endif() # ---------------------------------------------------------------------- # OpenTelemetry C++ -macro(build_opentelemetry) - message(STATUS "Building OpenTelemetry from source") +function(build_opentelemetry) + list(APPEND CMAKE_MESSAGE_INDENT "OpenTelemetry: ") + message(STATUS "Building OpenTelemetry from source using FetchContent") + if(Protobuf_VERSION VERSION_GREATER_EQUAL 3.22) message(FATAL_ERROR "GH-36013: Can't use bundled OpenTelemetry with Protobuf 3.22 or later. " "Protobuf is version ${Protobuf_VERSION}") endif() - set(OPENTELEMETRY_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/opentelemetry_ep-install") - set(OPENTELEMETRY_INCLUDE_DIR "${OPENTELEMETRY_PREFIX}/include") - set(OPENTELEMETRY_STATIC_LIB - "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry${CMAKE_STATIC_LIBRARY_SUFFIX}" + set(OPENTELEMETRY_VENDORED + TRUE + PARENT_SCOPE) + + fetchcontent_declare(opentelemetry_proto + ${FC_DECLARE_COMMON_OPTIONS} + URL ${OPENTELEMETRY_PROTO_SOURCE_URL} + URL_HASH "SHA256=${ARROW_OPENTELEMETRY_PROTO_BUILD_SHA256_CHECKSUM}" ) - set(_OPENTELEMETRY_APIS api ext sdk) - set(_OPENTELEMETRY_LIBS - common - http_client_curl - logs - ostream_log_record_exporter - ostream_span_exporter - otlp_http_client - otlp_http_log_record_exporter - otlp_http_exporter - otlp_recordable - proto - resources - trace - version) - set(OPENTELEMETRY_BUILD_BYPRODUCTS) - set(OPENTELEMETRY_LIBRARIES) - - foreach(_OPENTELEMETRY_LIB ${_OPENTELEMETRY_APIS}) - add_library(opentelemetry-cpp::${_OPENTELEMETRY_LIB} INTERFACE IMPORTED) - 
target_include_directories(opentelemetry-cpp::${_OPENTELEMETRY_LIB} BEFORE - INTERFACE "${OPENTELEMETRY_INCLUDE_DIR}") - endforeach() - foreach(_OPENTELEMETRY_LIB ${_OPENTELEMETRY_LIBS}) - # N.B. OTel targets and libraries don't follow any consistent naming scheme - if(_OPENTELEMETRY_LIB STREQUAL "http_client_curl") - set(_OPENTELEMETRY_STATIC_LIBRARY - "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_${_OPENTELEMETRY_LIB}${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - elseif(_OPENTELEMETRY_LIB STREQUAL "ostream_span_exporter") - set(_OPENTELEMETRY_STATIC_LIBRARY - "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_exporter_ostream_span${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - elseif(_OPENTELEMETRY_LIB STREQUAL "otlp_http_client") - set(_OPENTELEMETRY_STATIC_LIBRARY - "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_exporter_otlp_http_client${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - elseif(_OPENTELEMETRY_LIB STREQUAL "otlp_http_exporter") - set(_OPENTELEMETRY_STATIC_LIBRARY - "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_exporter_otlp_http${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - elseif(_OPENTELEMETRY_LIB STREQUAL "otlp_http_log_record_exporter") - set(_OPENTELEMETRY_STATIC_LIBRARY - "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_exporter_otlp_http_log${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - elseif(_OPENTELEMETRY_LIB STREQUAL "ostream_log_record_exporter") - set(_OPENTELEMETRY_STATIC_LIBRARY - "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_exporter_ostream_logs${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - else() - set(_OPENTELEMETRY_STATIC_LIBRARY - "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_${_OPENTELEMETRY_LIB}${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - endif() - add_library(opentelemetry-cpp::${_OPENTELEMETRY_LIB} STATIC IMPORTED) - set_target_properties(opentelemetry-cpp::${_OPENTELEMETRY_LIB} - PROPERTIES IMPORTED_LOCATION 
${_OPENTELEMETRY_STATIC_LIBRARY}) - list(APPEND OPENTELEMETRY_BUILD_BYPRODUCTS ${_OPENTELEMETRY_STATIC_LIBRARY}) - list(APPEND OPENTELEMETRY_LIBRARIES opentelemetry-cpp::${_OPENTELEMETRY_LIB}) - endforeach() - set(OPENTELEMETRY_CMAKE_ARGS - ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${OPENTELEMETRY_PREFIX}" - -DWITH_EXAMPLES=OFF) + # Use FetchContent_Populate instead of MakeAvailable because opentelemetry-proto + # has no CMakeLists.txt. + cmake_policy(PUSH) + if(POLICY CMP0169) + cmake_policy(SET CMP0169 OLD) + endif() + fetchcontent_populate(opentelemetry_proto) + cmake_policy(POP) - set(OPENTELEMETRY_PREFIX_PATH_LIST) - # Don't specify the DEPENDS unless we actually have dependencies, else - # Ninja/other build systems may consider this target to always be dirty - set(_OPENTELEMETRY_DEPENDENCIES) - add_custom_target(opentelemetry_dependencies) + fetchcontent_declare(opentelemetry_cpp + ${FC_DECLARE_COMMON_OPTIONS} + URL ${OPENTELEMETRY_SOURCE_URL} + URL_HASH "SHA256=${ARROW_OPENTELEMETRY_BUILD_SHA256_CHECKSUM}") - set(_OPENTELEMETRY_DEPENDENCIES "opentelemetry_dependencies") - list(APPEND ARROW_BUNDLED_STATIC_LIBS ${OPENTELEMETRY_LIBRARIES}) - list(APPEND OPENTELEMETRY_PREFIX_PATH_LIST ${NLOHMANN_JSON_PREFIX}) + prepare_fetchcontent() - get_target_property(OPENTELEMETRY_PROTOBUF_INCLUDE_DIR ${ARROW_PROTOBUF_LIBPROTOBUF} - INTERFACE_INCLUDE_DIRECTORIES) - set(OPENTELEMETRY_PROTOBUF_LIBRARY "$") - set(OPENTELEMETRY_PROTOC_EXECUTABLE "$") - list(APPEND - OPENTELEMETRY_CMAKE_ARGS - -DWITH_OTLP_HTTP=ON - -DWITH_OTLP_GRPC=OFF - # Disabled because it seemed to cause linking errors. May be worth a closer look. - -DWITH_FUNC_TESTS=OFF - # These options are slated for removal in v1.14 and their features are deemed stable - # as of v1.13. However, setting their corresponding ENABLE_* macros in headers seems - # finicky - resulting in build failures or ABI-related runtime errors during HTTP - # client initialization. 
There may still be a solution, but we disable them for now. - -DWITH_OTLP_HTTP_SSL_PREVIEW=OFF - -DWITH_OTLP_HTTP_SSL_TLS_PREVIEW=OFF - "-DProtobuf_INCLUDE_DIR=${OPENTELEMETRY_PROTOBUF_INCLUDE_DIR}" - "-DProtobuf_LIBRARY=${OPENTELEMETRY_PROTOBUF_LIBRARY}" - "-DProtobuf_PROTOC_EXECUTABLE=${OPENTELEMETRY_PROTOC_EXECUTABLE}") - - # OpenTelemetry with OTLP enabled requires Protobuf definitions from a - # submodule. This submodule path is hardcoded into their CMake definitions, - # and submodules are not included in their releases. Add a custom build step - # to download and extract the Protobufs. - - # Adding such a step is rather complicated, so instead: create a separate - # ExternalProject that just fetches the Protobufs, then add a custom step - # to the main build to copy the Protobufs. - externalproject_add(opentelemetry_proto_ep - ${EP_COMMON_OPTIONS} - URL_HASH "SHA256=${ARROW_OPENTELEMETRY_PROTO_BUILD_SHA256_CHECKSUM}" - URL ${OPENTELEMETRY_PROTO_SOURCE_URL} - BUILD_COMMAND "" - CONFIGURE_COMMAND "" - INSTALL_COMMAND "" - EXCLUDE_FROM_ALL OFF) - if(NLOHMANN_JSON_VENDORED) - add_dependencies(opentelemetry_dependencies nlohmann_json_fc) - else() - add_dependencies(opentelemetry_dependencies nlohmann_json::nlohmann_json) - endif() + set(OTELCPP_PROTO_PATH "${opentelemetry_proto_SOURCE_DIR}") + set(WITH_EXAMPLES OFF) + set(WITH_OTLP_HTTP ON) + set(WITH_OTLP_GRPC OFF) + set(WITH_FUNC_TESTS OFF) + # These options are slated for removal in v1.14 and their features are deemed stable + # as of v1.13. However, setting their corresponding ENABLE_* macros in headers seems + # finicky - resulting in build failures or ABI-related runtime errors during HTTP + # client initialization. There may still be a solution, but we disable them for now. 
+ set(WITH_OTLP_HTTP_SSL_PREVIEW OFF) + set(WITH_OTLP_HTTP_SSL_TLS_PREVIEW OFF) - add_dependencies(opentelemetry_dependencies opentelemetry_proto_ep - ${ARROW_PROTOBUF_LIBPROTOBUF}) + fetchcontent_makeavailable(opentelemetry_cpp) - # Ensure vendored protobuf is installed before OpenTelemetry builds - if(PROTOBUF_VENDORED) - add_dependencies(opentelemetry_dependencies protobuf_fc) + if(CMAKE_VERSION VERSION_LESS 3.28) + set_property(DIRECTORY ${opentelemetry_cpp_SOURCE_DIR} PROPERTY EXCLUDE_FROM_ALL TRUE) endif() - string(JOIN "${EP_LIST_SEPARATOR}" OPENTELEMETRY_PREFIX_PATH - ${OPENTELEMETRY_PREFIX_PATH_LIST}) - list(APPEND OPENTELEMETRY_CMAKE_ARGS "-DCMAKE_PREFIX_PATH=${OPENTELEMETRY_PREFIX_PATH}") + # Remove unused directories to save build directory storage + file(REMOVE_RECURSE "${opentelemetry_cpp_SOURCE_DIR}/ci") - if(CMAKE_SYSTEM_PROCESSOR STREQUAL "s390x") - # OpenTelemetry tries to determine the processor arch for vcpkg, which fails - # on s390x, even though it doesn't use vcpkg there. 
Tell it ARCH manually - externalproject_add(opentelemetry_ep - ${EP_COMMON_OPTIONS} - URL_HASH "SHA256=${ARROW_OPENTELEMETRY_BUILD_SHA256_CHECKSUM}" - CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ARCH=s390x - ${CMAKE_COMMAND} -G ${CMAKE_GENERATOR} - "" - ${OPENTELEMETRY_CMAKE_ARGS} - BUILD_COMMAND ${CMAKE_COMMAND} --build "" --target all - INSTALL_COMMAND ${CMAKE_COMMAND} --build "" --target - install - URL ${OPENTELEMETRY_SOURCE_URL} - BUILD_BYPRODUCTS ${OPENTELEMETRY_BUILD_BYPRODUCTS} - EXCLUDE_FROM_ALL NOT - ${ARROW_WITH_OPENTELEMETRY} - DEPENDS ${_OPENTELEMETRY_DEPENDENCIES}) - else() - externalproject_add(opentelemetry_ep - ${EP_COMMON_OPTIONS} - URL_HASH "SHA256=${ARROW_OPENTELEMETRY_BUILD_SHA256_CHECKSUM}" - CMAKE_ARGS ${OPENTELEMETRY_CMAKE_ARGS} - URL ${OPENTELEMETRY_SOURCE_URL} - BUILD_BYPRODUCTS ${OPENTELEMETRY_BUILD_BYPRODUCTS} - EXCLUDE_FROM_ALL NOT - ${ARROW_WITH_OPENTELEMETRY} - DEPENDS ${_OPENTELEMETRY_DEPENDENCIES}) - endif() - - externalproject_add_step(opentelemetry_ep download_proto - COMMAND ${CMAKE_COMMAND} -E copy_directory - $/opentelemetry - $/third_party/opentelemetry-proto/opentelemetry - DEPENDEES download - DEPENDERS configure) - - set(OPENTELEMETRY_VENDORED 1) - - target_link_libraries(opentelemetry-cpp::common - INTERFACE opentelemetry-cpp::api opentelemetry-cpp::sdk - Threads::Threads) - target_link_libraries(opentelemetry-cpp::resources INTERFACE opentelemetry-cpp::common) - target_link_libraries(opentelemetry-cpp::trace INTERFACE opentelemetry-cpp::common - opentelemetry-cpp::resources) - target_link_libraries(opentelemetry-cpp::logs INTERFACE opentelemetry-cpp::common - opentelemetry-cpp::resources) - target_link_libraries(opentelemetry-cpp::http_client_curl - INTERFACE opentelemetry-cpp::common opentelemetry-cpp::ext - CURL::libcurl) - target_link_libraries(opentelemetry-cpp::proto INTERFACE ${ARROW_PROTOBUF_LIBPROTOBUF}) - target_link_libraries(opentelemetry-cpp::otlp_recordable - INTERFACE opentelemetry-cpp::logs 
opentelemetry-cpp::trace - opentelemetry-cpp::resources opentelemetry-cpp::proto) - target_link_libraries(opentelemetry-cpp::otlp_http_client - INTERFACE opentelemetry-cpp::common opentelemetry-cpp::proto - opentelemetry-cpp::http_client_curl - nlohmann_json::nlohmann_json) - target_link_libraries(opentelemetry-cpp::otlp_http_exporter - INTERFACE opentelemetry-cpp::otlp_recordable - opentelemetry-cpp::otlp_http_client) - target_link_libraries(opentelemetry-cpp::otlp_http_log_record_exporter - INTERFACE opentelemetry-cpp::otlp_recordable - opentelemetry-cpp::otlp_http_client) - - foreach(_OPENTELEMETRY_LIB ${_OPENTELEMETRY_LIBS}) - add_dependencies(opentelemetry-cpp::${_OPENTELEMETRY_LIB} opentelemetry_ep) - list(APPEND ARROW_BUNDLED_STATIC_LIBS opentelemetry-cpp::${_OPENTELEMETRY_LIB}) - endforeach() + # OpenTelemetry creates its own targets. We need to add them to bundled static libs. + # The targets created by OpenTelemetry's CMakeLists.txt use the opentelemetry:: namespace. + # List of libraries that we actually need and want to bundle. 
+ set(_OPENTELEMETRY_BUNDLED_LIBS + opentelemetry-cpp::common + opentelemetry-cpp::http_client_curl + opentelemetry-cpp::logs + opentelemetry-cpp::ostream_log_record_exporter + opentelemetry-cpp::ostream_span_exporter + opentelemetry-cpp::otlp_http_client + opentelemetry-cpp::otlp_http_log_record_exporter + opentelemetry-cpp::otlp_http_exporter + opentelemetry-cpp::otlp_recordable + opentelemetry-cpp::proto + opentelemetry-cpp::resources + opentelemetry-cpp::trace + opentelemetry-cpp::version) - # Work around https://gitlab.kitware.com/cmake/cmake/issues/15052 - file(MAKE_DIRECTORY ${OPENTELEMETRY_INCLUDE_DIR}) -endmacro() + list(APPEND ARROW_BUNDLED_STATIC_LIBS ${_OPENTELEMETRY_BUNDLED_LIBS}) + set(ARROW_BUNDLED_STATIC_LIBS + "${ARROW_BUNDLED_STATIC_LIBS}" + PARENT_SCOPE) + + list(POP_BACK CMAKE_MESSAGE_INDENT) +endfunction() if(ARROW_WITH_OPENTELEMETRY) if(NOT ARROW_ENABLE_THREADING) From e4a3fb5f8609d60dcab835a5f6363beb95ee5ea2 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 19 Dec 2025 11:16:42 +0900 Subject: [PATCH 05/17] GH-48579: [Ruby] Add support for reading duration array (#48580) ### Rationale for this change It's an array for elapsed time. ### What changes are included in this PR? * Add `ArrowFormat::DurationType` * Add `ArrowFormat::DurationArray` * Improve `Arrow::DurationDataType` API * Improve `Arrow::DurationArray` API * Improve `Arrow::DurationArrayBuilder` API ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. 
* GitHub Issue: #48579 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .../lib/arrow-format/array.rb | 6 ++ .../lib/arrow-format/file-reader.rb | 4 ++ .../red-arrow-format/lib/arrow-format/type.rb | 12 ++++ .../red-arrow-format/test/test-file-reader.rb | 60 +++++++++++++++++++ .../lib/arrow/duration-array-builder.rb | 27 +++++++++ ruby/red-arrow/lib/arrow/duration-array.rb | 24 ++++++++ .../red-arrow/lib/arrow/duration-data-type.rb | 32 ++++++++++ ruby/red-arrow/lib/arrow/libraries.rb | 3 + ruby/red-arrow/test/test-duration-array.rb | 23 +++++++ .../red-arrow/test/test-duration-data-type.rb | 42 +++++++++++++ 10 files changed, 233 insertions(+) create mode 100644 ruby/red-arrow/lib/arrow/duration-array-builder.rb create mode 100644 ruby/red-arrow/lib/arrow/duration-array.rb create mode 100644 ruby/red-arrow/lib/arrow/duration-data-type.rb create mode 100644 ruby/red-arrow/test/test-duration-array.rb create mode 100644 ruby/red-arrow/test/test-duration-data-type.rb diff --git a/ruby/red-arrow-format/lib/arrow-format/array.rb b/ruby/red-arrow-format/lib/arrow-format/array.rb index 4f1f6f5f92a..72cce5cbbea 100644 --- a/ruby/red-arrow-format/lib/arrow-format/array.rb +++ b/ruby/red-arrow-format/lib/arrow-format/array.rb @@ -189,6 +189,12 @@ def to_a end end + class DurationArray < TemporalArray + def to_a + apply_validity(@values_buffer.values(:s64, 0, @size)) + end + end + class VariableSizeBinaryLayoutArray < Array def initialize(type, size, validity_buffer, offsets_buffer, values_buffer) super(type, size, validity_buffer) diff --git a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb index 5a769743c6d..71f6e9cedae 100644 --- a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb +++ b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb @@ -26,6 +26,7 @@ require_relative "org/apache/arrow/flatbuf/bool" require_relative "org/apache/arrow/flatbuf/date" require_relative 
"org/apache/arrow/flatbuf/date_unit" +require_relative "org/apache/arrow/flatbuf/duration" require_relative "org/apache/arrow/flatbuf/fixed_size_binary" require_relative "org/apache/arrow/flatbuf/floating_point" require_relative "org/apache/arrow/flatbuf/footer" @@ -214,6 +215,9 @@ def read_field(fb_field) when Org::Apache::Arrow::Flatbuf::Timestamp unit = fb_type.unit.name.downcase.to_sym type = TimestampType.new(unit, fb_type.timezone) + when Org::Apache::Arrow::Flatbuf::Duration + unit = fb_type.unit.name.downcase.to_sym + type = DurationType.new(unit) when Org::Apache::Arrow::Flatbuf::List type = ListType.new(read_field(fb_field.children[0])) when Org::Apache::Arrow::Flatbuf::LargeList diff --git a/ruby/red-arrow-format/lib/arrow-format/type.rb b/ruby/red-arrow-format/lib/arrow-format/type.rb index 7627bf53ff3..abcefca3bf5 100644 --- a/ruby/red-arrow-format/lib/arrow-format/type.rb +++ b/ruby/red-arrow-format/lib/arrow-format/type.rb @@ -315,6 +315,18 @@ def build_array(size, validity_buffer, values_buffer) end end + class DurationType < TemporalType + attr_reader :unit + def initialize(unit) + super("Duration") + @unit = unit + end + + def build_array(size, validity_buffer, values_buffer) + DurationArray.new(self, size, validity_buffer, values_buffer) + end + end + class VariableSizeBinaryType < Type end diff --git a/ruby/red-arrow-format/test/test-file-reader.rb b/ruby/red-arrow-format/test/test-file-reader.rb index 309720b989f..12ed95846c3 100644 --- a/ruby/red-arrow-format/test/test-file-reader.rb +++ b/ruby/red-arrow-format/test/test-file-reader.rb @@ -489,6 +489,66 @@ def test_type end end + sub_test_case("Duration(:second)") do + def build_array + Arrow::DurationArray.new(:second, [0, nil, 100]) + end + + def test_read + assert_equal([{"value" => [0, nil, 100]}], + read) + end + + def test_type + assert_equal(:second, type.unit) + end + end + + sub_test_case("Duration(:millisecond)") do + def build_array + Arrow::DurationArray.new(:milli, [0, nil, 
100_000]) + end + + def test_read + assert_equal([{"value" => [0, nil, 100_000]}], + read) + end + + def test_type + assert_equal(:millisecond, type.unit) + end + end + + sub_test_case("Duration(:microsecond)") do + def build_array + Arrow::DurationArray.new(:micro, [0, nil, 100_000_000]) + end + + def test_read + assert_equal([{"value" => [0, nil, 100_000_000]}], + read) + end + + def test_type + assert_equal(:microsecond, type.unit) + end + end + + sub_test_case("Duration(:nanosecond)") do + def build_array + Arrow::DurationArray.new(:nano, [0, nil, 100_000_000_000]) + end + + def test_read + assert_equal([{"value" => [0, nil, 100_000_000_000]}], + read) + end + + def test_type + assert_equal(:nanosecond, type.unit) + end + end + sub_test_case("Binary") do def build_array Arrow::BinaryArray.new(["Hello".b, nil, "World".b]) diff --git a/ruby/red-arrow/lib/arrow/duration-array-builder.rb b/ruby/red-arrow/lib/arrow/duration-array-builder.rb new file mode 100644 index 00000000000..5b1a1b283cd --- /dev/null +++ b/ruby/red-arrow/lib/arrow/duration-array-builder.rb @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +module Arrow + class DurationArrayBuilder + class << self + def build(data_type, values) + builder = new(data_type) + builder.build(values) + end + end + end +end diff --git a/ruby/red-arrow/lib/arrow/duration-array.rb b/ruby/red-arrow/lib/arrow/duration-array.rb new file mode 100644 index 00000000000..06962bde8af --- /dev/null +++ b/ruby/red-arrow/lib/arrow/duration-array.rb @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module Arrow + class DurationArray + def unit + @unit ||= value_data_type.unit + end + end +end diff --git a/ruby/red-arrow/lib/arrow/duration-data-type.rb b/ruby/red-arrow/lib/arrow/duration-data-type.rb new file mode 100644 index 00000000000..bb2e1f2f870 --- /dev/null +++ b/ruby/red-arrow/lib/arrow/duration-data-type.rb @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module Arrow + class DurationDataType + class << self + # @api private + def try_convert(value) + case value + when Symbol, Arrow::TimeUnit + new(value) + else + super + end + end + end + end +end diff --git a/ruby/red-arrow/lib/arrow/libraries.rb b/ruby/red-arrow/lib/arrow/libraries.rb index 007d541423c..8427cd83ca9 100644 --- a/ruby/red-arrow/lib/arrow/libraries.rb +++ b/ruby/red-arrow/lib/arrow/libraries.rb @@ -59,6 +59,9 @@ require_relative "dense-union-data-type" require_relative "dictionary-array" require_relative "dictionary-data-type" +require_relative "duration-array" +require_relative "duration-array-builder" +require_relative "duration-data-type" require_relative "equal-options" require_relative "expression" require_relative "field" diff --git a/ruby/red-arrow/test/test-duration-array.rb b/ruby/red-arrow/test/test-duration-array.rb new file mode 100644 index 00000000000..86d0244c2be --- /dev/null +++ b/ruby/red-arrow/test/test-duration-array.rb @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class DurationArrayTest < Test::Unit::TestCase + test("#[]") do + array = Arrow::DurationArray.new(:micro, [29]) + assert_equal(29, array[0]) + end +end diff --git a/ruby/red-arrow/test/test-duration-data-type.rb b/ruby/red-arrow/test/test-duration-data-type.rb new file mode 100644 index 00000000000..80d6ddca734 --- /dev/null +++ b/ruby/red-arrow/test/test-duration-data-type.rb @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class DurationDataTypeTest < Test::Unit::TestCase + sub_test_case(".new") do + test("Arrow::TimeUnit") do + assert_equal("duration[ms]", + Arrow::DurationDataType.new(Arrow::TimeUnit::MILLI).to_s) + end + + test("Symbol") do + assert_equal("duration[ms]", + Arrow::DurationDataType.new(:milli).to_s) + end + + test("unit: Arrow::TimeUnit") do + data_type = Arrow::DurationDataType.new(unit: Arrow::TimeUnit::MILLI) + assert_equal("duration[ms]", + data_type.to_s) + end + + test("unit: Symbol") do + data_type = Arrow::DurationDataType.new(unit: :milli) + assert_equal("duration[ms]", + data_type.to_s) + end + end +end From 6b5dc9e5d70707aee974e6802a0ca3cf833206f7 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Fri, 19 Dec 2025 11:43:41 +0100 Subject: [PATCH 06/17] GH-47797: [CI][Python] Update Python installs for free-threaded wheel tasks (#47993) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Musllinux and Win wheel tasks are building Python from source. That can be changed. ### What changes are included in this PR? - usage of one of the binaries provided by [astral-sh](https://github.com/astral-sh/python-build-standalone/releases) in the free-threaded musllinux wheel tasks - usage of Python install manager for the free-threaded and regular Win wheels build (test stays as is). ### Are these changes tested? Yes, with extended wheel builds. ### Are there any user-facing changes? No. 
* GitHub Issue: #47797 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Raúl Cumplido Signed-off-by: Raúl Cumplido --- ...ed-wheel-musllinux-test-imports.dockerfile | 38 ++++++------- ...-wheel-musllinux-test-unittests.dockerfile | 36 ++++++------- ...e-threaded-wheel-windows-vs2022.dockerfile | 53 ------------------- ...ython-wheel-windows-test-vs2022.dockerfile | 2 +- ...ython-wheel-windows-vs2022-base.dockerfile | 18 +++++-- .../python-wheel-windows-vs2022.dockerfile | 15 +++--- compose.yaml | 8 ++- 7 files changed, 62 insertions(+), 108 deletions(-) delete mode 100644 ci/docker/python-free-threaded-wheel-windows-vs2022.dockerfile diff --git a/ci/docker/python-free-threaded-wheel-musllinux-test-imports.dockerfile b/ci/docker/python-free-threaded-wheel-musllinux-test-imports.dockerfile index 3168d54f2ed..e2e4eb8f991 100644 --- a/ci/docker/python-free-threaded-wheel-musllinux-test-imports.dockerfile +++ b/ci/docker/python-free-threaded-wheel-musllinux-test-imports.dockerfile @@ -19,38 +19,32 @@ ARG base FROM ${base} ARG python_version=3.13 +ARG arch=aarch64 +ARG build_date -RUN apk add --no-cache \ +RUN apk update && \ + apk add --no-cache \ bash \ - build-base \ - bzip2-dev \ - g++ \ + curl \ git \ - libffi-dev \ - libnsl-dev \ - libtirpc-dev \ - linux-headers \ - ncurses-dev \ - openssl-dev \ - pkgconf \ + tar \ tzdata \ - zlib-dev + zstd -# Install Python without GIL +# Install Python with free-threading from python-build-standalone +# See available releases at: https://github.com/astral-sh/python-build-standalone/releases RUN set -e; \ case "${python_version}" in \ 3.13) python_patch_version="3.13.9";; \ 3.14) python_patch_version="3.14.0";; \ esac && \ - wget https://github.com/python/cpython/archive/refs/tags/v${python_patch_version}.tar.gz && \ - tar -xzf v${python_patch_version}.tar.gz && \ - rm v${python_patch_version}.tar.gz && \ - cd cpython-${python_patch_version}/ && \ - ./configure --disable-gil --with-ensurepip && \ - make -j && 
\ - make install && \ - cd ../ && \ - rm -rf cpython-${python_patch_version}/ + curl -L -o python.tar.zst \ + https://github.com/astral-sh/python-build-standalone/releases/download/${build_date}/cpython-${python_patch_version}+${build_date}-${arch}-unknown-linux-musl-freethreaded+lto-full.tar.zst && \ + mkdir -p /opt/python && \ + tar -xf python.tar.zst -C /opt/python --strip-components=1 && \ + rm python.tar.zst + +ENV PATH="/opt/python/install/bin:${PATH}" ENV ARROW_PYTHON_VENV /arrow-dev RUN python${python_version}t -m venv ${ARROW_PYTHON_VENV} diff --git a/ci/docker/python-free-threaded-wheel-musllinux-test-unittests.dockerfile b/ci/docker/python-free-threaded-wheel-musllinux-test-unittests.dockerfile index c6873612b86..53ae58c7933 100644 --- a/ci/docker/python-free-threaded-wheel-musllinux-test-unittests.dockerfile +++ b/ci/docker/python-free-threaded-wheel-musllinux-test-unittests.dockerfile @@ -19,38 +19,34 @@ ARG base FROM ${base} ARG python_version=3.13 +ARG arch=aarch64 +ARG build_date -RUN apk add --no-cache \ +RUN apk update && \ + apk add --no-cache \ bash \ build-base \ - bzip2-dev \ + curl \ g++ \ git \ - libffi-dev \ - libnsl-dev \ - libtirpc-dev \ - linux-headers \ - ncurses-dev \ - openssl-dev \ - pkgconf \ + tar \ tzdata \ - zlib-dev + zstd -# Install Python without GIL +# Install Python with free-threading from python-build-standalone +# See available releases at: https://github.com/astral-sh/python-build-standalone/releases RUN set -e; \ case "${python_version}" in \ 3.13) python_patch_version="3.13.9";; \ 3.14) python_patch_version="3.14.0";; \ esac && \ - wget https://github.com/python/cpython/archive/refs/tags/v${python_patch_version}.tar.gz && \ - tar -xzf v${python_patch_version}.tar.gz && \ - rm v${python_patch_version}.tar.gz && \ - cd cpython-${python_patch_version}/ && \ - ./configure --disable-gil --with-ensurepip && \ - make -j && \ - make install && \ - cd ../ && \ - rm -rf cpython-${python_patch_version}/ + curl -L -o 
python.tar.zst \ + https://github.com/astral-sh/python-build-standalone/releases/download/${build_date}/cpython-${python_patch_version}+${build_date}-${arch}-unknown-linux-musl-freethreaded+lto-full.tar.zst && \ + mkdir -p /opt/python && \ + tar -xf python.tar.zst -C /opt/python --strip-components=1 && \ + rm python.tar.zst + +ENV PATH="/opt/python/install/bin:${PATH}" ENV ARROW_PYTHON_VENV /arrow-dev RUN python${python_version}t -m venv ${ARROW_PYTHON_VENV} diff --git a/ci/docker/python-free-threaded-wheel-windows-vs2022.dockerfile b/ci/docker/python-free-threaded-wheel-windows-vs2022.dockerfile deleted file mode 100644 index 77a64fd5c24..00000000000 --- a/ci/docker/python-free-threaded-wheel-windows-vs2022.dockerfile +++ /dev/null @@ -1,53 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# NOTE: You must update PYTHON_WHEEL_WINDOWS_IMAGE_REVISION in .env -# when you update this file. 
- -ARG base -# https://github.com/hadolint/hadolint/wiki/DL3006 -# (Hadolint does not expand variables and thinks '${base}' is an untagged image) -# hadolint ignore=DL3006 -FROM ${base} - -ARG python=3.13 - -RUN (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.1") & \ - (if "%python%"=="3.14" setx PYTHON_VERSION "3.14.0") - -SHELL ["powershell", "-NoProfile", "-Command", "$ErrorActionPreference = 'Stop'; $ProgressPreference = 'SilentlyContinue';"] -RUN $version = $env:PYTHON_VERSION; \ - $filename = 'python-' + $version + '-amd64.exe'; \ - $url = 'https://www.python.org/ftp/python/' + $version + '/' + $filename; \ - Invoke-WebRequest -Uri $url -OutFile $filename; \ - Start-Process -FilePath $filename -ArgumentList '/quiet', 'Include_freethreaded=1' -Wait - -ENV PYTHON_CMD="py -${python}t" - -SHELL ["cmd", "/S", "/C"] -RUN %PYTHON_CMD% -m pip install -U pip setuptools - -COPY python/requirements-wheel-build.txt C:/arrow/python/ -# Cython wheels for 3.13 free-threaded are not released yet -RUN %PYTHON_CMD% -m pip install \ - --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple \ - --pre \ - --prefer-binary \ - cython -RUN %PYTHON_CMD% -m pip install -r C:/arrow/python/requirements-wheel-build.txt - -ENV PYTHON="${python}t" diff --git a/ci/docker/python-wheel-windows-test-vs2022.dockerfile b/ci/docker/python-wheel-windows-test-vs2022.dockerfile index e5d1c517587..73a9e167fea 100644 --- a/ci/docker/python-wheel-windows-test-vs2022.dockerfile +++ b/ci/docker/python-wheel-windows-test-vs2022.dockerfile @@ -42,4 +42,4 @@ RUN %PYTHON_CMD% -m pip install -U pip setuptools COPY python/requirements-wheel-test.txt C:/arrow/python/ RUN %PYTHON_CMD% -m pip install -r C:/arrow/python/requirements-wheel-test.txt -ENV PYTHON=$python +ENV PYTHON=${python} diff --git a/ci/docker/python-wheel-windows-vs2022-base.dockerfile b/ci/docker/python-wheel-windows-vs2022-base.dockerfile index 99dd27b987a..e63b8fc9945 100644 --- 
a/ci/docker/python-wheel-windows-vs2022-base.dockerfile +++ b/ci/docker/python-wheel-windows-vs2022-base.dockerfile @@ -74,13 +74,25 @@ RUN ` # Install choco CLI # -# Switch into Powershell just for this command because choco only provides a -# Powershell installation script. After, we switch back to cmd. +# Switch into Powershell just for the following commands because choco and Python install manager (MSIX) +# only provide a Powershell installation scripts. After, we switch back to cmd. # # See https://chocolatey.org/install#completely-offline-install SHELL ["powershell", "-Command", "$ErrorActionPreference = 'Stop'; $ProgressPreference = 'SilentlyContinue';"] RUN ` - Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1')) + Set-ExecutionPolicy Bypass -Scope Process -Force; ` + [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; ` + iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1')) + +# Install the Python install manager +# +# See https://docs.python.org/dev/using/windows.html#python-install-manager and +# https://www.python.org/ftp/python/pymanager/ +RUN ` + $pymanager_url = 'https://www.python.org/ftp/python/pymanager/python-manager-25.0.msix'; ` + Invoke-WebRequest -Uri $pymanager_url -OutFile 'C:\Windows\pymanager.msix'; ` + Add-AppxPackage C:\Windows\pymanager.msix + SHELL ["cmd", "/S", "/C"] # Install CMake and other tools diff --git a/ci/docker/python-wheel-windows-vs2022.dockerfile b/ci/docker/python-wheel-windows-vs2022.dockerfile index 48d3937a0ed..d4d5e57cd2c 100644 --- a/ci/docker/python-wheel-windows-vs2022.dockerfile +++ b/ci/docker/python-wheel-windows-vs2022.dockerfile @@ -23,16 +23,15 @@ FROM ${base} # Define the full 
version number otherwise choco falls back to patch number 0 (3.10 => 3.10.0) ARG python=3.10 -RUN (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11" && setx PYTHON_CMD "py -3.10") & \ - (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.9" && setx PYTHON_CMD "py -3.11") & \ - (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.10" && setx PYTHON_CMD "py -3.12") & \ - (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.9" && setx PYTHON_CMD "py -3.13") & \ - (if "%python%"=="3.14" setx PYTHON_VERSION "3.14.0" && setx PYTHON_CMD "py -3.14") -RUN choco install -r -y --pre --no-progress python --version=%PYTHON_VERSION% -RUN %PYTHON_CMD% -m pip install -U pip setuptools +ARG python_variant=default +ENV PYTHON_VERSION=${python} +ENV PYTHON_VARIANT=${python_variant} +RUN pymanager install --version %PYTHON_VERSION% --variant %PYTHON_VARIANT% + +RUN py -%PYTHON_VERSION% -m pip install -U pip setuptools COPY python/requirements-wheel-build.txt C:/arrow/python/ -RUN %PYTHON_CMD% -m pip install -r C:/arrow/python/requirements-wheel-build.txt +RUN py -%PYTHON_VERSION% -m pip install -r C:/arrow/python/requirements-wheel-build.txt ENV PYTHON=${python} diff --git a/compose.yaml b/compose.yaml index fcdcbe289c1..84481e1af76 100644 --- a/compose.yaml +++ b/compose.yaml @@ -1228,6 +1228,8 @@ services: args: base: "${ARCH}/alpine:${ALPINE_LINUX}" python_version: ${PYTHON} + build_date: "20251014" # python-build-standalone release date + arch: ${ARCH_ALIAS} context: . dockerfile: ci/docker/python-free-threaded-wheel-musllinux-test-imports.dockerfile cache_from: @@ -1270,6 +1272,8 @@ services: args: base: "${ARCH}/alpine:${ALPINE_LINUX}" python_version: ${PYTHON} + build_date: "20251014" # python-build-standalone release date + arch: ${ARCH_ALIAS} context: . 
dockerfile: ci/docker/python-free-threaded-wheel-musllinux-test-unittests.dockerfile cache_from: @@ -1384,6 +1388,7 @@ services: args: base: ${REPO}:python-wheel-windows-vs2022-base-vcpkg-${VCPKG}-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} python: ${PYTHON} + python_variant: default context: . dockerfile: ci/docker/python-wheel-windows-vs2022.dockerfile # This should make the pushed images reusable, but the image gets rebuilt. @@ -1400,8 +1405,9 @@ services: args: base: ${REPO}:python-wheel-windows-vs2022-base-vcpkg-${VCPKG}-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} python: ${PYTHON} + python_variant: freethreaded context: . - dockerfile: ci/docker/python-free-threaded-wheel-windows-vs2022.dockerfile + dockerfile: ci/docker/python-wheel-windows-vs2022.dockerfile # This should make the pushed images reusable, but the image gets rebuilt. # Uncomment if no local cache is available. # cache_from: From 4bf59c952f1fe0f9277619926d80cb592dda02f5 Mon Sep 17 00:00:00 2001 From: tennisleng <83838474+tennisleng@users.noreply.github.com> Date: Fri, 19 Dec 2025 06:52:45 -0500 Subject: [PATCH 07/17] MINOR: [C++] Fix typos in telemetry and compute comments (#48562) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description Fix two typos in C++ documentation comments: - `cpp/src/arrow/telemetry/logging.h`: "occured" → "occurred" - `cpp/src/arrow/compute/exec.cc`: "hte" → "the" ## Type of change - [ ] Bug fix - [ ] New feature - [x] Documentation / Minor (typo fix) ## Checklist - [x] My code follows the code style of this project - [x] I have updated the documentation accordingly (N/A - comment only) - [x] I have added tests to cover my changes (N/A - comment only) Authored-by: Andrew Leng Signed-off-by: Raúl Cumplido --- cpp/src/arrow/compute/exec.cc | 2 +- cpp/src/arrow/telemetry/logging.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index 
1be398fdae9..411ff0bb026 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -410,7 +410,7 @@ bool ExecSpanIterator::Next(ExecSpan* span) { // The first time this is called, we populate the output span with any // Scalar or Array arguments in the ExecValue struct, and then just // increment array offsets below. If any arguments are ChunkedArray, then - // the internal ArraySpans will see their members updated during hte + // the internal ArraySpans will see their members updated during the // iteration span->values.resize(args_->size()); for (size_t i = 0; i < args_->size(); ++i) { diff --git a/cpp/src/arrow/telemetry/logging.h b/cpp/src/arrow/telemetry/logging.h index 04b39e6c198..ecccc26efda 100644 --- a/cpp/src/arrow/telemetry/logging.h +++ b/cpp/src/arrow/telemetry/logging.h @@ -60,7 +60,7 @@ class ARROW_EXPORT OtelLogger : public util::Logger { class ARROW_EXPORT OtelLoggerProvider { public: /// \brief Attempt to flush the log record processor associated with the provider - /// \return `true` if the flush occured + /// \return `true` if the flush occurred static bool Flush(std::chrono::microseconds timeout = std::chrono::microseconds::max()); static Result> MakeLogger( From b796fd2e095ad3cb05330ba28cb5f85f1adfe14a Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Fri, 19 Dec 2025 19:41:45 +0100 Subject: [PATCH 08/17] MINOR: [C++] Fix C++17-related comments (#48598) ### Rationale for this change As a followup to https://github.com/apache/arrow/pull/48414, fix the comments that I had forgotten to update. ### Are these changes tested? No testing required. ### Are there any user-facing changes? No. 
Authored-by: Antoine Pitrou Signed-off-by: Nic Crane --- cpp/cmake_modules/SetupCxxFlags.cmake | 2 +- cpp/src/arrow/CMakeLists.txt | 2 +- matlab/README.md | 2 +- matlab/tools/cmake/BuildMatlabArrowInterface.cmake | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index c92daf32690..f4ff0bded3d 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -139,7 +139,7 @@ if(NOT DEFINED CMAKE_C_STANDARD) set(CMAKE_C_STANDARD 11) endif() -# This ensures that things like c++17 get passed correctly +# This ensures that a standard higher than the minimum can be passed correctly if(NOT DEFINED CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 20) elseif(${CMAKE_CXX_STANDARD} VERSION_LESS 20) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index a46db60321a..df9b783d531 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -1180,7 +1180,7 @@ endif() foreach(LIB_TARGET ${ARROW_LIBRARIES}) target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_EXPORTING) - # C++17 is required to compile against Arrow C++ headers and libraries + # C++20 is required to compile against Arrow C++ headers and libraries target_compile_features(${LIB_TARGET} PUBLIC cxx_std_20) endforeach() diff --git a/matlab/README.md b/matlab/README.md index 30684625e9c..264d7f6ece5 100644 --- a/matlab/README.md +++ b/matlab/README.md @@ -64,7 +64,7 @@ To build the MATLAB Interface to Apache Arrow from source, the following softwar 1. [MATLAB](https://www.mathworks.com/products/get-matlab.html) 2. [CMake](https://cmake.org/cmake/help/latest/) -3. C++ compiler which supports C++17 (e.g. [`gcc`](https://gcc.gnu.org/) on Linux, [`Xcode`](https://developer.apple.com/xcode/) on macOS, or [`Visual Studio`](https://visualstudio.microsoft.com/) on Windows) +3. C++ compiler which supports C++20 (e.g. 
[`gcc`](https://gcc.gnu.org/) on Linux, [`Xcode`](https://developer.apple.com/xcode/) on macOS, or [`Visual Studio`](https://visualstudio.microsoft.com/) on Windows) 4. [Git](https://git-scm.com/) ## Setup diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake index cc021b3400b..8c99d04af5b 100644 --- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake +++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake @@ -130,7 +130,7 @@ libmexclass_client_add_proxy_library( INCLUDE_DIRS ${MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_LIBRARY_INCLUDE_DIRS} LINK_LIBRARIES arrow_shared ) -# Use C++17 +# Use C++20 target_compile_features(${MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_LIBRARY_NAME} PRIVATE cxx_std_20) target_compile_definitions(${MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_LIBRARY_NAME} PRIVATE ARROW_MATLAB_EXPORTING) From 58da4ed7e7df329e8a12595a418d721eede88629 Mon Sep 17 00:00:00 2001 From: kilavvy <140459108+kilavvy@users.noreply.github.com> Date: Fri, 19 Dec 2025 19:57:23 +0100 Subject: [PATCH 09/17] MINOR: Fix Typos in Documentation for Grouped Aggregations and Cache Locality (#46793) **Description:** This pull request corrects minor typos in the documentation files: - In `docs/source/cpp/compute.rst`, the word "invokable" is now spelled correctly. - In `docs/source/developers/cpp/acero.rst`, the word "initially" is now spelled correctly. These changes improve the clarity and professionalism of the documentation without affecting any code functionality. Authored-by: kilavvy <140459108+kilavvy@users.noreply.github.com> Signed-off-by: Nic Crane --- docs/source/cpp/compute.rst | 2 +- docs/source/developers/cpp/acero.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index dc0a2f90311..e4092af70cd 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -315,7 +315,7 @@ the input to a single output value. 
Grouped Aggregations ("group by") ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Grouped aggregations are not directly invokable, but are used as part of a +Grouped aggregations are not directly invocable, but are used as part of a SQL-style "group by" operation. Like scalar aggregations, grouped aggregations reduce multiple input values to a single output value. Instead of aggregating all values of the input, however, grouped aggregations partition the input diff --git a/docs/source/developers/cpp/acero.rst b/docs/source/developers/cpp/acero.rst index 0492007904f..d024a2d03d6 100644 --- a/docs/source/developers/cpp/acero.rst +++ b/docs/source/developers/cpp/acero.rst @@ -328,7 +328,7 @@ this leads to problems with cache locality. For example, let's assume we have a exec nodes, scan, project, and then filter (this is a very common use case). Now let's assume there are 100 batches. In a task-per-operator model we would have tasks like "Scan Batch 5", "Project Batch 5", and "Filter Batch 5". Each of those tasks is potentially going to access the same data. For example, maybe the ``project`` and ``filter`` nodes need -to read the same column. A column which is intially created in a decode phase of the ``scan`` node. To maximize cache +to read the same column. A column which is initially created in a decode phase of the ``scan`` node. To maximize cache utilization we would need to carefully schedule our tasks to ensure that all three of those tasks are run consecutively and assigned to the same CPU core. 
From d16e42ed56f363ce477e5634f49f935ab3ea0ebd Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Sat, 20 Dec 2025 05:05:19 +0900 Subject: [PATCH 10/17] GH-48463: [Python] Improve error message in CheckTypeExact arrow_to_pandas.cc (#48464) ### Rationale for this change https://github.com/apache/arrow/commit/3b000b7062b5614a02134d1a90ae381b937aac07 first introduced this todo as below https://github.com/apache/arrow/blob/0bfbd19bce3e10163537b349f9205b635c87eea7/python/pyarrow/src/arrow/python/arrow_to_pandas.cc#L1130 Then, the error messages are actually handled at the higher level if I am not mistaken (e.g., 05bc63c3b76a5fc865434f596c63ff0f26388f69 introduced `_sanitize_arrays` and casting at `asarray` ). So the end users would not reach this. This is more for a sanity check and developers to debug. This is to improve the error message for the sanity check and developers. ### What changes are included in this PR? This PR improves the error message in CheckTypeExact arrow_to_pandas.cc by adding the corresponding NumPy string type. ### Are these changes tested? Unittest is added, and manually run as below: ``` pytest pyarrow/tests/test_cpp_internals.py::test_get_numpy_type_name -v ``` ### Are there any user-facing changes? No. 
* GitHub Issue: #48463 Authored-by: Hyukjin Kwon Signed-off-by: Rok Mihevc --- .../pyarrow/src/arrow/python/arrow_to_pandas.cc | 5 +++-- python/pyarrow/src/arrow/python/python_test.cc | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index ed4f394362a..d392d099719 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -1127,8 +1127,9 @@ class TypedPandasWriter : public PandasWriter { Status CheckTypeExact(const DataType& type, Type::type expected) { if (type.id() != expected) { - // TODO(wesm): stringify NumPy / pandas type - return Status::NotImplemented("Cannot write Arrow data of type ", type.ToString()); + return Status::NotImplemented("Cannot write Arrow data of type ", type.ToString(), + " to pandas block with NumPy type ", + GetNumPyTypeName(NPY_TYPE)); } return Status::OK(); } diff --git a/python/pyarrow/src/arrow/python/python_test.cc b/python/pyarrow/src/arrow/python/python_test.cc index f2f9c4791db..85cc6c9a969 100644 --- a/python/pyarrow/src/arrow/python/python_test.cc +++ b/python/pyarrow/src/arrow/python/python_test.cc @@ -32,6 +32,7 @@ #include "arrow/python/decimal.h" #include "arrow/python/helpers.h" #include "arrow/python/numpy_convert.h" +#include "arrow/python/numpy_internal.h" #include "arrow/python/numpy_interop.h" #include "arrow/python/python_test.h" #include "arrow/python/python_to_arrow.h" @@ -847,6 +848,21 @@ Status TestUpdateWithNaN() { return Status::OK(); } +Status TestGetNumPyTypeName() { + ASSERT_EQ(GetNumPyTypeName(NPY_BOOL), "bool"); + ASSERT_EQ(GetNumPyTypeName(NPY_INT8), "int8"); + ASSERT_EQ(GetNumPyTypeName(NPY_INT16), "int16"); + ASSERT_EQ(GetNumPyTypeName(NPY_INT32), "int32"); + ASSERT_EQ(GetNumPyTypeName(NPY_INT64), "int64"); + ASSERT_EQ(GetNumPyTypeName(NPY_UINT8), "uint8"); + ASSERT_EQ(GetNumPyTypeName(NPY_UINT16), 
"uint16"); + ASSERT_EQ(GetNumPyTypeName(NPY_UINT32), "uint32"); + ASSERT_EQ(GetNumPyTypeName(NPY_UINT64), "uint64"); + ASSERT_EQ(GetNumPyTypeName(NPY_FLOAT32), "float32"); + ASSERT_EQ(GetNumPyTypeName(NPY_FLOAT64), "float64"); + return Status::OK(); +} + } // namespace std::vector GetCppTestCases() { @@ -886,6 +902,7 @@ std::vector GetCppTestCases() { TestMixedPrecisionAndScaleSequenceConvert}, {"test_simple_inference", TestSimpleInference}, {"test_update_with_nan", TestUpdateWithNaN}, + {"test_get_numpy_type_name", TestGetNumPyTypeName}, }; } From 4a286ebf7d8f11f7eee48b86a7e8ea20c64a2da7 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 19 Dec 2025 17:28:25 -0500 Subject: [PATCH 11/17] GH-48570: [C++] Add Missing Fuzz Sources to Meson configuration (#48571) ### Rationale for this change The meson configuration on main is broken - this fixes it ### What changes are included in this PR? Missing fuzzing targets are added to the configuration ### Are these changes tested? Yes ### Are there any user-facing changes? 
No * GitHub Issue: #48570 Authored-by: Will Ayd Signed-off-by: Sutou Kouhei --- cpp/src/arrow/meson.build | 1 + cpp/src/parquet/meson.build | 1 + 2 files changed, 2 insertions(+) diff --git a/cpp/src/arrow/meson.build b/cpp/src/arrow/meson.build index 43050aa1597..48d01db729d 100644 --- a/cpp/src/arrow/meson.build +++ b/cpp/src/arrow/meson.build @@ -192,6 +192,7 @@ arrow_util_srcs = [ 'util/float16.cc', 'util/formatting.cc', 'util/future.cc', + 'util/fuzz_internal.cc', 'util/hashing.cc', 'util/int_util.cc', 'util/io_util.cc', diff --git a/cpp/src/parquet/meson.build b/cpp/src/parquet/meson.build index e6ff43a0bae..a75a1b94d08 100644 --- a/cpp/src/parquet/meson.build +++ b/cpp/src/parquet/meson.build @@ -17,6 +17,7 @@ parquet_srcs = files( '../generated/parquet_types.cpp', + 'arrow/fuzz_internal.cc', 'arrow/path_internal.cc', 'arrow/reader.cc', 'arrow/reader_internal.cc', From 4d2cd878a53bb5a0c519d69f416d20895daa7027 Mon Sep 17 00:00:00 2001 From: Sten Larsson Date: Fri, 19 Dec 2025 23:37:26 +0100 Subject: [PATCH 12/17] GH-48486: [GLib][Ruby] Add GArrowListFlattenOptions (#48487) ### Rationale for this change The `ListFlattenOptions` class is not available in GLib/Ruby, and it is used together with the `list_flatten` compute function. ### What changes are included in this PR? This adds the `ListFlattenOptions` class to GLib. ### Are these changes tested? Yes, with Ruby unit tests. ### Are there any user-facing changes? Yes, a new class. 
* GitHub Issue: #48486 Authored-by: Sten Larsson Signed-off-by: Sutou Kouhei --- c_glib/arrow-glib/compute.cpp | 120 +++++++++++++++++++++++ c_glib/arrow-glib/compute.h | 16 +++ c_glib/arrow-glib/compute.hpp | 6 ++ c_glib/test/test-list-flatten-options.rb | 54 ++++++++++ 4 files changed, 196 insertions(+) create mode 100644 c_glib/test/test-list-flatten-options.rb diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index c4446c5c7ed..7f1cb7171c3 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -276,6 +276,9 @@ G_BEGIN_DECLS * #GArrowJoinOptions is a class to customize the `binary_join_element_wise` * function. * + * #GArrowListFlattenOptions is a class to customize the `list_flatten` + * function. + * * There are many functions to compute data on an array. */ @@ -7400,6 +7403,101 @@ garrow_join_options_new(void) return GARROW_JOIN_OPTIONS(options); } +enum { + PROP_LIST_FLATTEN_OPTIONS_RECURSIVE = 1, +}; + +G_DEFINE_TYPE(GArrowListFlattenOptions, + garrow_list_flatten_options, + GARROW_TYPE_FUNCTION_OPTIONS) + +static void +garrow_list_flatten_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto options = garrow_list_flatten_options_get_raw(GARROW_LIST_FLATTEN_OPTIONS(object)); + + switch (prop_id) { + case PROP_LIST_FLATTEN_OPTIONS_RECURSIVE: + options->recursive = g_value_get_boolean(value); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_list_flatten_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto options = garrow_list_flatten_options_get_raw(GARROW_LIST_FLATTEN_OPTIONS(object)); + + switch (prop_id) { + case PROP_LIST_FLATTEN_OPTIONS_RECURSIVE: + g_value_set_boolean(value, options->recursive); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void 
+garrow_list_flatten_options_init(GArrowListFlattenOptions *object) +{ + auto priv = GARROW_FUNCTION_OPTIONS_GET_PRIVATE(object); + priv->options = static_cast( + new arrow::compute::ListFlattenOptions()); +} + +static void +garrow_list_flatten_options_class_init(GArrowListFlattenOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->set_property = garrow_list_flatten_options_set_property; + gobject_class->get_property = garrow_list_flatten_options_get_property; + + arrow::compute::ListFlattenOptions options; + + GParamSpec *spec; + /** + * GArrowListFlattenOptions:recursive: + * + * If true, the list is flattened recursively until a non-list array is formed. + * + * Since: 23.0.0 + */ + spec = g_param_spec_boolean( + "recursive", + "Recursive", + "If true, the list is flattened recursively until a non-list array is formed", + options.recursive, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, + PROP_LIST_FLATTEN_OPTIONS_RECURSIVE, + spec); +} + +/** + * garrow_list_flatten_options_new: + * + * Returns: A newly created #GArrowListFlattenOptions. 
+ * + * Since: 23.0.0 + */ +GArrowListFlattenOptions * +garrow_list_flatten_options_new(void) +{ + auto options = g_object_new(GARROW_TYPE_LIST_FLATTEN_OPTIONS, NULL); + return GARROW_LIST_FLATTEN_OPTIONS(options); +} + G_END_DECLS arrow::Result @@ -7574,6 +7672,11 @@ garrow_function_options_new_raw(const arrow::compute::FunctionOptions *arrow_opt static_cast(arrow_options); auto options = garrow_join_options_new_raw(arrow_join_options); return GARROW_FUNCTION_OPTIONS(options); + } else if (arrow_type_name == "ListFlattenOptions") { + const auto arrow_list_flatten_options = + static_cast(arrow_options); + auto options = garrow_list_flatten_options_new_raw(arrow_list_flatten_options); + return GARROW_FUNCTION_OPTIONS(options); } else { auto options = g_object_new(GARROW_TYPE_FUNCTION_OPTIONS, NULL); return GARROW_FUNCTION_OPTIONS(options); @@ -8250,3 +8353,20 @@ garrow_join_options_get_raw(GArrowJoinOptions *options) return static_cast( garrow_function_options_get_raw(GARROW_FUNCTION_OPTIONS(options))); } + +GArrowListFlattenOptions * +garrow_list_flatten_options_new_raw( + const arrow::compute::ListFlattenOptions *arrow_options) +{ + return GARROW_LIST_FLATTEN_OPTIONS(g_object_new(GARROW_TYPE_LIST_FLATTEN_OPTIONS, + "recursive", + arrow_options->recursive, + NULL)); +} + +arrow::compute::ListFlattenOptions * +garrow_list_flatten_options_get_raw(GArrowListFlattenOptions *options) +{ + return static_cast( + garrow_function_options_get_raw(GARROW_FUNCTION_OPTIONS(options))); +} diff --git a/c_glib/arrow-glib/compute.h b/c_glib/arrow-glib/compute.h index 25b12e76c23..f6cbd77ee57 100644 --- a/c_glib/arrow-glib/compute.h +++ b/c_glib/arrow-glib/compute.h @@ -1322,4 +1322,20 @@ GARROW_AVAILABLE_IN_23_0 GArrowJoinOptions * garrow_join_options_new(void); +#define GARROW_TYPE_LIST_FLATTEN_OPTIONS (garrow_list_flatten_options_get_type()) +GARROW_AVAILABLE_IN_23_0 +G_DECLARE_DERIVABLE_TYPE(GArrowListFlattenOptions, + garrow_list_flatten_options, + GARROW, + 
LIST_FLATTEN_OPTIONS, + GArrowFunctionOptions) +struct _GArrowListFlattenOptionsClass +{ + GArrowFunctionOptionsClass parent_class; +}; + +GARROW_AVAILABLE_IN_23_0 +GArrowListFlattenOptions * +garrow_list_flatten_options_new(void); + G_END_DECLS diff --git a/c_glib/arrow-glib/compute.hpp b/c_glib/arrow-glib/compute.hpp index 01a6d54f640..17f7369e7e8 100644 --- a/c_glib/arrow-glib/compute.hpp +++ b/c_glib/arrow-glib/compute.hpp @@ -220,3 +220,9 @@ GArrowJoinOptions * garrow_join_options_new_raw(const arrow::compute::JoinOptions *arrow_options); arrow::compute::JoinOptions * garrow_join_options_get_raw(GArrowJoinOptions *options); + +GArrowListFlattenOptions * +garrow_list_flatten_options_new_raw( + const arrow::compute::ListFlattenOptions *arrow_options); +arrow::compute::ListFlattenOptions * +garrow_list_flatten_options_get_raw(GArrowListFlattenOptions *options); diff --git a/c_glib/test/test-list-flatten-options.rb b/c_glib/test/test-list-flatten-options.rb new file mode 100644 index 00000000000..3f21c2e9e54 --- /dev/null +++ b/c_glib/test/test-list-flatten-options.rb @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestListFlattenOptions < Test::Unit::TestCase + include Helper::Buildable + + def setup + @options = Arrow::ListFlattenOptions.new + end + + def test_recursive_property + assert do + !@options.recursive? + end + @options.recursive = true + assert do + @options.recursive? + end + end + + def test_list_flatten_function_recursive + list_data_type = Arrow::ListDataType.new(Arrow::Field.new("value", Arrow::Int8DataType.new)) + nested_list = build_list_array(list_data_type, [[[1, 2], [3]], [[4, 5]]]) + + args = [ + Arrow::ArrayDatum.new(nested_list), + ] + list_flatten_function = Arrow::Function.find("list_flatten") + + @options.recursive = false + result = list_flatten_function.execute(args, @options).value + assert_equal(build_list_array(Arrow::Int8DataType.new, [[1, 2], [3], [4, 5]]), + result) + + @options.recursive = true + result = list_flatten_function.execute(args, @options).value + assert_equal(build_int8_array([1, 2, 3, 4, 5]), + result) + end +end From e1e4c0eef93d0ac80b4287682967c70c6ef9515d Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 20 Dec 2025 21:47:48 +0900 Subject: [PATCH 13/17] GH-48602: [Ruby] Add support for reading interval arrays (#48603) ### Rationale for this change There are `YEAR_MONTH`, `DAY_TIME` and `MONTH_DAY_NANO` variants. ### What changes are included in this PR? * Add `ArrowFormat::YearMonthIntervalType` * Add `ArrowFormat::YearMonthIntervalArray` * Add `ArrowFormat::DayTimeIntervalType` * Add `ArrowFormat::DayTimeIntervalArray` * Add `ArrowFormat::MonthDayNanoIntervalType` * Add `ArrowFormat::MonthDayNanoIntervalArray` ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. 
* GitHub Issue: #48602 Lead-authored-by: Sutou Kouhei Co-authored-by: Sutou Kouhei Co-authored-by: Hiroyuki Sato Signed-off-by: Sutou Kouhei --- .../lib/arrow-format/array.rb | 33 ++++++++++ .../lib/arrow-format/file-reader.rb | 11 ++++ .../red-arrow-format/lib/arrow-format/type.rb | 36 ++++++++++ .../red-arrow-format/test/test-file-reader.rb | 65 +++++++++++++++++++ 4 files changed, 145 insertions(+) diff --git a/ruby/red-arrow-format/lib/arrow-format/array.rb b/ruby/red-arrow-format/lib/arrow-format/array.rb index 72cce5cbbea..ac96038f194 100644 --- a/ruby/red-arrow-format/lib/arrow-format/array.rb +++ b/ruby/red-arrow-format/lib/arrow-format/array.rb @@ -189,6 +189,39 @@ def to_a end end + class IntervalArray < TemporalArray + end + + class YearMonthIntervalArray < IntervalArray + def to_a + apply_validity(@values_buffer.values(:s32, 0, @size)) + end + end + + class DayTimeIntervalArray < IntervalArray + def to_a + values = @values_buffer. + each(:s32, 0, @size * 2). + each_slice(2). 
+ collect do |(_, day), (_, time)| + [day, time] + end + apply_validity(values) + end + end + + class MonthDayNanoIntervalArray < IntervalArray + def to_a + buffer_types = [:s32, :s32, :s64] + value_size = IO::Buffer.size_of(buffer_types) + values = @size.times.collect do |i| + offset = value_size * i + @values_buffer.get_values(buffer_types, offset) + end + apply_validity(values) + end + end + class DurationArray < TemporalArray def to_a apply_validity(@values_buffer.values(:s64, 0, @size)) diff --git a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb index 71f6e9cedae..29c7f5edd49 100644 --- a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb +++ b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb @@ -31,6 +31,8 @@ require_relative "org/apache/arrow/flatbuf/floating_point" require_relative "org/apache/arrow/flatbuf/footer" require_relative "org/apache/arrow/flatbuf/int" +require_relative "org/apache/arrow/flatbuf/interval" +require_relative "org/apache/arrow/flatbuf/interval_unit" require_relative "org/apache/arrow/flatbuf/large_binary" require_relative "org/apache/arrow/flatbuf/large_list" require_relative "org/apache/arrow/flatbuf/large_utf8" @@ -215,6 +217,15 @@ def read_field(fb_field) when Org::Apache::Arrow::Flatbuf::Timestamp unit = fb_type.unit.name.downcase.to_sym type = TimestampType.new(unit, fb_type.timezone) + when Org::Apache::Arrow::Flatbuf::Interval + case fb_type.unit + when Org::Apache::Arrow::Flatbuf::IntervalUnit::YEAR_MONTH + type = YearMonthIntervalType.new + when Org::Apache::Arrow::Flatbuf::IntervalUnit::DAY_TIME + type = DayTimeIntervalType.new + when Org::Apache::Arrow::Flatbuf::IntervalUnit::MONTH_DAY_NANO + type = MonthDayNanoIntervalType.new + end when Org::Apache::Arrow::Flatbuf::Duration unit = fb_type.unit.name.downcase.to_sym type = DurationType.new(unit) diff --git a/ruby/red-arrow-format/lib/arrow-format/type.rb b/ruby/red-arrow-format/lib/arrow-format/type.rb 
index abcefca3bf5..c6679660122 100644 --- a/ruby/red-arrow-format/lib/arrow-format/type.rb +++ b/ruby/red-arrow-format/lib/arrow-format/type.rb @@ -315,6 +315,42 @@ def build_array(size, validity_buffer, values_buffer) end end + class IntervalType < TemporalType + end + + class YearMonthIntervalType < IntervalType + def initialize + super("YearMonthInterval") + end + + def build_array(size, validity_buffer, values_buffer) + YearMonthIntervalArray.new(self, size, validity_buffer, values_buffer) + end + end + + class DayTimeIntervalType < IntervalType + def initialize + super("DayTimeInterval") + end + + def build_array(size, validity_buffer, values_buffer) + DayTimeIntervalArray.new(self, size, validity_buffer, values_buffer) + end + end + + class MonthDayNanoIntervalType < IntervalType + def initialize + super("MonthDayNanoInterval") + end + + def build_array(size, validity_buffer, values_buffer) + MonthDayNanoIntervalArray.new(self, + size, + validity_buffer, + values_buffer) + end + end + class DurationType < TemporalType attr_reader :unit def initialize(unit) diff --git a/ruby/red-arrow-format/test/test-file-reader.rb b/ruby/red-arrow-format/test/test-file-reader.rb index 12ed95846c3..6198f0cb96d 100644 --- a/ruby/red-arrow-format/test/test-file-reader.rb +++ b/ruby/red-arrow-format/test/test-file-reader.rb @@ -489,6 +489,71 @@ def test_type end end + sub_test_case("YearMonthInterval") do + def build_array + Arrow::MonthIntervalArray.new([0, nil, 100]) + end + + def test_read + assert_equal([{"value" => [0, nil, 100]}], + read) + end + end + + sub_test_case("DayTimeInterval") do + def build_array + Arrow::DayTimeIntervalArray.new([ + {day: 1, millisecond: 100}, + nil, + {day: 3, millisecond: 300}, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + [1, 100], + nil, + [3, 300], + ], + }, + ], + read) + end + end + + sub_test_case("MonthDayNanoInterval") do + def build_array + Arrow::MonthDayNanoIntervalArray.new([ + { + month: 1, + day: 1, + 
nanosecond: 100, + }, + nil, + { + month: 3, + day: 3, + nanosecond: 300, + }, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + [1, 1, 100], + nil, + [3, 3, 300], + ], + }, + ], + read) + end + end + sub_test_case("Duration(:second)") do def build_array Arrow::DurationArray.new(:second, [0, nil, 100]) From bcaaf8288a8f1197b28c01c6735d0bd06ef39832 Mon Sep 17 00:00:00 2001 From: hypsakata <46911464+hypsakata@users.noreply.github.com> Date: Sat, 20 Dec 2025 21:49:01 +0900 Subject: [PATCH 14/17] GH-48478: [Ruby] Fix Ruby list inference for nested non-negative integer arrays (#48584) ### Rationale for this change When building an `Arrow::Table` from a Ruby Hash passed to `Arrow::Table.new`, nested `Integer` arrays are incorrectly inferred as `string` (utf8) if all values are non-negative. This behavior is unexpected; nested integer arrays should be consistently represented as a list type (e.g., `list` or `list`) rather than falling back to UTF-8 strings. ### What changes are included in this PR? This PR modifies the logic in `detect_builder_info()`, specifically the `when ::Array` block, to correctly identify nested non-negative integer arrays as list arrays. The change ensures that if `sub_builder_info` contains a valid `:builder`, it will be used even if `sub_builder_info` does not yet indicate that the type has been "detected." ### Are these changes tested? Yes. (`ruby ruby/red-arrow/test/run-test.rb`) ### Are there any user-facing changes? Yes. 
GitHub Issue: Closes #48478 * GitHub Issue: #48478 Authored-by: hypsakata <46911464+hypsakata@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- ruby/red-arrow/lib/arrow/array-builder.rb | 8 +++-- ruby/red-arrow/test/test-array-builder.rb | 40 +++++++++++++++++++++++ 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/ruby/red-arrow/lib/arrow/array-builder.rb b/ruby/red-arrow/lib/arrow/array-builder.rb index 876fd71120b..2ccf50f3c1b 100644 --- a/ruby/red-arrow/lib/arrow/array-builder.rb +++ b/ruby/red-arrow/lib/arrow/array-builder.rb @@ -155,12 +155,14 @@ def detect_builder_info(value, builder_info) sub_builder_info = detect_builder_info(sub_value, sub_builder_info) break if sub_builder_info and sub_builder_info[:detected] end - if sub_builder_info and sub_builder_info[:detected] - sub_value_data_type = sub_builder_info[:builder].value_data_type + if sub_builder_info + sub_builder = sub_builder_info[:builder] + return builder_info unless sub_builder + sub_value_data_type = sub_builder.value_data_type field = Field.new("item", sub_value_data_type) { builder: ListArrayBuilder.new(ListDataType.new(field)), - detected: true, + detected: sub_builder_info[:detected], } else builder_info diff --git a/ruby/red-arrow/test/test-array-builder.rb b/ruby/red-arrow/test/test-array-builder.rb index fb48aba8a42..7a2d42e54b3 100644 --- a/ruby/red-arrow/test/test-array-builder.rb +++ b/ruby/red-arrow/test/test-array-builder.rb @@ -146,6 +146,46 @@ def assert_build(builder_class, raw_array) ["Apache Arrow"], ]) end + + test("lists") do + values = [ + [0, 1, 2], + [3, 4], + ] + array = Arrow::Array.new(values) + data_type = Arrow::ListDataType.new(Arrow::UInt8DataType.new) + assert_equal({ + data_type: data_type, + values: [ + [0, 1, 2], + [3, 4], + ], + }, + { + data_type: array.value_data_type, + values: array.to_a, + }) + end + + test("lists") do + values = [ + [0, -1, 2], + [3, 4], + ] + array = Arrow::Array.new(values) + data_type = 
Arrow::ListDataType.new(Arrow::Int8DataType.new) + assert_equal({ + data_type: data_type, + values: [ + [0, -1, 2], + [3, 4], + ], + }, + { + data_type: array.value_data_type, + values: array.to_a, + }) + end end sub_test_case("specific builder") do From 81b03ca14e69b9200e59eb3c216a974d1248c77c Mon Sep 17 00:00:00 2001 From: Sten Larsson Date: Sun, 21 Dec 2025 02:03:55 +0100 Subject: [PATCH 15/17] GH-48610: [Ruby] Add FixedSizeListArray glue (#48609) ### Rationale for this change When testing ListSliceOptions I noticed that some glue was missing from the Ruby code. ### What changes are included in this PR? This adds the `FixedSizeListArrayBuilder.build` method in Ruby. ### Are these changes tested? Yes. ### Are there any user-facing changes? This makes it easier to create `FixedSizeListArray` objects in Ruby. * GitHub Issue: #48610 Authored-by: Sten Larsson Signed-off-by: Sutou Kouhei --- .../arrow/fixed-size-list-array-builder.rb | 29 +++++++++++++++++ ruby/red-arrow/lib/arrow/libraries.rb | 1 + .../test/test-fixed-size-list-array.rb | 31 +++++++++++++++++++ 3 files changed, 61 insertions(+) create mode 100644 ruby/red-arrow/lib/arrow/fixed-size-list-array-builder.rb create mode 100644 ruby/red-arrow/test/test-fixed-size-list-array.rb diff --git a/ruby/red-arrow/lib/arrow/fixed-size-list-array-builder.rb b/ruby/red-arrow/lib/arrow/fixed-size-list-array-builder.rb new file mode 100644 index 00000000000..84c6f13dca0 --- /dev/null +++ b/ruby/red-arrow/lib/arrow/fixed-size-list-array-builder.rb @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module Arrow + class FixedSizeListArrayBuilder + class << self + def build(data_type, values) + builder = new(data_type) + builder.build(values) + end + end + + prepend ListValuesAppendable + end +end diff --git a/ruby/red-arrow/lib/arrow/libraries.rb b/ruby/red-arrow/lib/arrow/libraries.rb index 8427cd83ca9..3e1b4eb3f8a 100644 --- a/ruby/red-arrow/lib/arrow/libraries.rb +++ b/ruby/red-arrow/lib/arrow/libraries.rb @@ -69,6 +69,7 @@ require_relative "file-system" require_relative "fixed-size-binary-array" require_relative "fixed-size-binary-array-builder" +require_relative "fixed-size-list-array-builder" require_relative "function" require_relative "group" require_relative "half-float" diff --git a/ruby/red-arrow/test/test-fixed-size-list-array.rb b/ruby/red-arrow/test/test-fixed-size-list-array.rb new file mode 100644 index 00000000000..0436c56dcfd --- /dev/null +++ b/ruby/red-arrow/test/test-fixed-size-list-array.rb @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class FixedSizeListArrayTest < Test::Unit::TestCase + sub_test_case(".new") do + test("build") do + data_type = [:fixed_size_list, :int8, 2] + values = [ + [1, 2], + [3, 4], + nil, + ] + array = Arrow::FixedSizeListArray.new(data_type, values) + assert_equal(values, array.map { |v| v&.to_a }) + end + end +end From d12020e45719647f0ba12af9827210a8ceb84366 Mon Sep 17 00:00:00 2001 From: Sten Larsson Date: Mon, 22 Dec 2025 01:58:19 +0100 Subject: [PATCH 16/17] GH-48492: [GLib][Ruby] Add MapLookupOptions (#48513) ### Rationale for this change The `MapLookupOptions` class is not available in GLib/Ruby, and it is used together with the `map_lookup` compute function. ### What changes are included in this PR? This adds the `MapLookupOptions` class to GLib. ### Are these changes tested? Yes, with Ruby unit tests. ### Are there any user-facing changes? Yes, a new class. * GitHub Issue: #48492 Authored-by: Sten Larsson Signed-off-by: Sutou Kouhei --- c_glib/arrow-glib/compute.cpp | 196 +++++++++++++++++++++++++ c_glib/arrow-glib/compute.h | 34 +++++ c_glib/arrow-glib/compute.hpp | 5 + c_glib/test/test-map-lookup-options.rb | 73 +++++++++ 4 files changed, 308 insertions(+) create mode 100644 c_glib/test/test-map-lookup-options.rb diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index 7f1cb7171c3..c620b0eb512 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -279,6 +279,9 @@ G_BEGIN_DECLS * #GArrowListFlattenOptions is a class to customize the `list_flatten` * function. 
* + * #GArrowMapLookupOptions is a class to customize the `map_lookup` + * function. + * * There are many functions to compute data on an array. */ @@ -7498,6 +7501,169 @@ garrow_list_flatten_options_new(void) return GARROW_LIST_FLATTEN_OPTIONS(options); } +typedef struct GArrowMapLookupOptionsPrivate_ +{ + GArrowScalar *query_key; +} GArrowMapLookupOptionsPrivate; + +enum { + PROP_MAP_LOOKUP_OPTIONS_QUERY_KEY = 1, + PROP_MAP_LOOKUP_OPTIONS_OCCURRENCE, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowMapLookupOptions, + garrow_map_lookup_options, + GARROW_TYPE_FUNCTION_OPTIONS) + +#define GARROW_MAP_LOOKUP_OPTIONS_GET_PRIVATE(object) \ + static_cast( \ + garrow_map_lookup_options_get_instance_private(GARROW_MAP_LOOKUP_OPTIONS(object))) + +static void +garrow_map_lookup_options_dispose(GObject *object) +{ + auto priv = GARROW_MAP_LOOKUP_OPTIONS_GET_PRIVATE(object); + + if (priv->query_key) { + g_object_unref(priv->query_key); + priv->query_key = NULL; + } + + G_OBJECT_CLASS(garrow_map_lookup_options_parent_class)->dispose(object); +} + +static void +garrow_map_lookup_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_MAP_LOOKUP_OPTIONS_GET_PRIVATE(object); + auto options = garrow_map_lookup_options_get_raw(GARROW_MAP_LOOKUP_OPTIONS(object)); + + switch (prop_id) { + case PROP_MAP_LOOKUP_OPTIONS_QUERY_KEY: + { + auto query_key = g_value_get_object(value); + if (priv->query_key != query_key) { + if (priv->query_key) { + g_object_unref(priv->query_key); + } + priv->query_key = GARROW_SCALAR(query_key); + if (priv->query_key) { + g_object_ref(priv->query_key); + options->query_key = garrow_scalar_get_raw(priv->query_key); + } else { + options->query_key = nullptr; + } + } + } + break; + case PROP_MAP_LOOKUP_OPTIONS_OCCURRENCE: + options->occurrence = + static_cast(g_value_get_enum(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void 
+garrow_map_lookup_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_MAP_LOOKUP_OPTIONS_GET_PRIVATE(object); + auto options = garrow_map_lookup_options_get_raw(GARROW_MAP_LOOKUP_OPTIONS(object)); + + switch (prop_id) { + case PROP_MAP_LOOKUP_OPTIONS_QUERY_KEY: + g_value_set_object(value, priv->query_key); + break; + case PROP_MAP_LOOKUP_OPTIONS_OCCURRENCE: + g_value_set_enum(value, static_cast(options->occurrence)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_map_lookup_options_init(GArrowMapLookupOptions *object) +{ + auto priv = GARROW_FUNCTION_OPTIONS_GET_PRIVATE(object); + priv->options = static_cast( + new arrow::compute::MapLookupOptions()); +} + +static void +garrow_map_lookup_options_class_init(GArrowMapLookupOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = garrow_map_lookup_options_dispose; + gobject_class->set_property = garrow_map_lookup_options_set_property; + gobject_class->get_property = garrow_map_lookup_options_get_property; + + arrow::compute::MapLookupOptions options; + + GParamSpec *spec; + /** + * GArrowMapLookupOptions:query-key: + * + * The key to lookup in the map. + * + * Since: 23.0.0 + */ + spec = g_param_spec_object("query-key", + "Query key", + "The key to lookup in the map", + GARROW_TYPE_SCALAR, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_MAP_LOOKUP_OPTIONS_QUERY_KEY, spec); + + /** + * GArrowMapLookupOptions:occurrence: + * + * Whether to return the first, last, or all matching values. 
+ * + * Since: 23.0.0 + */ + spec = g_param_spec_enum("occurrence", + "Occurrence", + "Whether to return the first, last, or all matching values", + GARROW_TYPE_MAP_LOOKUP_OCCURRENCE, + static_cast(options.occurrence), + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, + PROP_MAP_LOOKUP_OPTIONS_OCCURRENCE, + spec); +} + +/** + * garrow_map_lookup_options_new: + * @query_key: (nullable): A #GArrowScalar to be looked up. + * @occurrence: A #GArrowMapLookupOccurrence. + * + * Returns: A newly created #GArrowMapLookupOptions. + * + * Since: 23.0.0 + */ +GArrowMapLookupOptions * +garrow_map_lookup_options_new(GArrowScalar *query_key, + GArrowMapLookupOccurrence occurrence) +{ + return GARROW_MAP_LOOKUP_OPTIONS(g_object_new(GARROW_TYPE_MAP_LOOKUP_OPTIONS, + "query-key", + query_key, + "occurrence", + occurrence, + NULL)); +} + G_END_DECLS arrow::Result @@ -7677,6 +7843,11 @@ garrow_function_options_new_raw(const arrow::compute::FunctionOptions *arrow_opt static_cast(arrow_options); auto options = garrow_list_flatten_options_new_raw(arrow_list_flatten_options); return GARROW_FUNCTION_OPTIONS(options); + } else if (arrow_type_name == "MapLookupOptions") { + const auto arrow_map_lookup_options = + static_cast(arrow_options); + auto options = garrow_map_lookup_options_new_raw(arrow_map_lookup_options); + return GARROW_FUNCTION_OPTIONS(options); } else { auto options = g_object_new(GARROW_TYPE_FUNCTION_OPTIONS, NULL); return GARROW_FUNCTION_OPTIONS(options); @@ -8370,3 +8541,28 @@ garrow_list_flatten_options_get_raw(GArrowListFlattenOptions *options) return static_cast( garrow_function_options_get_raw(GARROW_FUNCTION_OPTIONS(options))); } + +GArrowMapLookupOptions * +garrow_map_lookup_options_new_raw(const arrow::compute::MapLookupOptions *arrow_options) +{ + GArrowScalar *query_key = nullptr; + if (arrow_options->query_key) { + auto arrow_query_key = arrow_options->query_key; + query_key = garrow_scalar_new_raw(&arrow_query_key); + } + 
GArrowMapLookupOccurrence occurrence = + static_cast(arrow_options->occurrence); + return GARROW_MAP_LOOKUP_OPTIONS(g_object_new(GARROW_TYPE_MAP_LOOKUP_OPTIONS, + "query-key", + query_key, + "occurrence", + occurrence, + NULL)); +} + +arrow::compute::MapLookupOptions * +garrow_map_lookup_options_get_raw(GArrowMapLookupOptions *options) +{ + return static_cast( + garrow_function_options_get_raw(GARROW_FUNCTION_OPTIONS(options))); +} diff --git a/c_glib/arrow-glib/compute.h b/c_glib/arrow-glib/compute.h index f6cbd77ee57..c5166a663ec 100644 --- a/c_glib/arrow-glib/compute.h +++ b/c_glib/arrow-glib/compute.h @@ -1338,4 +1338,38 @@ GARROW_AVAILABLE_IN_23_0 GArrowListFlattenOptions * garrow_list_flatten_options_new(void); +/** + * GArrowMapLookupOccurrence: + * @GARROW_MAP_LOOKUP_OCCURRENCE_FIRST: Return the first matching value. + * @GARROW_MAP_LOOKUP_OCCURRENCE_LAST: Return the last matching value. + * @GARROW_MAP_LOOKUP_OCCURRENCE_ALL: Return all matching values. + * + * They correspond to the values of + * `arrow::compute::MapLookupOptions::Occurrence`. 
+ * + * Since: 23.0.0 + */ +typedef enum { + GARROW_MAP_LOOKUP_OCCURRENCE_FIRST, + GARROW_MAP_LOOKUP_OCCURRENCE_LAST, + GARROW_MAP_LOOKUP_OCCURRENCE_ALL, +} GArrowMapLookupOccurrence; + +#define GARROW_TYPE_MAP_LOOKUP_OPTIONS (garrow_map_lookup_options_get_type()) +GARROW_AVAILABLE_IN_23_0 +G_DECLARE_DERIVABLE_TYPE(GArrowMapLookupOptions, + garrow_map_lookup_options, + GARROW, + MAP_LOOKUP_OPTIONS, + GArrowFunctionOptions) +struct _GArrowMapLookupOptionsClass +{ + GArrowFunctionOptionsClass parent_class; +}; + +GARROW_AVAILABLE_IN_23_0 +GArrowMapLookupOptions * +garrow_map_lookup_options_new(GArrowScalar *query_key, + GArrowMapLookupOccurrence occurrence); + G_END_DECLS diff --git a/c_glib/arrow-glib/compute.hpp b/c_glib/arrow-glib/compute.hpp index 17f7369e7e8..fb5907a17ba 100644 --- a/c_glib/arrow-glib/compute.hpp +++ b/c_glib/arrow-glib/compute.hpp @@ -226,3 +226,8 @@ garrow_list_flatten_options_new_raw( const arrow::compute::ListFlattenOptions *arrow_options); arrow::compute::ListFlattenOptions * garrow_list_flatten_options_get_raw(GArrowListFlattenOptions *options); + +GArrowMapLookupOptions * +garrow_map_lookup_options_new_raw(const arrow::compute::MapLookupOptions *arrow_options); +arrow::compute::MapLookupOptions * +garrow_map_lookup_options_get_raw(GArrowMapLookupOptions *options); diff --git a/c_glib/test/test-map-lookup-options.rb b/c_glib/test/test-map-lookup-options.rb new file mode 100644 index 00000000000..3bb76ca6e26 --- /dev/null +++ b/c_glib/test/test-map-lookup-options.rb @@ -0,0 +1,73 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestMapLookupOptions < Test::Unit::TestCase + include Helper::Buildable + + def setup + @query_key = Arrow::Int32Scalar.new(1) + @options = Arrow::MapLookupOptions.new(@query_key, + Arrow::MapLookupOccurrence::FIRST) + end + + def test_query_key_property + assert_equal(@query_key, @options.query_key) + new_query_key = Arrow::Int32Scalar.new(2) + @options.query_key = new_query_key + assert_equal(new_query_key, @options.query_key) + end + + def test_occurrence_property + assert_equal(Arrow::MapLookupOccurrence::FIRST, @options.occurrence) + @options.occurrence = :last + assert_equal(Arrow::MapLookupOccurrence::LAST, @options.occurrence) + @options.occurrence = :all + assert_equal(Arrow::MapLookupOccurrence::ALL, @options.occurrence) + @options.occurrence = :first + assert_equal(Arrow::MapLookupOccurrence::FIRST, @options.occurrence) + end + + def test_map_lookup_function + map_array = build_map_array(Arrow::Int32DataType.new, + Arrow::StringDataType.new, + [[ + [1, "first_one"], + [2, "two"], + [1, nil], + [3, "three"], + [1, "second_one"], + [1, "last_one"], + ]]) + args = [Arrow::ArrayDatum.new(map_array)] + map_lookup_function = Arrow::Function.find("map_lookup") + @options.query_key = Arrow::Int32Scalar.new(1) + + @options.occurrence = :first + result = map_lookup_function.execute(args, @options).value + assert_equal(build_string_array(["first_one"]), result) + + @options.occurrence = :last + result = map_lookup_function.execute(args, @options).value + assert_equal(build_string_array(["last_one"]), result) + + @options.occurrence = :all + result 
= map_lookup_function.execute(args, @options).value + assert_equal(build_list_array(Arrow::StringDataType.new, + [["first_one", nil, "second_one", "last_one"]]), + result) + end +end From d3ba7694f13e090394c062ebabc78a23adb01b4d Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 22 Dec 2025 10:17:31 +0900 Subject: [PATCH 17/17] GH-48612: [Ruby] Add support for reading streaming format (#48613) ### Rationale for this change It's a streaming variant of IPC format. ### What changes are included in this PR? * Add `ArrowFormat::MessagePullReader` * Add `ArrowFormat::StreamingPullReader` * Add `ArrowFormat::StreamingReader` * Extract read schema and record batch features as `ArrowFormat::Readable` and reuse it in `ArrowFormat::StreamingPullParser` and `ArrowFormat::FileReader` ### Are these changes tested? Yes. We should add more error case tests later. ### Are there any user-facing changes? Yes. * GitHub Issue: #48612 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- ruby/red-arrow-format/lib/arrow-format.rb | 1 + .../lib/arrow-format/error.rb | 3 + .../lib/arrow-format/file-reader.rb | 338 ++----- .../lib/arrow-format/readable.rb | 242 +++++ .../lib/arrow-format/streaming-pull-reader.rb | 200 ++++ .../lib/arrow-format/streaming-reader.rb | 50 + .../red-arrow-format/test/test-file-reader.rb | 790 ---------------- ruby/red-arrow-format/test/test-reader.rb | 872 ++++++++++++++++++ 8 files changed, 1449 insertions(+), 1047 deletions(-) create mode 100644 ruby/red-arrow-format/lib/arrow-format/readable.rb create mode 100644 ruby/red-arrow-format/lib/arrow-format/streaming-pull-reader.rb create mode 100644 ruby/red-arrow-format/lib/arrow-format/streaming-reader.rb delete mode 100644 ruby/red-arrow-format/test/test-file-reader.rb create mode 100644 ruby/red-arrow-format/test/test-reader.rb diff --git a/ruby/red-arrow-format/lib/arrow-format.rb b/ruby/red-arrow-format/lib/arrow-format.rb index aea210bfb18..2c8ecbf55c7 100644 --- 
a/ruby/red-arrow-format/lib/arrow-format.rb +++ b/ruby/red-arrow-format/lib/arrow-format.rb @@ -16,4 +16,5 @@ # under the License. require_relative "arrow-format/file-reader" +require_relative "arrow-format/streaming-reader" require_relative "arrow-format/version" diff --git a/ruby/red-arrow-format/lib/arrow-format/error.rb b/ruby/red-arrow-format/lib/arrow-format/error.rb index 39b0b8af156..d73c4082beb 100644 --- a/ruby/red-arrow-format/lib/arrow-format/error.rb +++ b/ruby/red-arrow-format/lib/arrow-format/error.rb @@ -19,6 +19,9 @@ class Error < StandardError end class ReadError < Error + end + + class FileReadError < ReadError attr_reader :buffer def initialize(buffer, message) @buffer = buffer diff --git a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb index 29c7f5edd49..bf50bfd1cd3 100644 --- a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb +++ b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb @@ -15,57 +15,27 @@ # specific language governing permissions and limitations # under the License. 
-require_relative "array" -require_relative "error" -require_relative "field" -require_relative "record-batch" -require_relative "schema" -require_relative "type" +require_relative "streaming-reader" -require_relative "org/apache/arrow/flatbuf/binary" -require_relative "org/apache/arrow/flatbuf/bool" -require_relative "org/apache/arrow/flatbuf/date" -require_relative "org/apache/arrow/flatbuf/date_unit" -require_relative "org/apache/arrow/flatbuf/duration" -require_relative "org/apache/arrow/flatbuf/fixed_size_binary" -require_relative "org/apache/arrow/flatbuf/floating_point" +require_relative "org/apache/arrow/flatbuf/block" require_relative "org/apache/arrow/flatbuf/footer" -require_relative "org/apache/arrow/flatbuf/int" -require_relative "org/apache/arrow/flatbuf/interval" -require_relative "org/apache/arrow/flatbuf/interval_unit" -require_relative "org/apache/arrow/flatbuf/large_binary" -require_relative "org/apache/arrow/flatbuf/large_list" -require_relative "org/apache/arrow/flatbuf/large_utf8" -require_relative "org/apache/arrow/flatbuf/list" -require_relative "org/apache/arrow/flatbuf/map" -require_relative "org/apache/arrow/flatbuf/message" -require_relative "org/apache/arrow/flatbuf/null" -require_relative "org/apache/arrow/flatbuf/precision" -require_relative "org/apache/arrow/flatbuf/schema" -require_relative "org/apache/arrow/flatbuf/struct_" -require_relative "org/apache/arrow/flatbuf/time" -require_relative "org/apache/arrow/flatbuf/timestamp" -require_relative "org/apache/arrow/flatbuf/time_unit" -require_relative "org/apache/arrow/flatbuf/union" -require_relative "org/apache/arrow/flatbuf/union_mode" -require_relative "org/apache/arrow/flatbuf/utf8" module ArrowFormat class FileReader include Enumerable + include Readable - MAGIC = "ARROW1".b + MAGIC = "ARROW1".b.freeze MAGIC_BUFFER = IO::Buffer.for(MAGIC) START_MARKER_SIZE = MAGIC_BUFFER.size END_MARKER_SIZE = MAGIC_BUFFER.size - CONTINUATION = "\xFF\xFF\xFF\xFF".b - CONTINUATION_BUFFER = 
IO::Buffer.for(CONTINUATION) # # STREAMING_FORMAT_START_OFFSET = 8 - INT32_SIZE = 4 - FOOTER_SIZE_SIZE = INT32_SIZE - METADATA_SIZE_SIZE = INT32_SIZE + CONTINUATION_BUFFER = + IO::Buffer.for(MessagePullReader::CONTINUATION_STRING) + FOOTER_SIZE_FORMAT = :s32 + FOOTER_SIZE_SIZE = IO::Buffer.size_of(FOOTER_SIZE_FORMAT) def initialize(input) case input @@ -79,45 +49,75 @@ def initialize(input) validate @footer = read_footer + @record_batches = @footer.record_batches + @schema = read_schema(@footer.schema) end - def each - offset = STREAMING_FORMAT_START_OFFSET - schema = nil - continuation_size = CONTINUATION_BUFFER.size - # streaming format - loop do - continuation = @buffer.slice(offset, continuation_size) - unless continuation == CONTINUATION_BUFFER - raise ReadError.new(@buffer, "No valid continuation") - end - offset += continuation_size + def n_record_batches + @record_batches.size + end - metadata_size = @buffer.get_value(:u32, offset) - offset += METADATA_SIZE_SIZE - break if metadata_size.zero? + def read(i) + block = @record_batches[i] - metadata_data = @buffer.slice(offset, metadata_size) - offset += metadata_size - metadata = Org::Apache::Arrow::Flatbuf::Message.new(metadata_data) + offset = block.offset - body = @buffer.slice(offset, metadata.body_length) - header = metadata.header - case header - when Org::Apache::Arrow::Flatbuf::Schema - schema = read_schema(header) - when Org::Apache::Arrow::Flatbuf::RecordBatch - n_rows = header.length - columns = [] - nodes = header.nodes - buffers = header.buffers - schema.fields.each do |field| - columns << read_column(field, nodes, buffers, body) - end - yield(RecordBatch.new(schema, n_rows, columns)) - end + # If we can report property error information, we can use + # MessagePullReader here. 
+ # + # message_pull_reader = MessagePullReader.new do |message, body| + # return read_record_batch(message.header, @schema, body) + # end + # chunk = @buffer.slice(offset, + # MessagePullReader::CONTINUATION_SIZE + + # MessagePullReader::METADATA_LENGTH_SIZE + + # block.meta_data_length + + # block.body_length) + # message_pull_reader.consume(chunk) - offset += metadata.body_length + continuation_size = CONTINUATION_BUFFER.size + continuation = @buffer.slice(offset, continuation_size) + unless continuation == CONTINUATION_BUFFER + raise FileReadError.new(@buffer, + "Invalid continuation: #{i}: " + + continuation.inspect) + end + offset += continuation_size + + metadata_length_type = MessagePullReader::METADATA_LENGTH_TYPE + metadata_length_size = MessagePullReader::METADATA_LENGTH_SIZE + metadata_length = @buffer.get_value(metadata_length_type, offset) + expected_metadata_length = + block.meta_data_length - + continuation_size - + metadata_length_size + unless metadata_length == expected_metadata_length + raise FileReadError.new(@buffer, + "Invalid metadata length #{i}: " + + "expected:#{expected_metadata_length} " + + "actual:#{metadata_length}") + end + offset += metadata_length_size + + metadata = @buffer.slice(offset, metadata_length) + fb_message = Org::Apache::Arrow::Flatbuf::Message.new(metadata) + fb_header = fb_message.header + unless fb_header.is_a?(Org::Apache::Arrow::Flatbuf::RecordBatch) + raise FileReadError.new(@buffer, + "Not a record batch message: #{i}: " + + fb_header.class.name) + end + offset += metadata_length + + body = @buffer.slice(offset, block.body_length) + read_record_batch(fb_header, @schema, body) + end + + def each + return to_enum(__method__) {n_record_batches} unless block_given? 
+ + @record_batches.size.times do |i| + yield(read(i)) end end @@ -127,204 +127,28 @@ def validate FOOTER_SIZE_SIZE + END_MARKER_SIZE if @buffer.size < minimum_size - raise ReadError.new(@buffer, - "Input must be larger than or equal to " + - "#{minimum_size}: #{@buffer.size}") + raise FileReadError.new(@buffer, + "Input must be larger than or equal to " + + "#{minimum_size}: #{@buffer.size}") end start_marker = @buffer.slice(0, START_MARKER_SIZE) if start_marker != MAGIC_BUFFER - raise ReadError.new(@buffer, "No start marker") + raise FileReadError.new(@buffer, "No start marker") end - end_marker = @buffer.slice(@buffer.size - END_MARKER_SIZE, END_MARKER_SIZE) + end_marker = @buffer.slice(@buffer.size - END_MARKER_SIZE, + END_MARKER_SIZE) if end_marker != MAGIC_BUFFER - raise ReadError.new(@buffer, "No end marker") + raise FileReadError.new(@buffer, "No end marker") end end def read_footer footer_size_offset = @buffer.size - END_MARKER_SIZE - FOOTER_SIZE_SIZE - footer_size = @buffer.get_value(:u32, footer_size_offset) - footer_data = @buffer.slice(footer_size_offset - footer_size, footer_size) + footer_size = @buffer.get_value(FOOTER_SIZE_FORMAT, footer_size_offset) + footer_data = @buffer.slice(footer_size_offset - footer_size, + footer_size) Org::Apache::Arrow::Flatbuf::Footer.new(footer_data) end - - def read_field(fb_field) - fb_type = fb_field.type - case fb_type - when Org::Apache::Arrow::Flatbuf::Null - type = NullType.singleton - when Org::Apache::Arrow::Flatbuf::Bool - type = BooleanType.singleton - when Org::Apache::Arrow::Flatbuf::Int - case fb_type.bit_width - when 8 - if fb_type.signed? - type = Int8Type.singleton - else - type = UInt8Type.singleton - end - when 16 - if fb_type.signed? - type = Int16Type.singleton - else - type = UInt16Type.singleton - end - when 32 - if fb_type.signed? - type = Int32Type.singleton - else - type = UInt32Type.singleton - end - when 64 - if fb_type.signed? 
- type = Int64Type.singleton - else - type = UInt64Type.singleton - end - end - when Org::Apache::Arrow::Flatbuf::FloatingPoint - case fb_type.precision - when Org::Apache::Arrow::Flatbuf::Precision::SINGLE - type = Float32Type.singleton - when Org::Apache::Arrow::Flatbuf::Precision::DOUBLE - type = Float64Type.singleton - end - when Org::Apache::Arrow::Flatbuf::Date - case fb_type.unit - when Org::Apache::Arrow::Flatbuf::DateUnit::DAY - type = Date32Type.singleton - when Org::Apache::Arrow::Flatbuf::DateUnit::MILLISECOND - type = Date64Type.singleton - end - when Org::Apache::Arrow::Flatbuf::Time - case fb_type.bit_width - when 32 - case fb_type.unit - when Org::Apache::Arrow::Flatbuf::TimeUnit::SECOND - type = Time32Type.new(:second) - when Org::Apache::Arrow::Flatbuf::TimeUnit::MILLISECOND - type = Time32Type.new(:millisecond) - end - when 64 - case fb_type.unit - when Org::Apache::Arrow::Flatbuf::TimeUnit::MICROSECOND - type = Time64Type.new(:microsecond) - when Org::Apache::Arrow::Flatbuf::TimeUnit::NANOSECOND - type = Time64Type.new(:nanosecond) - end - end - when Org::Apache::Arrow::Flatbuf::Timestamp - unit = fb_type.unit.name.downcase.to_sym - type = TimestampType.new(unit, fb_type.timezone) - when Org::Apache::Arrow::Flatbuf::Interval - case fb_type.unit - when Org::Apache::Arrow::Flatbuf::IntervalUnit::YEAR_MONTH - type = YearMonthIntervalType.new - when Org::Apache::Arrow::Flatbuf::IntervalUnit::DAY_TIME - type = DayTimeIntervalType.new - when Org::Apache::Arrow::Flatbuf::IntervalUnit::MONTH_DAY_NANO - type = MonthDayNanoIntervalType.new - end - when Org::Apache::Arrow::Flatbuf::Duration - unit = fb_type.unit.name.downcase.to_sym - type = DurationType.new(unit) - when Org::Apache::Arrow::Flatbuf::List - type = ListType.new(read_field(fb_field.children[0])) - when Org::Apache::Arrow::Flatbuf::LargeList - type = LargeListType.new(read_field(fb_field.children[0])) - when Org::Apache::Arrow::Flatbuf::Struct - children = fb_field.children.collect {|child| 
read_field(child)} - type = StructType.new(children) - when Org::Apache::Arrow::Flatbuf::Union - children = fb_field.children.collect {|child| read_field(child)} - type_ids = fb_type.type_ids - case fb_type.mode - when Org::Apache::Arrow::Flatbuf::UnionMode::DENSE - type = DenseUnionType.new(children, type_ids) - when Org::Apache::Arrow::Flatbuf::UnionMode::SPARSE - type = SparseUnionType.new(children, type_ids) - end - when Org::Apache::Arrow::Flatbuf::Map - type = MapType.new(read_field(fb_field.children[0])) - when Org::Apache::Arrow::Flatbuf::Binary - type = BinaryType.singleton - when Org::Apache::Arrow::Flatbuf::LargeBinary - type = LargeBinaryType.singleton - when Org::Apache::Arrow::Flatbuf::Utf8 - type = UTF8Type.singleton - when Org::Apache::Arrow::Flatbuf::LargeUtf8 - type = LargeUTF8Type.singleton - when Org::Apache::Arrow::Flatbuf::FixedSizeBinary - type = FixedSizeBinaryType.new(fb_type.byte_width) - end - Field.new(fb_field.name, type, fb_field.nullable?) - end - - def read_schema(fb_schema) - fields = fb_schema.fields.collect do |fb_field| - read_field(fb_field) - end - Schema.new(fields) - end - - def read_column(field, nodes, buffers, body) - node = nodes.shift - length = node.length - - return field.type.build_array(length) if field.type.is_a?(NullType) - - validity_buffer = buffers.shift - if validity_buffer.length.zero? 
- validity = nil - else - validity = body.slice(validity_buffer.offset, validity_buffer.length) - end - - case field.type - when BooleanType, - NumberType, - TemporalType - values_buffer = buffers.shift - values = body.slice(values_buffer.offset, values_buffer.length) - field.type.build_array(length, validity, values) - when VariableSizeBinaryType - offsets_buffer = buffers.shift - values_buffer = buffers.shift - offsets = body.slice(offsets_buffer.offset, offsets_buffer.length) - values = body.slice(values_buffer.offset, values_buffer.length) - field.type.build_array(length, validity, offsets, values) - when FixedSizeBinaryType - values_buffer = buffers.shift - values = body.slice(values_buffer.offset, values_buffer.length) - field.type.build_array(length, validity, values) - when VariableSizeListType - offsets_buffer = buffers.shift - offsets = body.slice(offsets_buffer.offset, offsets_buffer.length) - child = read_column(field.type.child, nodes, buffers, body) - field.type.build_array(length, validity, offsets, child) - when StructType - children = field.type.children.collect do |child| - read_column(child, nodes, buffers, body) - end - field.type.build_array(length, validity, children) - when DenseUnionType - # dense union type doesn't have validity. - types = validity - offsets_buffer = buffers.shift - offsets = body.slice(offsets_buffer.offset, offsets_buffer.length) - children = field.type.children.collect do |child| - read_column(child, nodes, buffers, body) - end - field.type.build_array(length, types, offsets, children) - when SparseUnionType - # sparse union type doesn't have validity. 
- types = validity - children = field.type.children.collect do |child| - read_column(child, nodes, buffers, body) - end - field.type.build_array(length, types, children) - end - end end end diff --git a/ruby/red-arrow-format/lib/arrow-format/readable.rb b/ruby/red-arrow-format/lib/arrow-format/readable.rb new file mode 100644 index 00000000000..2d64d5387ff --- /dev/null +++ b/ruby/red-arrow-format/lib/arrow-format/readable.rb @@ -0,0 +1,242 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +require_relative "array" +require_relative "field" +require_relative "record-batch" +require_relative "schema" +require_relative "type" + +require_relative "org/apache/arrow/flatbuf/binary" +require_relative "org/apache/arrow/flatbuf/bool" +require_relative "org/apache/arrow/flatbuf/date" +require_relative "org/apache/arrow/flatbuf/date_unit" +require_relative "org/apache/arrow/flatbuf/duration" +require_relative "org/apache/arrow/flatbuf/fixed_size_binary" +require_relative "org/apache/arrow/flatbuf/floating_point" +require_relative "org/apache/arrow/flatbuf/int" +require_relative "org/apache/arrow/flatbuf/interval" +require_relative "org/apache/arrow/flatbuf/interval_unit" +require_relative "org/apache/arrow/flatbuf/large_binary" +require_relative "org/apache/arrow/flatbuf/large_list" +require_relative "org/apache/arrow/flatbuf/large_utf8" +require_relative "org/apache/arrow/flatbuf/list" +require_relative "org/apache/arrow/flatbuf/map" +require_relative "org/apache/arrow/flatbuf/message" +require_relative "org/apache/arrow/flatbuf/null" +require_relative "org/apache/arrow/flatbuf/precision" +require_relative "org/apache/arrow/flatbuf/schema" +require_relative "org/apache/arrow/flatbuf/struct_" +require_relative "org/apache/arrow/flatbuf/time" +require_relative "org/apache/arrow/flatbuf/timestamp" +require_relative "org/apache/arrow/flatbuf/time_unit" +require_relative "org/apache/arrow/flatbuf/union" +require_relative "org/apache/arrow/flatbuf/union_mode" +require_relative "org/apache/arrow/flatbuf/utf8" + +module ArrowFormat + module Readable + private + def read_schema(fb_schema) + fields = fb_schema.fields.collect do |fb_field| + read_field(fb_field) + end + Schema.new(fields) + end + + def read_field(fb_field) + fb_type = fb_field.type + case fb_type + when Org::Apache::Arrow::Flatbuf::Null + type = NullType.singleton + when Org::Apache::Arrow::Flatbuf::Bool + type = BooleanType.singleton + when Org::Apache::Arrow::Flatbuf::Int + case fb_type.bit_width + 
when 8 + if fb_type.signed? + type = Int8Type.singleton + else + type = UInt8Type.singleton + end + when 16 + if fb_type.signed? + type = Int16Type.singleton + else + type = UInt16Type.singleton + end + when 32 + if fb_type.signed? + type = Int32Type.singleton + else + type = UInt32Type.singleton + end + when 64 + if fb_type.signed? + type = Int64Type.singleton + else + type = UInt64Type.singleton + end + end + when Org::Apache::Arrow::Flatbuf::FloatingPoint + case fb_type.precision + when Org::Apache::Arrow::Flatbuf::Precision::SINGLE + type = Float32Type.singleton + when Org::Apache::Arrow::Flatbuf::Precision::DOUBLE + type = Float64Type.singleton + end + when Org::Apache::Arrow::Flatbuf::Date + case fb_type.unit + when Org::Apache::Arrow::Flatbuf::DateUnit::DAY + type = Date32Type.singleton + when Org::Apache::Arrow::Flatbuf::DateUnit::MILLISECOND + type = Date64Type.singleton + end + when Org::Apache::Arrow::Flatbuf::Time + case fb_type.bit_width + when 32 + case fb_type.unit + when Org::Apache::Arrow::Flatbuf::TimeUnit::SECOND + type = Time32Type.new(:second) + when Org::Apache::Arrow::Flatbuf::TimeUnit::MILLISECOND + type = Time32Type.new(:millisecond) + end + when 64 + case fb_type.unit + when Org::Apache::Arrow::Flatbuf::TimeUnit::MICROSECOND + type = Time64Type.new(:microsecond) + when Org::Apache::Arrow::Flatbuf::TimeUnit::NANOSECOND + type = Time64Type.new(:nanosecond) + end + end + when Org::Apache::Arrow::Flatbuf::Timestamp + unit = fb_type.unit.name.downcase.to_sym + type = TimestampType.new(unit, fb_type.timezone) + when Org::Apache::Arrow::Flatbuf::Interval + case fb_type.unit + when Org::Apache::Arrow::Flatbuf::IntervalUnit::YEAR_MONTH + type = YearMonthIntervalType.new + when Org::Apache::Arrow::Flatbuf::IntervalUnit::DAY_TIME + type = DayTimeIntervalType.new + when Org::Apache::Arrow::Flatbuf::IntervalUnit::MONTH_DAY_NANO + type = MonthDayNanoIntervalType.new + end + when Org::Apache::Arrow::Flatbuf::Duration + unit = 
fb_type.unit.name.downcase.to_sym + type = DurationType.new(unit) + when Org::Apache::Arrow::Flatbuf::List + type = ListType.new(read_field(fb_field.children[0])) + when Org::Apache::Arrow::Flatbuf::LargeList + type = LargeListType.new(read_field(fb_field.children[0])) + when Org::Apache::Arrow::Flatbuf::Struct + children = fb_field.children.collect {|child| read_field(child)} + type = StructType.new(children) + when Org::Apache::Arrow::Flatbuf::Union + children = fb_field.children.collect {|child| read_field(child)} + type_ids = fb_type.type_ids + case fb_type.mode + when Org::Apache::Arrow::Flatbuf::UnionMode::DENSE + type = DenseUnionType.new(children, type_ids) + when Org::Apache::Arrow::Flatbuf::UnionMode::SPARSE + type = SparseUnionType.new(children, type_ids) + end + when Org::Apache::Arrow::Flatbuf::Map + type = MapType.new(read_field(fb_field.children[0])) + when Org::Apache::Arrow::Flatbuf::Binary + type = BinaryType.singleton + when Org::Apache::Arrow::Flatbuf::LargeBinary + type = LargeBinaryType.singleton + when Org::Apache::Arrow::Flatbuf::Utf8 + type = UTF8Type.singleton + when Org::Apache::Arrow::Flatbuf::LargeUtf8 + type = LargeUTF8Type.singleton + when Org::Apache::Arrow::Flatbuf::FixedSizeBinary + type = FixedSizeBinaryType.new(fb_type.byte_width) + end + Field.new(fb_field.name, type, fb_field.nullable?) + end + + def read_record_batch(fb_record_batch, schema, body) + n_rows = fb_record_batch.length + nodes = fb_record_batch.nodes + buffers = fb_record_batch.buffers + columns = @schema.fields.collect do |field| + read_column(field, nodes, buffers, body) + end + RecordBatch.new(schema, n_rows, columns) + end + + def read_column(field, nodes, buffers, body) + node = nodes.shift + length = node.length + + return field.type.build_array(length) if field.type.is_a?(NullType) + + validity_buffer = buffers.shift + if validity_buffer.length.zero? 
+ validity = nil + else + validity = body.slice(validity_buffer.offset, validity_buffer.length) + end + + case field.type + when BooleanType, + NumberType, + TemporalType + values_buffer = buffers.shift + values = body.slice(values_buffer.offset, values_buffer.length) + field.type.build_array(length, validity, values) + when VariableSizeBinaryType + offsets_buffer = buffers.shift + values_buffer = buffers.shift + offsets = body.slice(offsets_buffer.offset, offsets_buffer.length) + values = body.slice(values_buffer.offset, values_buffer.length) + field.type.build_array(length, validity, offsets, values) + when FixedSizeBinaryType + values_buffer = buffers.shift + values = body.slice(values_buffer.offset, values_buffer.length) + field.type.build_array(length, validity, values) + when VariableSizeListType + offsets_buffer = buffers.shift + offsets = body.slice(offsets_buffer.offset, offsets_buffer.length) + child = read_column(field.type.child, nodes, buffers, body) + field.type.build_array(length, validity, offsets, child) + when StructType + children = field.type.children.collect do |child| + read_column(child, nodes, buffers, body) + end + field.type.build_array(length, validity, children) + when DenseUnionType + # dense union type doesn't have validity. + types = validity + offsets_buffer = buffers.shift + offsets = body.slice(offsets_buffer.offset, offsets_buffer.length) + children = field.type.children.collect do |child| + read_column(child, nodes, buffers, body) + end + field.type.build_array(length, types, offsets, children) + when SparseUnionType + # sparse union type doesn't have validity. 
+ types = validity + children = field.type.children.collect do |child| + read_column(child, nodes, buffers, body) + end + field.type.build_array(length, types, children) + end + end + end +end diff --git a/ruby/red-arrow-format/lib/arrow-format/streaming-pull-reader.rb b/ruby/red-arrow-format/lib/arrow-format/streaming-pull-reader.rb new file mode 100644 index 00000000000..ae231fccbc6 --- /dev/null +++ b/ruby/red-arrow-format/lib/arrow-format/streaming-pull-reader.rb @@ -0,0 +1,200 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +require_relative "array" +require_relative "error" +require_relative "field" +require_relative "readable" +require_relative "record-batch" +require_relative "schema" +require_relative "type" + +module ArrowFormat + class MessagePullReader + CONTINUATION_TYPE = :s32 + CONTINUATION_SIZE = IO::Buffer.size_of(CONTINUATION_TYPE) + CONTINUATION_STRING = "\xFF\xFF\xFF\xFF".b.freeze + CONTINUATION_INT32 = -1 + METADATA_LENGTH_TYPE = :s32 + METADATA_LENGTH_SIZE = IO::Buffer.size_of(METADATA_LENGTH_TYPE) + + def initialize(&on_read) + @on_read = on_read + @buffer = IO::Buffer.new(0) + @metadata_length = nil + @body_length = nil + @state = :initial + end + + def next_required_size + case @state + when :initial + CONTINUATION_SIZE + when :metadata_length + METADATA_LENGTH_SIZE + when :metadata + @metadata_length + when :body + @body_length + when :eos + 0 + end + end + + def eos? + @state == :eos + end + + def consume(chunk) + return if eos? + + if @buffer.size.zero? + target = chunk + else + @buffer.resize(@buffer.size + chunk.size) + @buffer.copy(chunk) + target = @buffer + end + + loop do + next_size = next_required_size + break if next_size.zero? 
+ + if target.size < next_size + @buffer.resize(target.size) if @buffer.size < target.size + @buffer.copy(target) + @buffer.resize(target.size) + return + end + + case @state + when :initial + consume_initial(target) + when :metadata_length + consume_metadata_length(target) + when :metadata + consume_metadata(target) + when :body + consume_body(target) + end + break if target.size == next_size + + target = target.slice(next_size) + end + end + + private + def consume_initial(target) + continuation = target.get_value(CONTINUATION_TYPE, 0) + unless continuation == CONTINUATION_INT32 + raise ReadError.new("Invalid continuation token: " + + continuation.inspect) + end + @state = :metadata_length + end + + def consume_metadata_length(target) + length = target.get_value(METADATA_LENGTH_TYPE, 0) + if length < 0 + raise ReadError.new("Negative metadata length: " + + length.inspect) + end + if length == 0 + @state = :eos + else + @metadata_length = length + @state = :metadata + end + end + + def consume_metadata(target) + metadata_buffer = target.slice(0, @metadata_length) + @message = Org::Apache::Arrow::Flatbuf::Message.new(metadata_buffer) + @body_length = @message.body_length + if @body_length < 0 + raise ReadError.new("Negative body length: " + + @body_length.inspect) + end + @state = :body + consume_body if @body_length.zero? + end + + def consume_body(target=nil) + body = target&.slice(0, @body_length) + @on_read.call(@message, body) + @state = :initial + end + end + + class StreamingPullReader + include Readable + + attr_reader :schema + def initialize(&on_read) + @on_read = on_read + @message_pull_reader = MessagePullReader.new do |message, body| + process_message(message, body) + end + @state = :schema + @schema = nil + end + + def next_required_size + @message_pull_reader.next_required_size + end + + def eos? + @message_pull_reader.eos? 
+ end + + def consume(chunk) + @message_pull_reader.consume(chunk) + end + + private + def process_message(message, body) + case @state + when :schema + process_schema_message(message, body) + when :record_batch + process_record_batch_message(message, body) + end + end + + def process_schema_message(message, body) + header = message.header + unless header.is_a?(Org::Apache::Arrow::Flatbuf::Schema) + raise ReadError.new("Not a schema message: " + + header.inspect) + end + + @schema = read_schema(header) + # TODO: initial dictionaries support + @state = :record_batch + end + + def process_record_batch_message(message, body) + header = message.header + unless header.is_a?(Org::Apache::Arrow::Flatbuf::RecordBatch) + raise ReadError.new("Not a record batch message: " + + header.inspect) + end + + @on_read.call(read_record_batch(header, @schema, body)) + end + end +end diff --git a/ruby/red-arrow-format/lib/arrow-format/streaming-reader.rb b/ruby/red-arrow-format/lib/arrow-format/streaming-reader.rb new file mode 100644 index 00000000000..f11972c67a2 --- /dev/null +++ b/ruby/red-arrow-format/lib/arrow-format/streaming-reader.rb @@ -0,0 +1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +require_relative "streaming-pull-reader" + +module ArrowFormat + class StreamingReader + include Enumerable + + attr_reader :schema + def initialize(input) + @input = input + @schema = nil + end + + def each + return to_enum(__method__) unless block_given? + + reader = StreamingPullReader.new do |record_batch| + @schema ||= reader.schema + yield(record_batch) + end + + buffer = "".b + loop do + next_size = reader.next_required_size + break if next_size.zero? + + next_chunk = @input.read(next_size, buffer) + break if next_chunk.nil? + + reader.consume(IO::Buffer.for(next_chunk)) + end + end + end +end diff --git a/ruby/red-arrow-format/test/test-file-reader.rb b/ruby/red-arrow-format/test/test-file-reader.rb deleted file mode 100644 index 6198f0cb96d..00000000000 --- a/ruby/red-arrow-format/test/test-file-reader.rb +++ /dev/null @@ -1,790 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -class TestFileReader < Test::Unit::TestCase - def setup - Dir.mktmpdir do |tmp_dir| - table = Arrow::Table.new(value: build_array) - @path = File.join(tmp_dir, "data.arrow") - table.save(@path) - File.open(@path, "rb") do |input| - @reader = ArrowFormat::FileReader.new(input) - yield - @reader = nil - end - GC.start - end - end - - def read - @reader.to_a.collect do |record_batch| - record_batch.to_h.tap do |hash| - hash.each do |key, value| - hash[key] = value.to_a - end - end - end - end - - def type - @type ||= @reader.first.schema.fields[0].type - end - - sub_test_case("Null") do - def build_array - Arrow::NullArray.new(3) - end - - def test_read - assert_equal([{"value" => [nil, nil, nil]}], - read) - end - end - - sub_test_case("Boolean") do - def build_array - Arrow::BooleanArray.new([true, nil, false]) - end - - def test_read - assert_equal([{"value" => [true, nil, false]}], - read) - end - end - - sub_test_case("Int8") do - def build_array - Arrow::Int8Array.new([-128, nil, 127]) - end - - def test_read - assert_equal([{"value" => [-128, nil, 127]}], - read) - end - end - - sub_test_case("UInt8") do - def build_array - Arrow::UInt8Array.new([0, nil, 255]) - end - - def test_read - assert_equal([{"value" => [0, nil, 255]}], - read) - end - end - - sub_test_case("Int16") do - def build_array - Arrow::Int16Array.new([-32768, nil, 32767]) - end - - def test_read - assert_equal([{"value" => [-32768, nil, 32767]}], - read) - end - end - - sub_test_case("UInt16") do - def build_array - Arrow::UInt16Array.new([0, nil, 65535]) - end - - def test_read - assert_equal([{"value" => [0, nil, 65535]}], - read) - end - end - - sub_test_case("Int32") do - def build_array - Arrow::Int32Array.new([-2147483648, nil, 2147483647]) - end - - def test_read - assert_equal([{"value" => [-2147483648, nil, 2147483647]}], - read) - end - end - - sub_test_case("UInt32") do - def build_array - Arrow::UInt32Array.new([0, nil, 4294967295]) - end - - def test_read - 
assert_equal([{"value" => [0, nil, 4294967295]}], - read) - end - end - - sub_test_case("Int64") do - def build_array - Arrow::Int64Array.new([ - -9223372036854775808, - nil, - 9223372036854775807 - ]) - end - - def test_read - assert_equal([ - { - "value" => [ - -9223372036854775808, - nil, - 9223372036854775807 - ] - } - ], - read) - end - end - - sub_test_case("UInt64") do - def build_array - Arrow::UInt64Array.new([0, nil, 18446744073709551615]) - end - - def test_read - assert_equal([{"value" => [0, nil, 18446744073709551615]}], - read) - end - end - - sub_test_case("Float32") do - def build_array - Arrow::FloatArray.new([-0.5, nil, 0.5]) - end - - def test_read - assert_equal([{"value" => [-0.5, nil, 0.5]}], - read) - end - end - - sub_test_case("Float64") do - def build_array - Arrow::DoubleArray.new([-0.5, nil, 0.5]) - end - - def test_read - assert_equal([{"value" => [-0.5, nil, 0.5]}], - read) - end - end - - sub_test_case("Date32") do - def setup(&block) - @date_2017_08_28 = 17406 - @date_2025_12_09 = 20431 - super(&block) - end - - def build_array - Arrow::Date32Array.new([@date_2017_08_28, nil, @date_2025_12_09]) - end - - def test_read - assert_equal([{"value" => [@date_2017_08_28, nil, @date_2025_12_09]}], - read) - end - end - - sub_test_case("Date64") do - def setup(&block) - @date_2017_08_28_00_00_00 = 1503878400000 - @date_2025_12_09_00_00_00 = 1765324800000 - super(&block) - end - - def build_array - Arrow::Date64Array.new([ - @date_2017_08_28_00_00_00, - nil, - @date_2025_12_09_00_00_00, - ]) - end - - def test_read - assert_equal([ - { - "value" => [ - @date_2017_08_28_00_00_00, - nil, - @date_2025_12_09_00_00_00, - ], - }, - ], - read) - end - end - - sub_test_case("Time32(:second)") do - def setup(&block) - @time_00_00_10 = 10 - @time_00_01_10 = 60 + 10 - super(&block) - end - - def build_array - Arrow::Time32Array.new(:second, [@time_00_00_10, nil, @time_00_01_10]) - end - - def test_read - assert_equal([{"value" => [@time_00_00_10, nil, 
@time_00_01_10]}], - read) - end - - def test_type - assert_equal(:second, type.unit) - end - end - - sub_test_case("Time32(:millisecond)") do - def setup(&block) - @time_00_00_10_000 = 10 * 1000 - @time_00_01_10_000 = (60 + 10) * 1000 - super(&block) - end - - def build_array - Arrow::Time32Array.new(:milli, - [@time_00_00_10_000, nil, @time_00_01_10_000]) - end - - def test_read - assert_equal([{"value" => [@time_00_00_10_000, nil, @time_00_01_10_000]}], - read) - end - - def test_type - assert_equal(:millisecond, type.unit) - end - end - - sub_test_case("Time64(:microsecond)") do - def setup(&block) - @time_00_00_10_000_000 = 10 * 1_000_000 - @time_00_01_10_000_000 = (60 + 10) * 1_000_000 - super(&block) - end - - def build_array - Arrow::Time64Array.new(:micro, - [ - @time_00_00_10_000_000, - nil, - @time_00_01_10_000_000, - ]) - end - - def test_read - assert_equal([ - { - "value" => [ - @time_00_00_10_000_000, - nil, - @time_00_01_10_000_000, - ], - }, - ], - read) - end - - def test_type - assert_equal(:microsecond, type.unit) - end - end - - sub_test_case("Time64(:nanosecond)") do - def setup(&block) - @time_00_00_10_000_000_000 = 10 * 1_000_000_000 - @time_00_01_10_000_000_000 = (60 + 10) * 1_000_000_000 - super(&block) - end - - def build_array - Arrow::Time64Array.new(:nano, - [ - @time_00_00_10_000_000_000, - nil, - @time_00_01_10_000_000_000, - ]) - end - - def test_read - assert_equal([ - { - "value" => [ - @time_00_00_10_000_000_000, - nil, - @time_00_01_10_000_000_000, - ], - }, - ], - read) - end - - def test_type - assert_equal(:nanosecond, type.unit) - end - end - - sub_test_case("Timestamp(:second)") do - def setup(&block) - @timestamp_2019_11_18_00_09_11 = 1574003351 - @timestamp_2025_12_16_05_33_58 = 1765863238 - super(&block) - end - - def build_array - Arrow::TimestampArray.new(:second, - [ - @timestamp_2019_11_18_00_09_11, - nil, - @timestamp_2025_12_16_05_33_58, - ]) - end - - def test_read - assert_equal([ - { - "value" => [ - 
@timestamp_2019_11_18_00_09_11, - nil, - @timestamp_2025_12_16_05_33_58, - ], - }, - ], - read) - end - end - - sub_test_case("Timestamp(:millisecond)") do - def setup(&block) - @timestamp_2019_11_18_00_09_11 = 1574003351 * 1_000 - @timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000 - super(&block) - end - - def build_array - Arrow::TimestampArray.new(:milli, - [ - @timestamp_2019_11_18_00_09_11, - nil, - @timestamp_2025_12_16_05_33_58, - ]) - end - - def test_read - assert_equal([ - { - "value" => [ - @timestamp_2019_11_18_00_09_11, - nil, - @timestamp_2025_12_16_05_33_58, - ], - }, - ], - read) - end - end - - sub_test_case("Timestamp(:microsecond)") do - def setup(&block) - @timestamp_2019_11_18_00_09_11 = 1574003351 * 1_000_000 - @timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000_000 - super(&block) - end - - def build_array - Arrow::TimestampArray.new(:micro, - [ - @timestamp_2019_11_18_00_09_11, - nil, - @timestamp_2025_12_16_05_33_58, - ]) - end - - def test_read - assert_equal([ - { - "value" => [ - @timestamp_2019_11_18_00_09_11, - nil, - @timestamp_2025_12_16_05_33_58, - ], - }, - ], - read) - end - end - - sub_test_case("Timestamp(:nanosecond)") do - def setup(&block) - @timestamp_2019_11_18_00_09_11 = 1574003351 * 1_000_000_000 - @timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000_000_000 - super(&block) - end - - def build_array - Arrow::TimestampArray.new(:nano, - [ - @timestamp_2019_11_18_00_09_11, - nil, - @timestamp_2025_12_16_05_33_58, - ]) - end - - def test_read - assert_equal([ - { - "value" => [ - @timestamp_2019_11_18_00_09_11, - nil, - @timestamp_2025_12_16_05_33_58, - ], - }, - ], - read) - end - end - - sub_test_case("Timestamp(timezone)") do - def setup(&block) - @timezone = "UTC" - @timestamp_2019_11_18_00_09_11 = 1574003351 - @timestamp_2025_12_16_05_33_58 = 1765863238 - super(&block) - end - - def build_array - data_type = Arrow::TimestampDataType.new(:second, @timezone) - Arrow::TimestampArray.new(data_type, - [ - 
@timestamp_2019_11_18_00_09_11, - nil, - @timestamp_2025_12_16_05_33_58, - ]) - end - - def test_type - assert_equal([:second, @timezone], - [type.unit, type.timezone]) - end - end - - sub_test_case("YearMonthInterval") do - def build_array - Arrow::MonthIntervalArray.new([0, nil, 100]) - end - - def test_read - assert_equal([{"value" => [0, nil, 100]}], - read) - end - end - - sub_test_case("DayTimeInterval") do - def build_array - Arrow::DayTimeIntervalArray.new([ - {day: 1, millisecond: 100}, - nil, - {day: 3, millisecond: 300}, - ]) - end - - def test_read - assert_equal([ - { - "value" => [ - [1, 100], - nil, - [3, 300], - ], - }, - ], - read) - end - end - - sub_test_case("MonthDayNanoInterval") do - def build_array - Arrow::MonthDayNanoIntervalArray.new([ - { - month: 1, - day: 1, - nanosecond: 100, - }, - nil, - { - month: 3, - day: 3, - nanosecond: 300, - }, - ]) - end - - def test_read - assert_equal([ - { - "value" => [ - [1, 1, 100], - nil, - [3, 3, 300], - ], - }, - ], - read) - end - end - - sub_test_case("Duration(:second)") do - def build_array - Arrow::DurationArray.new(:second, [0, nil, 100]) - end - - def test_read - assert_equal([{"value" => [0, nil, 100]}], - read) - end - - def test_type - assert_equal(:second, type.unit) - end - end - - sub_test_case("Duration(:millisecond)") do - def build_array - Arrow::DurationArray.new(:milli, [0, nil, 100_000]) - end - - def test_read - assert_equal([{"value" => [0, nil, 100_000]}], - read) - end - - def test_type - assert_equal(:millisecond, type.unit) - end - end - - sub_test_case("Duration(:microsecond)") do - def build_array - Arrow::DurationArray.new(:micro, [0, nil, 100_000_000]) - end - - def test_read - assert_equal([{"value" => [0, nil, 100_000_000]}], - read) - end - - def test_type - assert_equal(:microsecond, type.unit) - end - end - - sub_test_case("Duration(:nanosecond)") do - def build_array - Arrow::DurationArray.new(:nano, [0, nil, 100_000_000_000]) - end - - def test_read - 
assert_equal([{"value" => [0, nil, 100_000_000_000]}], - read) - end - - def test_type - assert_equal(:nanosecond, type.unit) - end - end - - sub_test_case("Binary") do - def build_array - Arrow::BinaryArray.new(["Hello".b, nil, "World".b]) - end - - def test_read - assert_equal([{"value" => ["Hello".b, nil, "World".b]}], - read) - end - end - - sub_test_case("LargeBinary") do - def build_array - Arrow::LargeBinaryArray.new(["Hello".b, nil, "World".b]) - end - - def test_read - assert_equal([{"value" => ["Hello".b, nil, "World".b]}], - read) - end - end - - sub_test_case("UTF8") do - def build_array - Arrow::StringArray.new(["Hello", nil, "World"]) - end - - def test_read - assert_equal([{"value" => ["Hello", nil, "World"]}], - read) - end - end - - sub_test_case("LargeUTF8") do - def build_array - Arrow::LargeStringArray.new(["Hello", nil, "World"]) - end - - def test_read - assert_equal([{"value" => ["Hello", nil, "World"]}], - read) - end - end - - sub_test_case("FixedSizeBinary") do - def build_array - data_type = Arrow::FixedSizeBinaryDataType.new(4) - Arrow::FixedSizeBinaryArray.new(data_type, ["0124".b, nil, "abcd".b]) - end - - def test_read - assert_equal([{"value" => ["0124".b, nil, "abcd".b]}], - read) - end - end - - sub_test_case("List") do - def build_array - data_type = Arrow::ListDataType.new(name: "count", type: :int8) - Arrow::ListArray.new(data_type, [[-128, 127], nil, [-1, 0, 1]]) - end - - def test_read - assert_equal([{"value" => [[-128, 127], nil, [-1, 0, 1]]}], - read) - end - end - - sub_test_case("LargeList") do - def build_array - data_type = Arrow::LargeListDataType.new(name: "count", type: :int8) - Arrow::LargeListArray.new(data_type, [[-128, 127], nil, [-1, 0, 1]]) - end - - def test_read - assert_equal([{"value" => [[-128, 127], nil, [-1, 0, 1]]}], - read) - end - end - - sub_test_case("Struct") do - def build_array - data_type = Arrow::StructDataType.new(count: :int8, - visible: :boolean) - Arrow::StructArray.new(data_type, [[-128, 
nil], nil, [nil, true]]) - end - - def test_read - assert_equal([ - { - "value" => [ - [-128, nil], - nil, - [nil, true], - ], - }, - ], - read) - end - end - - sub_test_case("DenseUnion") do - def build_array - fields = [ - Arrow::Field.new("number", :int8), - Arrow::Field.new("text", :string), - ] - type_ids = [11, 13] - data_type = Arrow::DenseUnionDataType.new(fields, type_ids) - types = Arrow::Int8Array.new([11, 13, 11, 13, 13]) - value_offsets = Arrow::Int32Array.new([0, 0, 1, 1, 2]) - children = [ - Arrow::Int8Array.new([1, nil]), - Arrow::StringArray.new(["a", "b", "c"]) - ] - Arrow::DenseUnionArray.new(data_type, - types, - value_offsets, - children) - end - - def test_read - assert_equal([{"value" => [1, "a", nil, "b", "c"]}], - read) - end - end - - sub_test_case("SparseUnion") do - def build_array - fields = [ - Arrow::Field.new("number", :int8), - Arrow::Field.new("text", :string), - ] - type_ids = [11, 13] - data_type = Arrow::SparseUnionDataType.new(fields, type_ids) - types = Arrow::Int8Array.new([11, 13, 11, 13, 11]) - children = [ - Arrow::Int8Array.new([1, nil, nil, nil, 5]), - Arrow::StringArray.new([nil, "b", nil, "d", nil]) - ] - Arrow::SparseUnionArray.new(data_type, types, children) - end - - def test_read - assert_equal([{"value" => [1, "b", nil, "d", 5]}], - read) - end - end - - sub_test_case("Map") do - def build_array - data_type = Arrow::MapDataType.new(:string, :int8) - Arrow::MapArray.new(data_type, - [ - {"a" => -128, "b" => 127}, - nil, - {"c" => nil}, - ]) - end - - def test_read - assert_equal([ - { - "value" => [ - {"a" => -128, "b" => 127}, - nil, - {"c" => nil}, - ], - }, - ], - read) - end - end -end diff --git a/ruby/red-arrow-format/test/test-reader.rb b/ruby/red-arrow-format/test/test-reader.rb new file mode 100644 index 00000000000..8095adfd50f --- /dev/null +++ b/ruby/red-arrow-format/test/test-reader.rb @@ -0,0 +1,872 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license 
agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module ReaderTests + class << self + def included(base) + base.class_eval do + sub_test_case("Null") do + def build_array + Arrow::NullArray.new(3) + end + + def test_read + assert_equal([{"value" => [nil, nil, nil]}], + read) + end + end + + sub_test_case("Boolean") do + def build_array + Arrow::BooleanArray.new([true, nil, false]) + end + + def test_read + assert_equal([{"value" => [true, nil, false]}], + read) + end + end + + sub_test_case("Int8") do + def build_array + Arrow::Int8Array.new([-128, nil, 127]) + end + + def test_read + assert_equal([{"value" => [-128, nil, 127]}], + read) + end + end + + sub_test_case("UInt8") do + def build_array + Arrow::UInt8Array.new([0, nil, 255]) + end + + def test_read + assert_equal([{"value" => [0, nil, 255]}], + read) + end + end + + sub_test_case("Int16") do + def build_array + Arrow::Int16Array.new([-32768, nil, 32767]) + end + + def test_read + assert_equal([{"value" => [-32768, nil, 32767]}], + read) + end + end + + sub_test_case("UInt16") do + def build_array + Arrow::UInt16Array.new([0, nil, 65535]) + end + + def test_read + assert_equal([{"value" => [0, nil, 65535]}], + read) + end + end + + sub_test_case("Int32") do + def build_array + Arrow::Int32Array.new([-2147483648, nil, 2147483647]) + end + + def test_read + 
assert_equal([{"value" => [-2147483648, nil, 2147483647]}], + read) + end + end + + sub_test_case("UInt32") do + def build_array + Arrow::UInt32Array.new([0, nil, 4294967295]) + end + + def test_read + assert_equal([{"value" => [0, nil, 4294967295]}], + read) + end + end + + sub_test_case("Int64") do + def build_array + Arrow::Int64Array.new([ + -9223372036854775808, + nil, + 9223372036854775807 + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + -9223372036854775808, + nil, + 9223372036854775807 + ] + } + ], + read) + end + end + + sub_test_case("UInt64") do + def build_array + Arrow::UInt64Array.new([0, nil, 18446744073709551615]) + end + + def test_read + assert_equal([{"value" => [0, nil, 18446744073709551615]}], + read) + end + end + + sub_test_case("Float32") do + def build_array + Arrow::FloatArray.new([-0.5, nil, 0.5]) + end + + def test_read + assert_equal([{"value" => [-0.5, nil, 0.5]}], + read) + end + end + + sub_test_case("Float64") do + def build_array + Arrow::DoubleArray.new([-0.5, nil, 0.5]) + end + + def test_read + assert_equal([{"value" => [-0.5, nil, 0.5]}], + read) + end + end + + sub_test_case("Date32") do + def setup(&block) + @date_2017_08_28 = 17406 + @date_2025_12_09 = 20431 + super(&block) + end + + def build_array + Arrow::Date32Array.new([@date_2017_08_28, nil, @date_2025_12_09]) + end + + def test_read + assert_equal([ + { + "value" => [ + @date_2017_08_28, + nil, + @date_2025_12_09, + ], + }, + ], + read) + end + end + + sub_test_case("Date64") do + def setup(&block) + @date_2017_08_28_00_00_00 = 1503878400000 + @date_2025_12_09_00_00_00 = 1765324800000 + super(&block) + end + + def build_array + Arrow::Date64Array.new([ + @date_2017_08_28_00_00_00, + nil, + @date_2025_12_09_00_00_00, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + @date_2017_08_28_00_00_00, + nil, + @date_2025_12_09_00_00_00, + ], + }, + ], + read) + end + end + + sub_test_case("Time32(:second)") do + def setup(&block) + 
@time_00_00_10 = 10 + @time_00_01_10 = 60 + 10 + super(&block) + end + + def build_array + Arrow::Time32Array.new(:second, [@time_00_00_10, nil, @time_00_01_10]) + end + + def test_read + assert_equal([ + { + "value" => [ + @time_00_00_10, + nil, + @time_00_01_10, + ], + }, + ], + read) + end + + def test_type + assert_equal(:second, type.unit) + end + end + + sub_test_case("Time32(:millisecond)") do + def setup(&block) + @time_00_00_10_000 = 10 * 1000 + @time_00_01_10_000 = (60 + 10) * 1000 + super(&block) + end + + def build_array + Arrow::Time32Array.new(:milli, + [ + @time_00_00_10_000, + nil, + @time_00_01_10_000, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + @time_00_00_10_000, + nil, + @time_00_01_10_000, + ], + }, + ], + read) + end + + def test_type + assert_equal(:millisecond, type.unit) + end + end + + sub_test_case("Time64(:microsecond)") do + def setup(&block) + @time_00_00_10_000_000 = 10 * 1_000_000 + @time_00_01_10_000_000 = (60 + 10) * 1_000_000 + super(&block) + end + + def build_array + Arrow::Time64Array.new(:micro, + [ + @time_00_00_10_000_000, + nil, + @time_00_01_10_000_000, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + @time_00_00_10_000_000, + nil, + @time_00_01_10_000_000, + ], + }, + ], + read) + end + + def test_type + assert_equal(:microsecond, type.unit) + end + end + + sub_test_case("Time64(:nanosecond)") do + def setup(&block) + @time_00_00_10_000_000_000 = 10 * 1_000_000_000 + @time_00_01_10_000_000_000 = (60 + 10) * 1_000_000_000 + super(&block) + end + + def build_array + Arrow::Time64Array.new(:nano, + [ + @time_00_00_10_000_000_000, + nil, + @time_00_01_10_000_000_000, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + @time_00_00_10_000_000_000, + nil, + @time_00_01_10_000_000_000, + ], + }, + ], + read) + end + + def test_type + assert_equal(:nanosecond, type.unit) + end + end + + sub_test_case("Timestamp(:second)") do + def setup(&block) + 
@timestamp_2019_11_18_00_09_11 = 1574003351 + @timestamp_2025_12_16_05_33_58 = 1765863238 + super(&block) + end + + def build_array + Arrow::TimestampArray.new(:second, + [ + @timestamp_2019_11_18_00_09_11, + nil, + @timestamp_2025_12_16_05_33_58, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + @timestamp_2019_11_18_00_09_11, + nil, + @timestamp_2025_12_16_05_33_58, + ], + }, + ], + read) + end + end + + sub_test_case("Timestamp(:millisecond)") do + def setup(&block) + @timestamp_2019_11_18_00_09_11 = 1574003351 * 1_000 + @timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000 + super(&block) + end + + def build_array + Arrow::TimestampArray.new(:milli, + [ + @timestamp_2019_11_18_00_09_11, + nil, + @timestamp_2025_12_16_05_33_58, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + @timestamp_2019_11_18_00_09_11, + nil, + @timestamp_2025_12_16_05_33_58, + ], + }, + ], + read) + end + end + + sub_test_case("Timestamp(:microsecond)") do + def setup(&block) + @timestamp_2019_11_18_00_09_11 = 1574003351 * 1_000_000 + @timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000_000 + super(&block) + end + + def build_array + Arrow::TimestampArray.new(:micro, + [ + @timestamp_2019_11_18_00_09_11, + nil, + @timestamp_2025_12_16_05_33_58, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + @timestamp_2019_11_18_00_09_11, + nil, + @timestamp_2025_12_16_05_33_58, + ], + }, + ], + read) + end + end + + sub_test_case("Timestamp(:nanosecond)") do + def setup(&block) + @timestamp_2019_11_18_00_09_11 = 1574003351 * 1_000_000_000 + @timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000_000_000 + super(&block) + end + + def build_array + Arrow::TimestampArray.new(:nano, + [ + @timestamp_2019_11_18_00_09_11, + nil, + @timestamp_2025_12_16_05_33_58, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + @timestamp_2019_11_18_00_09_11, + nil, + @timestamp_2025_12_16_05_33_58, + ], + }, + ], + read) + end + end + + 
sub_test_case("Timestamp(timezone)") do + def setup(&block) + @timezone = "UTC" + @timestamp_2019_11_18_00_09_11 = 1574003351 + @timestamp_2025_12_16_05_33_58 = 1765863238 + super(&block) + end + + def build_array + data_type = Arrow::TimestampDataType.new(:second, @timezone) + Arrow::TimestampArray.new(data_type, + [ + @timestamp_2019_11_18_00_09_11, + nil, + @timestamp_2025_12_16_05_33_58, + ]) + end + + def test_type + assert_equal([:second, @timezone], + [type.unit, type.timezone]) + end + end + + sub_test_case("YearMonthInterval") do + def build_array + Arrow::MonthIntervalArray.new([0, nil, 100]) + end + + def test_read + assert_equal([{"value" => [0, nil, 100]}], + read) + end + end + + sub_test_case("DayTimeInterval") do + def build_array + Arrow::DayTimeIntervalArray.new([ + {day: 1, millisecond: 100}, + nil, + {day: 3, millisecond: 300}, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + [1, 100], + nil, + [3, 300], + ], + }, + ], + read) + end + end + + sub_test_case("MonthDayNanoInterval") do + def build_array + Arrow::MonthDayNanoIntervalArray.new([ + { + month: 1, + day: 1, + nanosecond: 100, + }, + nil, + { + month: 3, + day: 3, + nanosecond: 300, + }, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + [1, 1, 100], + nil, + [3, 3, 300], + ], + }, + ], + read) + end + end + + sub_test_case("Duration(:second)") do + def build_array + Arrow::DurationArray.new(:second, [0, nil, 100]) + end + + def test_read + assert_equal([{"value" => [0, nil, 100]}], + read) + end + + def test_type + assert_equal(:second, type.unit) + end + end + + sub_test_case("Duration(:millisecond)") do + def build_array + Arrow::DurationArray.new(:milli, [0, nil, 100_000]) + end + + def test_read + assert_equal([{"value" => [0, nil, 100_000]}], + read) + end + + def test_type + assert_equal(:millisecond, type.unit) + end + end + + sub_test_case("Duration(:microsecond)") do + def build_array + Arrow::DurationArray.new(:micro, [0, nil, 100_000_000]) + 
end + + def test_read + assert_equal([{"value" => [0, nil, 100_000_000]}], + read) + end + + def test_type + assert_equal(:microsecond, type.unit) + end + end + + sub_test_case("Duration(:nanosecond)") do + def build_array + Arrow::DurationArray.new(:nano, [0, nil, 100_000_000_000]) + end + + def test_read + assert_equal([{"value" => [0, nil, 100_000_000_000]}], + read) + end + + def test_type + assert_equal(:nanosecond, type.unit) + end + end + + sub_test_case("Binary") do + def build_array + Arrow::BinaryArray.new(["Hello".b, nil, "World".b]) + end + + def test_read + assert_equal([{"value" => ["Hello".b, nil, "World".b]}], + read) + end + end + + sub_test_case("LargeBinary") do + def build_array + Arrow::LargeBinaryArray.new(["Hello".b, nil, "World".b]) + end + + def test_read + assert_equal([{"value" => ["Hello".b, nil, "World".b]}], + read) + end + end + + sub_test_case("UTF8") do + def build_array + Arrow::StringArray.new(["Hello", nil, "World"]) + end + + def test_read + assert_equal([{"value" => ["Hello", nil, "World"]}], + read) + end + end + + sub_test_case("LargeUTF8") do + def build_array + Arrow::LargeStringArray.new(["Hello", nil, "World"]) + end + + def test_read + assert_equal([{"value" => ["Hello", nil, "World"]}], + read) + end + end + + sub_test_case("FixedSizeBinary") do + def build_array + data_type = Arrow::FixedSizeBinaryDataType.new(4) + Arrow::FixedSizeBinaryArray.new(data_type, + ["0124".b, nil, "abcd".b]) + end + + def test_read + assert_equal([{"value" => ["0124".b, nil, "abcd".b]}], + read) + end + end + + sub_test_case("List") do + def build_array + data_type = Arrow::ListDataType.new(name: "count", type: :int8) + Arrow::ListArray.new(data_type, [[-128, 127], nil, [-1, 0, 1]]) + end + + def test_read + assert_equal([{"value" => [[-128, 127], nil, [-1, 0, 1]]}], + read) + end + end + + sub_test_case("LargeList") do + def build_array + data_type = Arrow::LargeListDataType.new(name: "count", + type: :int8) + 
Arrow::LargeListArray.new(data_type, + [[-128, 127], nil, [-1, 0, 1]]) + end + + def test_read + assert_equal([ + { + "value" => [ + [-128, 127], + nil, + [-1, 0, 1], + ], + }, + ], + read) + end + end + + sub_test_case("Struct") do + def build_array + data_type = Arrow::StructDataType.new(count: :int8, + visible: :boolean) + Arrow::StructArray.new(data_type, + [[-128, nil], nil, [nil, true]]) + end + + def test_read + assert_equal([ + { + "value" => [ + [-128, nil], + nil, + [nil, true], + ], + }, + ], + read) + end + end + + sub_test_case("DenseUnion") do + def build_array + fields = [ + Arrow::Field.new("number", :int8), + Arrow::Field.new("text", :string), + ] + type_ids = [11, 13] + data_type = Arrow::DenseUnionDataType.new(fields, type_ids) + types = Arrow::Int8Array.new([11, 13, 11, 13, 13]) + value_offsets = Arrow::Int32Array.new([0, 0, 1, 1, 2]) + children = [ + Arrow::Int8Array.new([1, nil]), + Arrow::StringArray.new(["a", "b", "c"]) + ] + Arrow::DenseUnionArray.new(data_type, + types, + value_offsets, + children) + end + + def test_read + assert_equal([{"value" => [1, "a", nil, "b", "c"]}], + read) + end + end + + sub_test_case("SparseUnion") do + def build_array + fields = [ + Arrow::Field.new("number", :int8), + Arrow::Field.new("text", :string), + ] + type_ids = [11, 13] + data_type = Arrow::SparseUnionDataType.new(fields, type_ids) + types = Arrow::Int8Array.new([11, 13, 11, 13, 11]) + children = [ + Arrow::Int8Array.new([1, nil, nil, nil, 5]), + Arrow::StringArray.new([nil, "b", nil, "d", nil]) + ] + Arrow::SparseUnionArray.new(data_type, types, children) + end + + def test_read + assert_equal([{"value" => [1, "b", nil, "d", 5]}], + read) + end + end + + sub_test_case("Map") do + def build_array + data_type = Arrow::MapDataType.new(:string, :int8) + Arrow::MapArray.new(data_type, + [ + {"a" => -128, "b" => 127}, + nil, + {"c" => nil}, + ]) + end + + def test_read + assert_equal([ + { + "value" => [ + {"a" => -128, "b" => 127}, + nil, + {"c" => nil}, 
+ ], + }, + ], + read) + end + end + end + end + end +end + +class TestFileReader < Test::Unit::TestCase + include ReaderTests + + def setup + Dir.mktmpdir do |tmp_dir| + table = Arrow::Table.new(value: build_array) + @path = File.join(tmp_dir, "data.arrow") + table.save(@path) + File.open(@path, "rb") do |input| + @reader = ArrowFormat::FileReader.new(input) + yield + @reader = nil + end + GC.start + end + end + + def read + @reader.to_a.collect do |record_batch| + record_batch.to_h.tap do |hash| + hash.each do |key, value| + hash[key] = value.to_a + end + end + end + end + + def type + @type ||= @reader.first.schema.fields[0].type + end +end + +class TestStreamingReader < Test::Unit::TestCase + include ReaderTests + + def setup + Dir.mktmpdir do |tmp_dir| + table = Arrow::Table.new(value: build_array) + @path = File.join(tmp_dir, "data.arrows") + table.save(@path) + File.open(@path, "rb") do |input| + @reader = ArrowFormat::StreamingReader.new(input) + yield + @reader = nil + end + GC.start + end + end + + def read + @reader.to_a.collect do |record_batch| + record_batch.to_h.tap do |hash| + hash.each do |key, value| + hash[key] = value.to_a + end + end + end + end + + def type + @type ||= @reader.first.schema.fields[0].type + end +end