From ae15c3f4735e5fdaabc19a71824493036fa0bfa3 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Mon, 13 Jan 2025 11:04:41 +0800 Subject: [PATCH 01/11] WIP: [C++][Parquet] Add RowRanges API --- cpp/src/parquet/CMakeLists.txt | 1 + cpp/src/parquet/row_range.cc | 225 +++++++++++++++++++++++++++++++++ cpp/src/parquet/row_range.h | 87 +++++++++++++ 3 files changed, 313 insertions(+) create mode 100644 cpp/src/parquet/row_range.cc create mode 100644 cpp/src/parquet/row_range.h diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 83eb522484b..e1400713491 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -180,6 +180,7 @@ set(PARQUET_SRCS platform.cc printer.cc properties.cc + row_range.cc schema.cc size_statistics.cc statistics.cc diff --git a/cpp/src/parquet/row_range.cc b/cpp/src/parquet/row_range.cc new file mode 100644 index 00000000000..5a950d54237 --- /dev/null +++ b/cpp/src/parquet/row_range.cc @@ -0,0 +1,225 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/row_range.h" + +#include "arrow/util/bitmap_ops.h" +#include "arrow/util/unreachable.h" +#include "parquet/exception.h" + +namespace parquet { + +class IteratorImpl : public RowRanges::Iterator { + public: + explicit IteratorImpl(const RowRanges& ranges) + : iter_(ranges.ranges_.cbegin()), end_(ranges.ranges_.cend()) {} + + ~IteratorImpl() override = default; + + std::variant + NextRange() override { + if (iter_ == end_) { + return RowRanges::End(); + } + if (std::holds_alternative(*iter_)) { + return std::get(*iter_); + } + if (std::holds_alternative(*iter_)) { + return std::get(*iter_); + } + arrow::Unreachable("Invalid row ranges type"); + } + + private: + decltype(RowRanges::ranges_.cbegin()) iter_; + decltype(RowRanges::ranges_.cend()) end_; +}; + +std::unique_ptr RowRanges::NewIterator() const { + return std::make_unique(*this); +} + +void RowRanges::Validate() const { + int64_t last_end = -1; + for (const auto& range : ranges_) { + if (std::holds_alternative(range)) { + const auto& interval = std::get(range); + if (interval.start <= last_end) { + throw ParquetException("Row ranges are not in ascending order"); + } + if (interval.end < interval.start) { + throw ParquetException("Invalid interval range"); + } + last_end = interval.end; + continue; + } + if (std::holds_alternative(range)) { + const auto& bitmap = std::get(range); + if (bitmap.offset <= last_end) { + throw ParquetException("Row ranges are not in ascending order"); + } + last_end = bitmap.offset + sizeof(bitmap.bitmap) - 1; + continue; + } + arrow::Unreachable("Invalid row ranges type"); + } +} + +int64_t RowRanges::row_count() const { + int64_t count = 0; + for (const auto& range : ranges_) { + if (std::holds_alternative(range)) { + const auto& interval = std::get(range); + count += interval.end - interval.start + 1; + } + if (std::holds_alternative(range)) { + const auto& bitmap = std::get(range); + count += arrow::internal::CountSetBits( + reinterpret_cast(&bitmap.bitmap), 0, sizeof(bitmap.bitmap)); + } + arrow::Unreachable("Invalid row ranges type"); + } + return count; +} + +RowRanges RowRanges::Intersect(const RowRanges& lhs, const RowRanges& rhs) { + RowRanges result; + auto lhs_iter = lhs.NewIterator(); + auto rhs_iter = rhs.NewIterator(); + auto lhs_range = lhs_iter->NextRange(); + auto rhs_range = rhs_iter->NextRange(); + + while (!std::holds_alternative(lhs_range) && + !std::holds_alternative(rhs_range)) { + if (!std::holds_alternative(lhs_range) || + !std::holds_alternative(rhs_range)) { + throw ParquetException("Bitmap range is not yet supported"); + } + + auto& left = std::get(lhs_range); + auto& right = std::get(rhs_range); + + // Find overlapping region + int64_t start = std::max(left.start, right.start); + int64_t end = std::min(left.end, right.end); + + // If there is an overlap, add it to results + if (start <= end) { + result.ranges_.push_back(IntervalRange{start, end}); + } + + // Advance the iterator with smaller end + if (left.end < right.end) { + lhs_range = lhs_iter->NextRange(); + } else { + rhs_range = rhs_iter->NextRange(); + } + } + + return result; +} + +RowRanges RowRanges::Union(const RowRanges& lhs, const RowRanges& rhs) { + RowRanges result; + auto lhs_iter = lhs.NewIterator(); + auto rhs_iter = rhs.NewIterator(); + auto lhs_range = lhs_iter->NextRange(); + auto rhs_range = rhs_iter->NextRange(); + + if (std::holds_alternative(lhs_range)) { + return rhs; + } + if (std::holds_alternative(rhs_range)) { + return lhs; + } + + if (std::holds_alternative(lhs_range)) { + throw ParquetException("Bitmap range is not yet supported"); + } + IntervalRange current = std::get(lhs_range); + lhs_range = lhs_iter->NextRange(); + + while (!std::holds_alternative(lhs_range) || + !std::holds_alternative(rhs_range)) { + IntervalRange next; + + if (std::holds_alternative(rhs_range)) { + // Only lhs ranges remain + if (std::holds_alternative(lhs_range)) { + throw ParquetException("Bitmap range is not yet supported"); + } + next = std::get(lhs_range); + lhs_range = lhs_iter->NextRange(); + } else if (std::holds_alternative(lhs_range)) { + // Only rhs ranges remain + if (std::holds_alternative(rhs_range)) { + throw ParquetException("Bitmap range is not yet supported"); + } + next = std::get(rhs_range); + rhs_range = rhs_iter->NextRange(); + } else { + // Both iterators have ranges - pick the one with smaller start + if (std::holds_alternative(lhs_range) || + std::holds_alternative(rhs_range)) { + throw ParquetException("Bitmap range is not yet supported"); + } + const auto& left = std::get(lhs_range); + const auto& right = std::get(rhs_range); + + if (left.start <= right.start) { + next = left; + lhs_range = lhs_iter->NextRange(); + } else { + next = right; + rhs_range = rhs_iter->NextRange(); + } + } + + if (current.end + 1 >= next.start) { + // Concatenate overlapping or adjacent ranges + current.end = std::max(current.end, next.end); + } else { + // Gap between current and next range + result.ranges_.push_back(current); + current = next; + } + } + + result.ranges_.push_back(current); + return result; +} + +RowRanges RowRanges::MakeSingle(int64_t row_count) { + RowRanges rowRanges; + rowRanges.ranges_.push_back(IntervalRange{0, row_count - 1}); + return rowRanges; +} + +RowRanges RowRanges::MakeSingle(int64_t start, int64_t end) { + RowRanges rowRanges; + rowRanges.ranges_.push_back(IntervalRange{start, end}); + return rowRanges; +} + +RowRanges RowRanges::MakeIntervals(const std::vector& intervals) { + RowRanges rowRanges; + rowRanges.ranges_.reserve(intervals.size()); + rowRanges.ranges_.insert(rowRanges.ranges_.end(), intervals.cbegin(), intervals.cend()); + return rowRanges; +} + +} // namespace parquet \ No newline at end of file diff --git a/cpp/src/parquet/row_range.h b/cpp/src/parquet/row_range.h new file mode 100644 index 00000000000..a1de68ea251 --- /dev/null +++ b/cpp/src/parquet/row_range.h @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "parquet/platform.h" + +namespace parquet { + +/// RowRanges is a collection of non-overlapping and ascendingly ordered row ranges. +class PARQUET_EXPORT RowRanges { + public: + /// \brief A range of contiguous rows represented by an interval. + struct IntervalRange { + /// Start row of the range (inclusive). + int64_t start; + /// End row of the range (inclusive). + int64_t end; + }; + + /// \brief A range of contiguous rows represented by a bitmap. + struct BitmapRange { + /// Start row of the range (inclusive). + int64_t offset; + /// Zero appended if there are less than 64 elements. + uint64_t bitmap; + }; + + /// \brief An end marker for the row range iterator. + struct End {}; + + /// \brief An iterator for accessing row ranges in order. + class Iterator { + public: + virtual ~Iterator() = default; + virtual std::variant NextRange() = 0; + }; + + /// \brief Create a new iterator for accessing row ranges in order. + std::unique_ptr NewIterator() const; + + /// \brief Validate the row ranges. + /// \throws ParquetException if the row ranges are not in ascending order or + /// overlapped. + void Validate() const; + + /// \brief Get the total number of rows in the row ranges. + int64_t row_count() const; + + /// \brief Compute the intersection of two row ranges. + static RowRanges Intersect(const RowRanges& lhs, const RowRanges& rhs); + + /// \brief Compute the union of two row ranges. + static RowRanges Union(const RowRanges& lhs, const RowRanges& rhs); + + /// \brief Make a single row range of [0, row_count - 1]. + static RowRanges MakeSingle(int64_t row_count); + + /// \brief Make a single row range of [start, end]. + static RowRanges MakeSingle(int64_t start, int64_t end); + + /// \brief Make a row range from a list of intervals. + static RowRanges MakeIntervals(const std::vector& intervals); + + private: + friend class IteratorImpl; + std::vector> ranges_; +}; + +} // namespace parquet From b7ad059eecea83dfadc16dcf05386852dc1cc6a4 Mon Sep 17 00:00:00 2001 From: Samuel Wein Date: Sun, 21 Dec 2025 14:20:49 +0100 Subject: [PATCH 02/11] mark row_range as experimental. --- cpp/src/parquet/row_range.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/src/parquet/row_range.h b/cpp/src/parquet/row_range.h index a1de68ea251..06d1b62d284 100644 --- a/cpp/src/parquet/row_range.h +++ b/cpp/src/parquet/row_range.h @@ -27,7 +27,7 @@ namespace parquet { /// RowRanges is a collection of non-overlapping and ascendingly ordered row ranges. class PARQUET_EXPORT RowRanges { public: - /// \brief A range of contiguous rows represented by an interval. + /// \brief EXPERIMENTAL: A range of contiguous rows represented by an interval. struct IntervalRange { /// Start row of the range (inclusive). int64_t start; @@ -35,7 +35,7 @@ class PARQUET_EXPORT RowRanges { int64_t end; }; - /// \brief A range of contiguous rows represented by a bitmap. + /// \brief EXPERIMENTAL: A range of contiguous rows represented by a bitmap. struct BitmapRange { /// Start row of the range (inclusive). int64_t offset; @@ -43,40 +43,40 @@ class PARQUET_EXPORT RowRanges { uint64_t bitmap; }; - /// \brief An end marker for the row range iterator. + /// \brief EXPERIMENTAL: An end marker for the row range iterator. struct End {}; - /// \brief An iterator for accessing row ranges in order. + /// \brief EXPERIMENTAL: An iterator for accessing row ranges in order. class Iterator { public: virtual ~Iterator() = default; virtual std::variant NextRange() = 0; }; - /// \brief Create a new iterator for accessing row ranges in order. + /// \brief EXPERIMENTAL: Create a new iterator for accessing row ranges in order. std::unique_ptr NewIterator() const; - /// \brief Validate the row ranges. + /// \brief EXPERIMENTAL: Validate the row ranges. /// \throws ParquetException if the row ranges are not in ascending order or /// overlapped. void Validate() const; - /// \brief Get the total number of rows in the row ranges. + /// \brief EXPERIMENTAL: Get the total number of rows in the row ranges. int64_t row_count() const; - /// \brief Compute the intersection of two row ranges. + /// \brief EXPERIMENTAL: Compute the intersection of two row ranges. static RowRanges Intersect(const RowRanges& lhs, const RowRanges& rhs); - /// \brief Compute the union of two row ranges. + /// \brief EXPERIMENTAL: Compute the union of two row ranges. static RowRanges Union(const RowRanges& lhs, const RowRanges& rhs); - /// \brief Make a single row range of [0, row_count - 1]. + /// \brief EXPERIMENTAL: Make a single row range of [0, row_count - 1]. static RowRanges MakeSingle(int64_t row_count); - /// \brief Make a single row range of [start, end]. + /// \brief EXPERIMENTAL: Make a single row range of [start, end]. static RowRanges MakeSingle(int64_t start, int64_t end); - /// \brief Make a row range from a list of intervals. + /// \brief EXPERIMENTAL: Make a row range from a list of intervals. static RowRanges MakeIntervals(const std::vector& intervals); private: From 5075180ce937b435f8cf50e5256a38192698057e Mon Sep 17 00:00:00 2001 From: Samuel Wein Date: Sun, 21 Dec 2025 16:45:57 +0100 Subject: [PATCH 03/11] add cstdint to includes, mark new functionality as experimental. --- cpp/src/parquet/row_range.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/parquet/row_range.h b/cpp/src/parquet/row_range.h index 06d1b62d284..3160c450ded 100644 --- a/cpp/src/parquet/row_range.h +++ b/cpp/src/parquet/row_range.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include From 714b83f691f5211aaae295c36e02a5220302ffb8 Mon Sep 17 00:00:00 2001 From: Samuel Wein Date: Sun, 21 Dec 2025 22:33:58 +0100 Subject: [PATCH 04/11] add unit tests for row_ranges --- cpp/src/parquet/CMakeLists.txt | 3 +- cpp/src/parquet/row_range_test.cc | 515 ++++++++++++++++++++++++++++++ 2 files changed, 517 insertions(+), 1 deletion(-) create mode 100644 cpp/src/parquet/row_range_test.cc diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index a7512b6cd02..4a89a34a62e 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -394,7 +394,8 @@ add_parquet_test(reader-test level_conversion_test.cc column_scanner_test.cc reader_test.cc - stream_reader_test.cc) + stream_reader_test.cc + row_range_test.cc) add_parquet_test(writer-test SOURCES diff --git a/cpp/src/parquet/row_range_test.cc b/cpp/src/parquet/row_range_test.cc new file mode 100644 index 00000000000..b68db286370 --- /dev/null +++ b/cpp/src/parquet/row_range_test.cc @@ -0,0 +1,515 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include + +#include "parquet/exception.h" +#include "parquet/row_range.h" + +namespace parquet { + +// Test factory methods +TEST(RowRanges, MakeSingleWithCount) { + auto ranges = RowRanges::MakeSingle(100); + ASSERT_EQ(ranges.row_count(), 100); + + auto iter = ranges.NewIterator(); + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + + auto interval = std::get(range); + EXPECT_EQ(interval.start, 0); + EXPECT_EQ(interval.end, 99); + + // Should be exhausted + range = iter->NextRange(); + EXPECT_TRUE(std::holds_alternative(range)); +} + +TEST(RowRanges, MakeSingleWithStartEnd) { + auto ranges = RowRanges::MakeSingle(10, 20); + ASSERT_EQ(ranges.row_count(), 11); + + auto iter = ranges.NewIterator(); + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + + auto interval = std::get(range); + EXPECT_EQ(interval.start, 10); + EXPECT_EQ(interval.end, 20); + + range = iter->NextRange(); + EXPECT_TRUE(std::holds_alternative(range)); +} + +TEST(RowRanges, MakeIntervals) { + std::vector intervals = { + {0, 10}, + {20, 30}, + {40, 50} + }; + + auto ranges = RowRanges::MakeIntervals(intervals); + ASSERT_EQ(ranges.row_count(), 33); // 11 + 11 + 11 + + auto iter = ranges.NewIterator(); + + // First interval + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + auto interval = std::get(range); + EXPECT_EQ(interval.start, 0); + EXPECT_EQ(interval.end, 10); + + // Second interval + range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + interval = std::get(range); + EXPECT_EQ(interval.start, 20); + EXPECT_EQ(interval.end, 30); + + // Third interval + range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + interval = std::get(range); + EXPECT_EQ(interval.start, 40); + EXPECT_EQ(interval.end, 50); + + // Exhausted + range = iter->NextRange(); + EXPECT_TRUE(std::holds_alternative(range)); +} + +TEST(RowRanges, EmptyRanges) { + std::vector intervals; + auto ranges = RowRanges::MakeIntervals(intervals); + ASSERT_EQ(ranges.row_count(), 0); + + auto iter = ranges.NewIterator(); + auto range = iter->NextRange(); + EXPECT_TRUE(std::holds_alternative(range)); +} + +// Test validation +TEST(RowRanges, ValidateValidRanges) { + std::vector intervals = { + {0, 10}, + {15, 20}, + {25, 30} + }; + + auto ranges = RowRanges::MakeIntervals(intervals); + EXPECT_NO_THROW(ranges.Validate()); +} + +TEST(RowRanges, ValidateSingleRange) { + auto ranges = RowRanges::MakeSingle(100); + EXPECT_NO_THROW(ranges.Validate()); +} + +TEST(RowRanges, ValidateOverlappingRanges) { + std::vector intervals = { + {0, 10}, + {5, 15} // Overlaps with previous + }; + + auto ranges = RowRanges::MakeIntervals(intervals); + EXPECT_THROW(ranges.Validate(), ParquetException); +} + +TEST(RowRanges, ValidateAdjacentRanges) { + std::vector intervals = { + {0, 10}, + {11, 20} // Adjacent but not overlapping + }; + + auto ranges = RowRanges::MakeIntervals(intervals); + EXPECT_NO_THROW(ranges.Validate()); +} + +TEST(RowRanges, ValidateInvalidRangeTouching) { + std::vector intervals = { + {0, 10}, + {10, 20} // Touches at end/start (overlaps at 10) + }; + + auto ranges = RowRanges::MakeIntervals(intervals); + EXPECT_THROW(ranges.Validate(), ParquetException); +} + +TEST(RowRanges, ValidateNotAscendingOrder) { + std::vector intervals = { + {20, 30}, + {0, 10} // Not in ascending order + }; + + auto ranges = RowRanges::MakeIntervals(intervals); + EXPECT_THROW(ranges.Validate(), ParquetException); +} + +TEST(RowRanges, ValidateInvalidInterval) { + std::vector intervals = { + {10, 5} // end < start + }; + + auto ranges = RowRanges::MakeIntervals(intervals); + EXPECT_THROW(ranges.Validate(), ParquetException); +} + +// Test row_count +TEST(RowRanges, RowCountSingle) { + auto ranges = RowRanges::MakeSingle(50); + EXPECT_EQ(ranges.row_count(), 50); +} + +TEST(RowRanges, RowCountMultiple) { + std::vector intervals = { + {0, 9}, // 10 rows + {20, 29}, // 10 rows + {50, 54} // 5 rows + }; + + auto ranges = RowRanges::MakeIntervals(intervals); + EXPECT_EQ(ranges.row_count(), 25); +} + +TEST(RowRanges, RowCountEmpty) { + std::vector intervals; + auto ranges = RowRanges::MakeIntervals(intervals); + EXPECT_EQ(ranges.row_count(), 0); +} + +TEST(RowRanges, RowCountSingleRow) { + auto ranges = RowRanges::MakeSingle(5, 5); + EXPECT_EQ(ranges.row_count(), 1); +} + +// Test Intersect +TEST(RowRanges, IntersectNoOverlap) { + auto lhs = RowRanges::MakeIntervals({{0, 10}, {20, 30}}); + auto rhs = RowRanges::MakeIntervals({{40, 50}, {60, 70}}); + + auto result = RowRanges::Intersect(lhs, rhs); + EXPECT_EQ(result.row_count(), 0); +} + +TEST(RowRanges, IntersectCompleteOverlap) { + auto lhs = RowRanges::MakeIntervals({{0, 100}}); + auto rhs = RowRanges::MakeIntervals({{20, 30}, {40, 50}}); + + auto result = RowRanges::Intersect(lhs, rhs); + EXPECT_EQ(result.row_count(), 22); // (30-20+1) + (50-40+1) + + auto iter = result.NewIterator(); + + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + auto interval = std::get(range); + EXPECT_EQ(interval.start, 20); + EXPECT_EQ(interval.end, 30); + + range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + interval = std::get(range); + EXPECT_EQ(interval.start, 40); + EXPECT_EQ(interval.end, 50); +} + +TEST(RowRanges, IntersectPartialOverlap) { + auto lhs = RowRanges::MakeIntervals({{0, 15}, {20, 35}}); + auto rhs = RowRanges::MakeIntervals({{10, 25}, {40, 50}}); + + auto result = RowRanges::Intersect(lhs, rhs); + EXPECT_EQ(result.row_count(), 12); // (15-10+1) + (25-20+1) + + auto iter = result.NewIterator(); + + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + auto interval = std::get(range); + EXPECT_EQ(interval.start, 10); + EXPECT_EQ(interval.end, 15); + + range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + interval = std::get(range); + EXPECT_EQ(interval.start, 20); + EXPECT_EQ(interval.end, 25); +} + +TEST(RowRanges, IntersectIdentical) { + auto lhs = RowRanges::MakeIntervals({{0, 10}, {20, 30}}); + auto rhs = RowRanges::MakeIntervals({{0, 10}, {20, 30}}); + + auto result = RowRanges::Intersect(lhs, rhs); + EXPECT_EQ(result.row_count(), 22); +} + +TEST(RowRanges, IntersectWithEmpty) { + auto lhs = RowRanges::MakeIntervals({{0, 10}}); + auto rhs = RowRanges::MakeIntervals({}); + + auto result = RowRanges::Intersect(lhs, rhs); + EXPECT_EQ(result.row_count(), 0); +} + +TEST(RowRanges, IntersectComplex) { + auto lhs = RowRanges::MakeIntervals({{0, 10}, {15, 25}, {30, 40}, {50, 60}}); + auto rhs = RowRanges::MakeIntervals({{5, 12}, {20, 35}, {55, 70}}); + + auto result = RowRanges::Intersect(lhs, rhs); + + // Expected intersections: + // [5, 10] from [0,10] ∩ [5,12] = 6 rows + // [20, 25] from [15,25] ∩ [20,35] = 6 rows + // [30, 35] from [30,40] ∩ [20,35] = 6 rows + // [55, 60] from [50,60] ∩ [55,70] = 6 rows + EXPECT_EQ(result.row_count(), 24); +} + +// Test Union +TEST(RowRanges, UnionNoOverlap) { + auto lhs = RowRanges::MakeIntervals({{0, 10}, {20, 30}}); + auto rhs = RowRanges::MakeIntervals({{40, 50}, {60, 70}}); + + auto result = RowRanges::Union(lhs, rhs); + EXPECT_EQ(result.row_count(), 44); // 11+11+11+11 + + auto iter = result.NewIterator(); + + // Should have 4 separate ranges + for (int i = 0; i < 4; ++i) { + auto range = iter->NextRange(); + EXPECT_TRUE(std::holds_alternative(range)); + } + + auto range = iter->NextRange(); + EXPECT_TRUE(std::holds_alternative(range)); +} + +TEST(RowRanges, UnionWithOverlap) { + auto lhs = RowRanges::MakeIntervals({{0, 15}}); + auto rhs = RowRanges::MakeIntervals({{10, 25}}); + + auto result = RowRanges::Union(lhs, rhs); + EXPECT_EQ(result.row_count(), 26); // [0, 25] = 26 rows + + auto iter = result.NewIterator(); + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + auto interval = std::get(range); + EXPECT_EQ(interval.start, 0); + EXPECT_EQ(interval.end, 25); +} + +TEST(RowRanges, UnionAdjacent) { + auto lhs = RowRanges::MakeIntervals({{0, 10}}); + auto rhs = RowRanges::MakeIntervals({{11, 20}}); + + auto result = RowRanges::Union(lhs, rhs); + EXPECT_EQ(result.row_count(), 21); // [0, 20] = 21 rows + + // Should merge adjacent ranges + auto iter = result.NewIterator(); + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + auto interval = std::get(range); + EXPECT_EQ(interval.start, 0); + EXPECT_EQ(interval.end, 20); + + range = iter->NextRange(); + EXPECT_TRUE(std::holds_alternative(range)); +} + +TEST(RowRanges, UnionWithGap) { + auto lhs = RowRanges::MakeIntervals({{0, 10}}); + auto rhs = RowRanges::MakeIntervals({{20, 30}}); + + auto result = RowRanges::Union(lhs, rhs); + EXPECT_EQ(result.row_count(), 22); + + // Should have 2 ranges + auto iter = result.NewIterator(); + + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + auto interval = std::get(range); + EXPECT_EQ(interval.start, 0); + EXPECT_EQ(interval.end, 10); + + range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + interval = std::get(range); + EXPECT_EQ(interval.start, 20); + EXPECT_EQ(interval.end, 30); +} + +TEST(RowRanges, UnionWithEmpty) { + auto lhs = RowRanges::MakeIntervals({{0, 10}}); + auto rhs = RowRanges::MakeIntervals({}); + + auto result = RowRanges::Union(lhs, rhs); + EXPECT_EQ(result.row_count(), 11); +} + +TEST(RowRanges, UnionEmptyWithNonEmpty) { + auto lhs = RowRanges::MakeIntervals({}); + auto rhs = RowRanges::MakeIntervals({{0, 10}}); + + auto result = RowRanges::Union(lhs, rhs); + EXPECT_EQ(result.row_count(), 11); +} + +TEST(RowRanges, UnionIdentical) { + auto lhs = RowRanges::MakeIntervals({{0, 10}, {20, 30}}); + auto rhs = RowRanges::MakeIntervals({{0, 10}, {20, 30}}); + + auto result = RowRanges::Union(lhs, rhs); + EXPECT_EQ(result.row_count(), 22); + + // Should still have 2 ranges (merged) + auto iter = result.NewIterator(); + + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + + range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + + range = iter->NextRange(); + EXPECT_TRUE(std::holds_alternative(range)); +} + +TEST(RowRanges, UnionComplex) { + auto lhs = RowRanges::MakeIntervals({{0, 10}, {20, 30}, {50, 60}}); + auto rhs = RowRanges::MakeIntervals({{5, 15}, {25, 35}, {45, 55}}); + + auto result = RowRanges::Union(lhs, rhs); + + // Expected: [0,15], [20,35], [45,60] + EXPECT_EQ(result.row_count(), 48); // 16 + 16 + 16 + + auto iter = result.NewIterator(); + + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + auto interval = std::get(range); + EXPECT_EQ(interval.start, 0); + EXPECT_EQ(interval.end, 15); + + range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + interval = std::get(range); + EXPECT_EQ(interval.start, 20); + EXPECT_EQ(interval.end, 35); + + range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + interval = std::get(range); + EXPECT_EQ(interval.start, 45); + EXPECT_EQ(interval.end, 60); +} + +TEST(RowRanges, UnionManyOverlapping) { + auto lhs = RowRanges::MakeIntervals({{0, 100}}); + auto rhs = RowRanges::MakeIntervals({{50, 150}}); + + auto result = RowRanges::Union(lhs, rhs); + EXPECT_EQ(result.row_count(), 151); // [0, 150] + + auto iter = result.NewIterator(); + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + auto interval = std::get(range); + EXPECT_EQ(interval.start, 0); + EXPECT_EQ(interval.end, 150); +} + +// Test iterator behavior +TEST(RowRanges, IteratorMultipleIterators) { + auto ranges = RowRanges::MakeIntervals({{0, 10}, {20, 30}}); + + auto iter1 = ranges.NewIterator(); + auto iter2 = ranges.NewIterator(); + + // Both iterators should work independently + auto range1 = iter1->NextRange(); + auto range2 = iter2->NextRange(); + + ASSERT_TRUE(std::holds_alternative(range1)); + ASSERT_TRUE(std::holds_alternative(range2)); + + auto interval1 = std::get(range1); + auto interval2 = std::get(range2); + + EXPECT_EQ(interval1.start, interval2.start); + EXPECT_EQ(interval1.end, interval2.end); +} + +TEST(RowRanges, IteratorExhaustion) { + auto ranges = RowRanges::MakeSingle(10); + auto iter = ranges.NewIterator(); + + // First call returns the range + auto range = iter->NextRange(); + EXPECT_TRUE(std::holds_alternative(range)); + + // Subsequent calls should return End + range = iter->NextRange(); + EXPECT_TRUE(std::holds_alternative(range)); + + range = iter->NextRange(); + EXPECT_TRUE(std::holds_alternative(range)); +} + +// Test edge cases +TEST(RowRanges, LargeRanges) { + auto ranges = RowRanges::MakeSingle(0, 1000000); + EXPECT_EQ(ranges.row_count(), 1000001); + EXPECT_NO_THROW(ranges.Validate()); +} + +TEST(RowRanges, ZeroStartRange) { + auto ranges = RowRanges::MakeSingle(0, 0); + EXPECT_EQ(ranges.row_count(), 1); + + auto iter = ranges.NewIterator(); + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + auto interval = std::get(range); + EXPECT_EQ(interval.start, 0); + EXPECT_EQ(interval.end, 0); +} + +TEST(RowRanges, IntersectAndUnionCommutative) { + auto lhs = RowRanges::MakeIntervals({{0, 10}, {20, 30}}); + auto rhs = RowRanges::MakeIntervals({{5, 15}, {25, 35}}); + + // Intersect should be commutative + auto intersect1 = RowRanges::Intersect(lhs, rhs); + auto intersect2 = RowRanges::Intersect(rhs, lhs); + EXPECT_EQ(intersect1.row_count(), intersect2.row_count()); + + // Union should be commutative + auto union1 = RowRanges::Union(lhs, rhs); + auto union2 = RowRanges::Union(rhs, lhs); + EXPECT_EQ(union1.row_count(), union2.row_count()); +} + +} // namespace parquet From aa16c1a93c2524f8b17d3103aa9df104a5e92878 Mon Sep 17 00:00:00 2001 From: Samuel Wein Date: Sun, 21 Dec 2025 22:46:14 +0100 Subject: [PATCH 05/11] rename row_range to row_selection --- cpp/src/parquet/CMakeLists.txt | 4 +- cpp/src/parquet/row_range_test.cc | 515 ------------------ .../{row_range.cc => row_selection.cc} | 78 +-- .../parquet/{row_range.h => row_selection.h} | 14 +- cpp/src/parquet/row_selection_test.cc | 515 ++++++++++++++++++ 5 files changed, 563 insertions(+), 563 deletions(-) delete mode 100644 cpp/src/parquet/row_range_test.cc rename cpp/src/parquet/{row_range.cc => row_selection.cc} (71%) rename cpp/src/parquet/{row_range.h => row_selection.h} (84%) create mode 100644 cpp/src/parquet/row_selection_test.cc diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 4a89a34a62e..5474cf12523 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -185,7 +185,7 @@ set(PARQUET_SRCS platform.cc printer.cc properties.cc - row_range.cc + row_selection.cc schema.cc size_statistics.cc statistics.cc @@ -395,7 +395,7 @@ add_parquet_test(reader-test column_scanner_test.cc reader_test.cc stream_reader_test.cc - row_range_test.cc) + row_selection_test.cc) add_parquet_test(writer-test SOURCES diff --git a/cpp/src/parquet/row_range_test.cc b/cpp/src/parquet/row_range_test.cc deleted file mode 100644 index b68db286370..00000000000 --- a/cpp/src/parquet/row_range_test.cc +++ /dev/null @@ -1,515 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include - -#include - -#include "parquet/exception.h" -#include "parquet/row_range.h" - -namespace parquet { - -// Test factory methods -TEST(RowRanges, MakeSingleWithCount) { - auto ranges = RowRanges::MakeSingle(100); - ASSERT_EQ(ranges.row_count(), 100); - - auto iter = ranges.NewIterator(); - auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - - auto interval = std::get(range); - EXPECT_EQ(interval.start, 0); - EXPECT_EQ(interval.end, 99); - - // Should be exhausted - range = iter->NextRange(); - EXPECT_TRUE(std::holds_alternative(range)); -} - -TEST(RowRanges, MakeSingleWithStartEnd) { - auto ranges = RowRanges::MakeSingle(10, 20); - ASSERT_EQ(ranges.row_count(), 11); - - auto iter = ranges.NewIterator(); - auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - - auto interval = std::get(range); - EXPECT_EQ(interval.start, 10); - EXPECT_EQ(interval.end, 20); - - range = iter->NextRange(); - EXPECT_TRUE(std::holds_alternative(range)); -} - -TEST(RowRanges, MakeIntervals) { - std::vector intervals = { - {0, 10}, - {20, 30}, - {40, 50} - }; - - auto ranges = RowRanges::MakeIntervals(intervals); - ASSERT_EQ(ranges.row_count(), 33); // 11 + 11 + 11 - - auto iter = ranges.NewIterator(); - - // First interval - auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - auto interval = std::get(range); - EXPECT_EQ(interval.start, 0); - EXPECT_EQ(interval.end, 10); - - // Second interval - range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - interval = std::get(range); - EXPECT_EQ(interval.start, 20); - EXPECT_EQ(interval.end, 30); - - // Third interval - range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - interval = std::get(range); - EXPECT_EQ(interval.start, 40); - EXPECT_EQ(interval.end, 50); - - // Exhausted - range = iter->NextRange(); - EXPECT_TRUE(std::holds_alternative(range)); -} - -TEST(RowRanges, EmptyRanges) { - std::vector intervals; - auto ranges = RowRanges::MakeIntervals(intervals); - ASSERT_EQ(ranges.row_count(), 0); - - auto iter = ranges.NewIterator(); - auto range = iter->NextRange(); - EXPECT_TRUE(std::holds_alternative(range)); -} - -// Test validation -TEST(RowRanges, ValidateValidRanges) { - std::vector intervals = { - {0, 10}, - {15, 20}, - {25, 30} - }; - - auto ranges = RowRanges::MakeIntervals(intervals); - EXPECT_NO_THROW(ranges.Validate()); -} - -TEST(RowRanges, ValidateSingleRange) { - auto ranges = RowRanges::MakeSingle(100); - EXPECT_NO_THROW(ranges.Validate()); -} - -TEST(RowRanges, ValidateOverlappingRanges) { - std::vector intervals = { - {0, 10}, - {5, 15} // Overlaps with previous - }; - - auto ranges = RowRanges::MakeIntervals(intervals); - EXPECT_THROW(ranges.Validate(), ParquetException); -} - -TEST(RowRanges, ValidateAdjacentRanges) { - std::vector intervals = { - {0, 10}, - {11, 20} // Adjacent but not overlapping - }; - - auto ranges = RowRanges::MakeIntervals(intervals); - EXPECT_NO_THROW(ranges.Validate()); -} - -TEST(RowRanges, ValidateInvalidRangeTouching) { - std::vector intervals = { - {0, 10}, - {10, 20} // Touches at end/start (overlaps at 10) - }; - - auto ranges = RowRanges::MakeIntervals(intervals); - EXPECT_THROW(ranges.Validate(), ParquetException); -} - -TEST(RowRanges, ValidateNotAscendingOrder) { - std::vector intervals = { - {20, 30}, - {0, 10} // Not in ascending order - }; - - auto ranges = RowRanges::MakeIntervals(intervals); - EXPECT_THROW(ranges.Validate(), ParquetException); -} - -TEST(RowRanges, ValidateInvalidInterval) { - std::vector intervals = { - {10, 5} // end < start - }; - - auto ranges = RowRanges::MakeIntervals(intervals); - EXPECT_THROW(ranges.Validate(), ParquetException); -} - -// Test row_count -TEST(RowRanges, RowCountSingle) { - auto ranges = RowRanges::MakeSingle(50); - EXPECT_EQ(ranges.row_count(), 50); -} - -TEST(RowRanges, RowCountMultiple) { - std::vector intervals = { - {0, 9}, // 10 rows - {20, 29}, // 10 rows - {50, 54} // 5 rows - }; - - auto ranges = RowRanges::MakeIntervals(intervals); - EXPECT_EQ(ranges.row_count(), 25); -} - -TEST(RowRanges, RowCountEmpty) { - std::vector intervals; - auto ranges = RowRanges::MakeIntervals(intervals); - EXPECT_EQ(ranges.row_count(), 0); -} - -TEST(RowRanges, RowCountSingleRow) { - auto ranges = RowRanges::MakeSingle(5, 5); - EXPECT_EQ(ranges.row_count(), 1); -} - -// Test Intersect -TEST(RowRanges, IntersectNoOverlap) { - auto lhs = RowRanges::MakeIntervals({{0, 10}, {20, 30}}); - auto rhs = RowRanges::MakeIntervals({{40, 50}, {60, 70}}); - - auto result = RowRanges::Intersect(lhs, rhs); - EXPECT_EQ(result.row_count(), 0); -} - -TEST(RowRanges, IntersectCompleteOverlap) { - auto lhs = RowRanges::MakeIntervals({{0, 100}}); - auto rhs = RowRanges::MakeIntervals({{20, 30}, {40, 50}}); - - auto result = RowRanges::Intersect(lhs, rhs); - EXPECT_EQ(result.row_count(), 22); // (30-20+1) + (50-40+1) - - auto iter = result.NewIterator(); - - auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - auto interval = std::get(range); - EXPECT_EQ(interval.start, 20); - EXPECT_EQ(interval.end, 30); - - range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - interval = std::get(range); - EXPECT_EQ(interval.start, 40); - EXPECT_EQ(interval.end, 50); -} - -TEST(RowRanges, IntersectPartialOverlap) { - auto lhs = RowRanges::MakeIntervals({{0, 15}, {20, 35}}); - auto rhs = RowRanges::MakeIntervals({{10, 25}, {40, 50}}); - - auto result = RowRanges::Intersect(lhs, rhs); - EXPECT_EQ(result.row_count(), 12); // (15-10+1) + (25-20+1) - - auto iter = result.NewIterator(); - - auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - auto interval = std::get(range); - EXPECT_EQ(interval.start, 10); - EXPECT_EQ(interval.end, 15); - - range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - interval = std::get(range); - EXPECT_EQ(interval.start, 20); - EXPECT_EQ(interval.end, 25); -} - -TEST(RowRanges, IntersectIdentical) { - auto lhs = RowRanges::MakeIntervals({{0, 10}, {20, 30}}); - auto rhs = RowRanges::MakeIntervals({{0, 10}, {20, 30}}); - - auto result = RowRanges::Intersect(lhs, rhs); - EXPECT_EQ(result.row_count(), 22); -} - -TEST(RowRanges, IntersectWithEmpty) { - auto lhs = RowRanges::MakeIntervals({{0, 10}}); - auto rhs = RowRanges::MakeIntervals({}); - - auto result = RowRanges::Intersect(lhs, rhs); - EXPECT_EQ(result.row_count(), 0); -} - -TEST(RowRanges, IntersectComplex) { - auto lhs = RowRanges::MakeIntervals({{0, 10}, {15, 25}, {30, 40}, {50, 60}}); - auto rhs = RowRanges::MakeIntervals({{5, 12}, {20, 35}, {55, 70}}); - - auto result = RowRanges::Intersect(lhs, rhs); - - // Expected intersections: - // [5, 10] from [0,10] ∩ [5,12] = 6 rows - // [20, 25] from [15,25] ∩ [20,35] = 6 rows - // [30, 35] from [30,40] ∩ [20,35] = 6 rows - // [55, 60] from [50,60] ∩ [55,70] = 6 rows - EXPECT_EQ(result.row_count(), 24); -} - -// Test Union -TEST(RowRanges, UnionNoOverlap) { - auto lhs = RowRanges::MakeIntervals({{0, 10}, {20, 30}}); - auto rhs = RowRanges::MakeIntervals({{40, 50}, {60, 70}}); - - auto result = RowRanges::Union(lhs, rhs); - EXPECT_EQ(result.row_count(), 44); // 11+11+11+11 - - auto iter = result.NewIterator(); - - // Should have 4 separate ranges - for (int i = 0; i < 4; ++i) { - auto range = iter->NextRange(); - EXPECT_TRUE(std::holds_alternative(range)); - } - - auto range = iter->NextRange(); - EXPECT_TRUE(std::holds_alternative(range)); -} - -TEST(RowRanges, UnionWithOverlap) { - auto lhs = RowRanges::MakeIntervals({{0, 15}}); - auto rhs = RowRanges::MakeIntervals({{10, 25}}); - - auto result = RowRanges::Union(lhs, rhs); - EXPECT_EQ(result.row_count(), 26); // [0, 25] = 26 rows - - auto iter = result.NewIterator(); - auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - auto interval = std::get(range); - EXPECT_EQ(interval.start, 0); - EXPECT_EQ(interval.end, 25); -} - -TEST(RowRanges, UnionAdjacent) { - auto lhs = RowRanges::MakeIntervals({{0, 10}}); - auto rhs = RowRanges::MakeIntervals({{11, 20}}); - - auto result = RowRanges::Union(lhs, rhs); - EXPECT_EQ(result.row_count(), 21); // [0, 20] = 21 rows - - // Should merge adjacent ranges - auto iter = result.NewIterator(); - auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - auto interval = std::get(range); - EXPECT_EQ(interval.start, 0); - EXPECT_EQ(interval.end, 20); - - range = iter->NextRange(); - EXPECT_TRUE(std::holds_alternative(range)); -} - -TEST(RowRanges, UnionWithGap) { - auto lhs = RowRanges::MakeIntervals({{0, 10}}); - auto rhs = RowRanges::MakeIntervals({{20, 30}}); - - auto result = RowRanges::Union(lhs, rhs); - EXPECT_EQ(result.row_count(), 22); - - // Should have 2 ranges - auto iter = result.NewIterator(); - - auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - auto interval = std::get(range); - EXPECT_EQ(interval.start, 0); - EXPECT_EQ(interval.end, 10); - - range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - interval = std::get(range); - EXPECT_EQ(interval.start, 20); - EXPECT_EQ(interval.end, 30); -} - -TEST(RowRanges, UnionWithEmpty) { - auto lhs = RowRanges::MakeIntervals({{0, 10}}); - auto rhs = RowRanges::MakeIntervals({}); - - auto result = RowRanges::Union(lhs, rhs); - EXPECT_EQ(result.row_count(), 11); -} - -TEST(RowRanges, UnionEmptyWithNonEmpty) { - auto lhs = RowRanges::MakeIntervals({}); - auto rhs = RowRanges::MakeIntervals({{0, 10}}); - - auto result = RowRanges::Union(lhs, rhs); - EXPECT_EQ(result.row_count(), 11); -} - -TEST(RowRanges, UnionIdentical) { - auto lhs = RowRanges::MakeIntervals({{0, 10}, {20, 30}}); - auto rhs = RowRanges::MakeIntervals({{0, 10}, {20, 30}}); - - auto result = RowRanges::Union(lhs, rhs); - EXPECT_EQ(result.row_count(), 22); - - // Should still have 2 ranges (merged) - auto iter = result.NewIterator(); - - auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - - range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - - range = iter->NextRange(); - EXPECT_TRUE(std::holds_alternative(range)); -} - -TEST(RowRanges, UnionComplex) { - auto lhs = RowRanges::MakeIntervals({{0, 10}, {20, 30}, {50, 60}}); - auto rhs = RowRanges::MakeIntervals({{5, 15}, {25, 35}, {45, 55}}); - - auto result = RowRanges::Union(lhs, rhs); - - // Expected: [0,15], [20,35], [45,60] - EXPECT_EQ(result.row_count(), 48); // 16 + 16 + 16 - - auto iter = result.NewIterator(); - - auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - auto interval = std::get(range); - EXPECT_EQ(interval.start, 0); - EXPECT_EQ(interval.end, 15); - - range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - interval = std::get(range); - EXPECT_EQ(interval.start, 20); - EXPECT_EQ(interval.end, 35); - - range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - interval = std::get(range); - EXPECT_EQ(interval.start, 45); - EXPECT_EQ(interval.end, 60); -} - -TEST(RowRanges, UnionManyOverlapping) { - auto lhs = RowRanges::MakeIntervals({{0, 100}}); - auto rhs = RowRanges::MakeIntervals({{50, 150}}); - - auto result = RowRanges::Union(lhs, rhs); - EXPECT_EQ(result.row_count(), 151); // [0, 150] - - auto iter = result.NewIterator(); - auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - auto interval = std::get(range); - EXPECT_EQ(interval.start, 0); - EXPECT_EQ(interval.end, 150); -} - -// Test iterator behavior -TEST(RowRanges, IteratorMultipleIterators) { - auto ranges = RowRanges::MakeIntervals({{0, 10}, {20, 30}}); - - auto iter1 = ranges.NewIterator(); - auto iter2 = ranges.NewIterator(); - - // Both iterators should work independently - auto range1 = iter1->NextRange(); - auto range2 = iter2->NextRange(); - - ASSERT_TRUE(std::holds_alternative(range1)); - ASSERT_TRUE(std::holds_alternative(range2)); - - auto interval1 = std::get(range1); - auto interval2 = std::get(range2); - - EXPECT_EQ(interval1.start, interval2.start); - EXPECT_EQ(interval1.end, interval2.end); -} - -TEST(RowRanges, IteratorExhaustion) { - auto ranges = RowRanges::MakeSingle(10); - auto iter = ranges.NewIterator(); - - // First call returns the range - auto range = iter->NextRange(); - EXPECT_TRUE(std::holds_alternative(range)); - - // Subsequent calls should return End - range = iter->NextRange(); - EXPECT_TRUE(std::holds_alternative(range)); - - range = iter->NextRange(); - EXPECT_TRUE(std::holds_alternative(range)); -} - -// Test edge cases -TEST(RowRanges, LargeRanges) { - auto ranges = RowRanges::MakeSingle(0, 1000000); - EXPECT_EQ(ranges.row_count(), 1000001); - EXPECT_NO_THROW(ranges.Validate()); -} - -TEST(RowRanges, ZeroStartRange) { - auto ranges = RowRanges::MakeSingle(0, 0); - EXPECT_EQ(ranges.row_count(), 1); - - auto iter = ranges.NewIterator(); - auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - auto interval = std::get(range); - EXPECT_EQ(interval.start, 0); - EXPECT_EQ(interval.end, 0); -} - -TEST(RowRanges, IntersectAndUnionCommutative) { - auto lhs = RowRanges::MakeIntervals({{0, 10}, {20, 30}}); - auto rhs = RowRanges::MakeIntervals({{5, 15}, {25, 35}}); - - // Intersect should be commutative - auto intersect1 = RowRanges::Intersect(lhs, rhs); - auto intersect2 = RowRanges::Intersect(rhs, lhs); - EXPECT_EQ(intersect1.row_count(), intersect2.row_count()); - - // Union should be commutative - auto union1 = RowRanges::Union(lhs, rhs); - auto union2 = RowRanges::Union(rhs, lhs); - EXPECT_EQ(union1.row_count(), union2.row_count()); -} - -} // namespace parquet diff --git a/cpp/src/parquet/row_range.cc b/cpp/src/parquet/row_selection.cc similarity index 71% rename from cpp/src/parquet/row_range.cc rename to cpp/src/parquet/row_selection.cc index 5a950d54237..c0e8c529f13 100644 --- a/cpp/src/parquet/row_range.cc +++ b/cpp/src/parquet/row_selection.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "parquet/row_range.h" +#include "parquet/row_selection.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/unreachable.h" @@ -23,41 +23,41 @@ namespace parquet { -class IteratorImpl : public RowRanges::Iterator { +class IteratorImpl : public RowSelection::Iterator { public: - explicit IteratorImpl(const RowRanges& ranges) + explicit IteratorImpl(const RowSelection& ranges) : iter_(ranges.ranges_.cbegin()), end_(ranges.ranges_.cend()) {} ~IteratorImpl() override = default; - std::variant + std::variant NextRange() override { if (iter_ == end_) { - return RowRanges::End(); + return RowSelection::End(); } - if (std::holds_alternative(*iter_)) { - return std::get(*iter_); + if (std::holds_alternative(*iter_)) { + return std::get(*iter_); } - if (std::holds_alternative(*iter_)) { - return std::get(*iter_); + if (std::holds_alternative(*iter_)) { + return std::get(*iter_); } arrow::Unreachable("Invalid row ranges type"); } private: - decltype(RowRanges::ranges_.cbegin()) iter_; - decltype(RowRanges::ranges_.cend()) end_; + decltype(RowSelection::ranges_.cbegin()) iter_; + decltype(RowSelection::ranges_.cend()) end_; }; -std::unique_ptr RowRanges::NewIterator() const { +std::unique_ptr RowSelection::NewIterator() const { return std::make_unique(*this); } -void RowRanges::Validate() const { +void RowSelection::Validate() const { int64_t last_end = -1; for (const auto& range : ranges_) { - if (std::holds_alternative(range)) { - const auto& interval = std::get(range); + if (std::holds_alternative(range)) { + const auto& interval = std::get(range); if (interval.start <= last_end) { throw ParquetException("Row ranges are not in ascending order"); } @@ -67,8 +67,8 @@ void RowRanges::Validate() const { last_end = interval.end; continue; } - if (std::holds_alternative(range)) { - const auto& bitmap = std::get(range); + if (std::holds_alternative(range)) { + const auto& bitmap = std::get(range); if (bitmap.offset <= last_end) { throw ParquetException("Row ranges are not in ascending order"); } @@ -79,15 +79,15 @@ void RowRanges::Validate() const { } } -int64_t RowRanges::row_count() const { +int64_t RowSelection::row_count() const { int64_t count = 0; for (const auto& range : ranges_) { - if (std::holds_alternative(range)) { - const auto& interval = std::get(range); + if (std::holds_alternative(range)) { + const auto& interval = std::get(range); count += interval.end - interval.start + 1; } - if (std::holds_alternative(range)) { - const auto& bitmap = std::get(range); + if (std::holds_alternative(range)) { + const auto& bitmap = std::get(range); count += arrow::internal::CountSetBits( reinterpret_cast(&bitmap.bitmap), 0, sizeof(bitmap.bitmap)); } @@ -96,8 +96,8 @@ int64_t RowRanges::row_count() const { return count; } -RowRanges RowRanges::Intersect(const RowRanges& lhs, const RowRanges& rhs) { - RowRanges result; +RowSelection RowSelection::Intersect(const RowSelection& lhs, const RowSelection& rhs) { + RowSelection result; auto lhs_iter = lhs.NewIterator(); auto rhs_iter = rhs.NewIterator(); auto lhs_range = lhs_iter->NextRange(); @@ -133,8 +133,8 @@ RowRanges RowRanges::Intersect(const RowRanges& lhs, const RowRanges& rhs) { return result; } -RowRanges RowRanges::Union(const RowRanges& lhs, const RowRanges& rhs) { - RowRanges result; +RowSelection RowSelection::Union(const RowSelection& lhs, const RowSelection& rhs) { + RowSelection result; auto lhs_iter = lhs.NewIterator(); auto rhs_iter = rhs.NewIterator(); auto lhs_range = lhs_iter->NextRange(); @@ -203,23 +203,23 @@ RowRanges RowRanges::Union(const RowRanges& lhs, const RowRanges& rhs) { return result; } -RowRanges RowRanges::MakeSingle(int64_t row_count) { - RowRanges rowRanges; - rowRanges.ranges_.push_back(IntervalRange{0, row_count - 1}); - return rowRanges; +RowSelection RowSelection::MakeSingle(int64_t row_count) { + RowSelection rowSelection; + rowSelection.ranges_.push_back(IntervalRange{0, row_count - 1}); + return rowSelection; } -RowRanges RowRanges::MakeSingle(int64_t start, int64_t end) { - RowRanges rowRanges; - rowRanges.ranges_.push_back(IntervalRange{start, end}); - return rowRanges; +RowSelection RowSelection::MakeSingle(int64_t start, int64_t end) { + RowSelection rowSelection; + rowSelection.ranges_.push_back(IntervalRange{start, end}); + return rowSelection; } -RowRanges RowRanges::MakeIntervals(const std::vector& intervals) { - RowRanges rowRanges; - rowRanges.ranges_.reserve(intervals.size()); - rowRanges.ranges_.insert(rowRanges.ranges_.end(), intervals.cbegin(), intervals.cend()); - return rowRanges; +RowSelection RowSelection::MakeIntervals(const std::vector& intervals) { + RowSelection rowSelection; + rowSelection.ranges_.reserve(intervals.size()); + rowSelection.ranges_.insert(rowSelection.ranges_.end(), intervals.cbegin(), intervals.cend()); + return rowSelection; } } // namespace parquet \ No newline at end of file diff --git a/cpp/src/parquet/row_range.h b/cpp/src/parquet/row_selection.h similarity index 84% rename from cpp/src/parquet/row_range.h rename to cpp/src/parquet/row_selection.h index 3160c450ded..e20d072cef0 100644 --- a/cpp/src/parquet/row_range.h +++ b/cpp/src/parquet/row_selection.h @@ -25,8 +25,8 @@ namespace parquet { -/// RowRanges is a collection of non-overlapping and ascendingly ordered row ranges. -class PARQUET_EXPORT RowRanges { +/// RowSelection is a collection of non-overlapping and ascendingly ordered row ranges. +class PARQUET_EXPORT RowSelection { public: /// \brief EXPERIMENTAL: A range of contiguous rows represented by an interval. struct IntervalRange { @@ -66,19 +66,19 @@ class PARQUET_EXPORT RowRanges { int64_t row_count() const; /// \brief EXPERIMENTAL: Compute the intersection of two row ranges. - static RowRanges Intersect(const RowRanges& lhs, const RowRanges& rhs); + static RowSelection Intersect(const RowSelection& lhs, const RowSelection& rhs); /// \brief EXPERIMENTAL: Compute the union of two row ranges. - static RowRanges Union(const RowRanges& lhs, const RowRanges& rhs); + static RowSelection Union(const RowSelection& lhs, const RowSelection& rhs); /// \brief EXPERIMENTAL: Make a single row range of [0, row_count - 1]. - static RowRanges MakeSingle(int64_t row_count); + static RowSelection MakeSingle(int64_t row_count); /// \brief EXPERIMENTAL: Make a single row range of [start, end]. - static RowRanges MakeSingle(int64_t start, int64_t end); + static RowSelection MakeSingle(int64_t start, int64_t end); /// \brief EXPERIMENTAL: Make a row range from a list of intervals. - static RowRanges MakeIntervals(const std::vector& intervals); + static RowSelection MakeIntervals(const std::vector& intervals); private: friend class IteratorImpl; diff --git a/cpp/src/parquet/row_selection_test.cc b/cpp/src/parquet/row_selection_test.cc new file mode 100644 index 00000000000..9ba76468b1f --- /dev/null +++ b/cpp/src/parquet/row_selection_test.cc @@ -0,0 +1,515 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include + +#include "parquet/exception.h" +#include "parquet/row_selection.h" + +namespace parquet { + +// Test factory methods +TEST(RowSelection, MakeSingleWithCount) { + auto ranges = RowSelection::MakeSingle(100); + ASSERT_EQ(ranges.row_count(), 100); + + auto iter = ranges.NewIterator(); + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + + auto interval = std::get(range); + EXPECT_EQ(interval.start, 0); + EXPECT_EQ(interval.end, 99); + + // Should be exhausted + range = iter->NextRange(); + EXPECT_TRUE(std::holds_alternative(range)); +} + +TEST(RowSelection, MakeSingleWithStartEnd) { + auto ranges = RowSelection::MakeSingle(10, 20); + ASSERT_EQ(ranges.row_count(), 11); + + auto iter = ranges.NewIterator(); + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + + auto interval = std::get(range); + EXPECT_EQ(interval.start, 10); + EXPECT_EQ(interval.end, 20); + + range = iter->NextRange(); + EXPECT_TRUE(std::holds_alternative(range)); +} + +TEST(RowSelection, MakeIntervals) { + std::vector intervals = { + {0, 10}, + {20, 30}, + {40, 50} + }; + + auto ranges = RowSelection::MakeIntervals(intervals); + ASSERT_EQ(ranges.row_count(), 33); // 11 + 11 + 11 + + auto iter = ranges.NewIterator(); + + // First interval + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + auto interval = std::get(range); + EXPECT_EQ(interval.start, 0); + EXPECT_EQ(interval.end, 10); + + // Second interval + range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + interval = std::get(range); + EXPECT_EQ(interval.start, 20); + EXPECT_EQ(interval.end, 30); + + // Third interval + range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + interval = std::get(range); + EXPECT_EQ(interval.start, 40); + EXPECT_EQ(interval.end, 50); + + // Exhausted + range = iter->NextRange(); + EXPECT_TRUE(std::holds_alternative(range)); +} + +TEST(RowSelection, EmptyRanges) { + std::vector intervals; + auto ranges = RowSelection::MakeIntervals(intervals); + ASSERT_EQ(ranges.row_count(), 0); + + auto iter = ranges.NewIterator(); + auto range = iter->NextRange(); + EXPECT_TRUE(std::holds_alternative(range)); +} + +// Test validation +TEST(RowSelection, ValidateValidRanges) { + std::vector intervals = { + {0, 10}, + {15, 20}, + {25, 30} + }; + + auto ranges = RowSelection::MakeIntervals(intervals); + EXPECT_NO_THROW(ranges.Validate()); +} + +TEST(RowSelection, ValidateSingleRange) { + auto ranges = RowSelection::MakeSingle(100); + EXPECT_NO_THROW(ranges.Validate()); +} + +TEST(RowSelection, ValidateOverlappingRanges) { + std::vector intervals = { + {0, 10}, + {5, 15} // Overlaps with previous + }; + + auto ranges = RowSelection::MakeIntervals(intervals); + EXPECT_THROW(ranges.Validate(), ParquetException); +} + +TEST(RowSelection, ValidateAdjacentRanges) { + std::vector intervals = { + {0, 10}, + {11, 20} // Adjacent but not overlapping + }; + + auto ranges = RowSelection::MakeIntervals(intervals); + EXPECT_NO_THROW(ranges.Validate()); +} + +TEST(RowSelection, ValidateInvalidRangeTouching) { + std::vector intervals = { + {0, 10}, + {10, 20} // Touches at end/start (overlaps at 10) + }; + + auto ranges = RowSelection::MakeIntervals(intervals); + EXPECT_THROW(ranges.Validate(), ParquetException); +} + +TEST(RowSelection, ValidateNotAscendingOrder) { + std::vector intervals = { + {20, 30}, + {0, 10} // Not in ascending order + }; + + auto ranges = RowSelection::MakeIntervals(intervals); + EXPECT_THROW(ranges.Validate(), ParquetException); +} + +TEST(RowSelection, ValidateInvalidInterval) { + std::vector intervals = { + {10, 5} // end < start + }; + + auto ranges = RowSelection::MakeIntervals(intervals); + EXPECT_THROW(ranges.Validate(), ParquetException); +} + +// Test row_count +TEST(RowSelection, RowCountSingle) { + auto ranges = RowSelection::MakeSingle(50); + EXPECT_EQ(ranges.row_count(), 50); +} + +TEST(RowSelection, RowCountMultiple) { + std::vector intervals = { + {0, 9}, // 10 rows + {20, 29}, // 10 rows + {50, 54} // 5 rows + }; + + auto ranges = RowSelection::MakeIntervals(intervals); + EXPECT_EQ(ranges.row_count(), 25); +} + +TEST(RowSelection, RowCountEmpty) { + std::vector intervals; + auto ranges = RowSelection::MakeIntervals(intervals); + EXPECT_EQ(ranges.row_count(), 0); +} + +TEST(RowSelection, RowCountSingleRow) { + auto ranges = RowSelection::MakeSingle(5, 5); + EXPECT_EQ(ranges.row_count(), 1); +} + +// Test Intersect +TEST(RowSelection, IntersectNoOverlap) { + auto lhs = RowSelection::MakeIntervals({{0, 10}, {20, 30}}); + auto rhs = RowSelection::MakeIntervals({{40, 50}, {60, 70}}); + + auto result = RowSelection::Intersect(lhs, rhs); + EXPECT_EQ(result.row_count(), 0); +} + +TEST(RowSelection, IntersectCompleteOverlap) { + auto lhs = RowSelection::MakeIntervals({{0, 100}}); + auto rhs = RowSelection::MakeIntervals({{20, 30}, {40, 50}}); + + auto result = RowSelection::Intersect(lhs, rhs); + EXPECT_EQ(result.row_count(), 22); // (30-20+1) + (50-40+1) + + auto iter = result.NewIterator(); + + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + auto interval = std::get(range); + EXPECT_EQ(interval.start, 20); + EXPECT_EQ(interval.end, 30); + + range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + interval = std::get(range); + EXPECT_EQ(interval.start, 40); + EXPECT_EQ(interval.end, 50); +} + +TEST(RowSelection, IntersectPartialOverlap) { + auto lhs = RowSelection::MakeIntervals({{0, 15}, {20, 35}}); + auto rhs = RowSelection::MakeIntervals({{10, 25}, {40, 50}}); + + auto result = RowSelection::Intersect(lhs, rhs); + EXPECT_EQ(result.row_count(), 12); // (15-10+1) + (25-20+1) + + auto iter = result.NewIterator(); + + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + auto interval = std::get(range); + EXPECT_EQ(interval.start, 10); + EXPECT_EQ(interval.end, 15); + + range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + interval = std::get(range); + EXPECT_EQ(interval.start, 20); + EXPECT_EQ(interval.end, 25); +} + +TEST(RowSelection, IntersectIdentical) { + auto lhs = RowSelection::MakeIntervals({{0, 10}, {20, 30}}); + auto rhs = RowSelection::MakeIntervals({{0, 10}, {20, 30}}); + + auto result = RowSelection::Intersect(lhs, rhs); + EXPECT_EQ(result.row_count(), 22); +} + +TEST(RowSelection, IntersectWithEmpty) { + auto lhs = RowSelection::MakeIntervals({{0, 10}}); + auto rhs = RowSelection::MakeIntervals({}); + + auto result = RowSelection::Intersect(lhs, rhs); + EXPECT_EQ(result.row_count(), 0); +} + +TEST(RowSelection, IntersectComplex) { + auto lhs = RowSelection::MakeIntervals({{0, 10}, {15, 25}, {30, 40}, {50, 60}}); + auto rhs = RowSelection::MakeIntervals({{5, 12}, {20, 35}, {55, 70}}); + + auto result = RowSelection::Intersect(lhs, rhs); + + // Expected intersections: + // [5, 10] from [0,10] ∩ [5,12] = 6 rows + // [20, 25] from [15,25] ∩ [20,35] = 6 rows + // [30, 35] from [30,40] ∩ [20,35] = 6 rows + // [55, 60] from [50,60] ∩ [55,70] = 6 rows + EXPECT_EQ(result.row_count(), 24); +} + +// Test Union +TEST(RowSelection, UnionNoOverlap) { + auto lhs = RowSelection::MakeIntervals({{0, 10}, {20, 30}}); + auto rhs = RowSelection::MakeIntervals({{40, 50}, {60, 70}}); + + auto result = RowSelection::Union(lhs, rhs); + EXPECT_EQ(result.row_count(), 44); // 11+11+11+11 + + auto iter = result.NewIterator(); + + // Should have 4 separate ranges + for (int i = 0; i < 4; ++i) { + auto range = iter->NextRange(); + EXPECT_TRUE(std::holds_alternative(range)); + } + + auto range = iter->NextRange(); + EXPECT_TRUE(std::holds_alternative(range)); +} + +TEST(RowSelection, UnionWithOverlap) { + auto lhs = RowSelection::MakeIntervals({{0, 15}}); + auto rhs = RowSelection::MakeIntervals({{10, 25}}); + + auto result = RowSelection::Union(lhs, rhs); + EXPECT_EQ(result.row_count(), 26); // [0, 25] = 26 rows + + auto iter = result.NewIterator(); + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + auto interval = std::get(range); + EXPECT_EQ(interval.start, 0); + EXPECT_EQ(interval.end, 25); +} + +TEST(RowSelection, UnionAdjacent) { + auto lhs = RowSelection::MakeIntervals({{0, 10}}); + auto rhs = RowSelection::MakeIntervals({{11, 20}}); + + auto result = RowSelection::Union(lhs, rhs); + EXPECT_EQ(result.row_count(), 21); // [0, 20] = 21 rows + + // Should merge adjacent ranges + auto iter = result.NewIterator(); + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + auto interval = std::get(range); + EXPECT_EQ(interval.start, 0); + EXPECT_EQ(interval.end, 20); + + range = iter->NextRange(); + EXPECT_TRUE(std::holds_alternative(range)); +} + +TEST(RowSelection, UnionWithGap) { + auto lhs = RowSelection::MakeIntervals({{0, 10}}); + auto rhs = RowSelection::MakeIntervals({{20, 30}}); + + auto result = RowSelection::Union(lhs, rhs); + EXPECT_EQ(result.row_count(), 22); + + // Should have 2 ranges + auto iter = result.NewIterator(); + + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + auto interval = std::get(range); + EXPECT_EQ(interval.start, 0); + EXPECT_EQ(interval.end, 10); + + range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + interval = std::get(range); + EXPECT_EQ(interval.start, 20); + EXPECT_EQ(interval.end, 30); +} + +TEST(RowSelection, UnionWithEmpty) { + auto lhs = RowSelection::MakeIntervals({{0, 10}}); + auto rhs = RowSelection::MakeIntervals({}); + + auto result = RowSelection::Union(lhs, rhs); + EXPECT_EQ(result.row_count(), 11); +} + +TEST(RowSelection, UnionEmptyWithNonEmpty) { + auto lhs = RowSelection::MakeIntervals({}); + auto rhs = RowSelection::MakeIntervals({{0, 10}}); + + auto result = RowSelection::Union(lhs, rhs); + EXPECT_EQ(result.row_count(), 11); +} + +TEST(RowSelection, UnionIdentical) { + auto lhs = RowSelection::MakeIntervals({{0, 10}, {20, 30}}); + auto rhs = RowSelection::MakeIntervals({{0, 10}, {20, 30}}); + + auto result = RowSelection::Union(lhs, rhs); + EXPECT_EQ(result.row_count(), 22); + + // Should still have 2 ranges (merged) + auto iter = result.NewIterator(); + + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + + range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + + range = iter->NextRange(); + EXPECT_TRUE(std::holds_alternative(range)); +} + +TEST(RowSelection, UnionComplex) { + auto lhs = RowSelection::MakeIntervals({{0, 10}, {20, 30}, {50, 60}}); + auto rhs = RowSelection::MakeIntervals({{5, 15}, {25, 35}, {45, 55}}); + + auto result = RowSelection::Union(lhs, rhs); + + // Expected: [0,15], [20,35], [45,60] + EXPECT_EQ(result.row_count(), 48); // 16 + 16 + 16 + + auto iter = result.NewIterator(); + + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + auto interval = std::get(range); + EXPECT_EQ(interval.start, 0); + EXPECT_EQ(interval.end, 15); + + range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + interval = std::get(range); + EXPECT_EQ(interval.start, 20); + EXPECT_EQ(interval.end, 35); + + range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + interval = std::get(range); + EXPECT_EQ(interval.start, 45); + EXPECT_EQ(interval.end, 60); +} + +TEST(RowSelection, UnionManyOverlapping) { + auto lhs = RowSelection::MakeIntervals({{0, 100}}); + auto rhs = RowSelection::MakeIntervals({{50, 150}}); + + auto result = RowSelection::Union(lhs, rhs); + EXPECT_EQ(result.row_count(), 151); // [0, 150] + + auto iter = result.NewIterator(); + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + auto interval = std::get(range); + EXPECT_EQ(interval.start, 0); + EXPECT_EQ(interval.end, 150); +} + +// Test iterator behavior +TEST(RowSelection, IteratorMultipleIterators) { + auto ranges = RowSelection::MakeIntervals({{0, 10}, {20, 30}}); + + auto iter1 = ranges.NewIterator(); + auto iter2 = ranges.NewIterator(); + + // Both iterators should work independently + auto range1 = iter1->NextRange(); + auto range2 = iter2->NextRange(); + + ASSERT_TRUE(std::holds_alternative(range1)); + ASSERT_TRUE(std::holds_alternative(range2)); + + auto interval1 = std::get(range1); + auto interval2 = std::get(range2); + + EXPECT_EQ(interval1.start, interval2.start); + EXPECT_EQ(interval1.end, interval2.end); +} + +TEST(RowSelection, IteratorExhaustion) { + auto ranges = RowSelection::MakeSingle(10); + auto iter = ranges.NewIterator(); + + // First call returns the range + auto range = iter->NextRange(); + EXPECT_TRUE(std::holds_alternative(range)); + + // Subsequent calls should return End + range = iter->NextRange(); + EXPECT_TRUE(std::holds_alternative(range)); + + range = iter->NextRange(); + EXPECT_TRUE(std::holds_alternative(range)); +} + +// Test edge cases +TEST(RowSelection, LargeRanges) { + auto ranges = RowSelection::MakeSingle(0, 1000000); + EXPECT_EQ(ranges.row_count(), 1000001); + EXPECT_NO_THROW(ranges.Validate()); +} + +TEST(RowSelection, ZeroStartRange) { + auto ranges = RowSelection::MakeSingle(0, 0); + EXPECT_EQ(ranges.row_count(), 1); + + auto iter = ranges.NewIterator(); + auto range = iter->NextRange(); + ASSERT_TRUE(std::holds_alternative(range)); + auto interval = std::get(range); + EXPECT_EQ(interval.start, 0); + EXPECT_EQ(interval.end, 0); +} + +TEST(RowSelection, IntersectAndUnionCommutative) { + auto lhs = RowSelection::MakeIntervals({{0, 10}, {20, 30}}); + auto rhs = RowSelection::MakeIntervals({{5, 15}, {25, 35}}); + + // Intersect should be commutative + auto intersect1 = RowSelection::Intersect(lhs, rhs); + auto intersect2 = RowSelection::Intersect(rhs, lhs); + EXPECT_EQ(intersect1.row_count(), intersect2.row_count()); + + // Union should be commutative + auto union1 = RowSelection::Union(lhs, rhs); + auto union2 = RowSelection::Union(rhs, lhs); + EXPECT_EQ(union1.row_count(), union2.row_count()); +} + +} // namespace parquet From 469359bd260482ed456045c83f1e2df82108590d Mon Sep 17 00:00:00 2001 From: Samuel Wein Date: Mon, 22 Dec 2025 09:31:26 -0500 Subject: [PATCH 06/11] Remove bitmap support for now. --- cpp/src/parquet/row_selection.cc | 129 ++++++++++--------------------- cpp/src/parquet/row_selection.h | 18 +---- 2 files changed, 44 insertions(+), 103 deletions(-) diff --git a/cpp/src/parquet/row_selection.cc b/cpp/src/parquet/row_selection.cc index c0e8c529f13..66a263f73a6 100644 --- a/cpp/src/parquet/row_selection.cc +++ b/cpp/src/parquet/row_selection.cc @@ -30,18 +30,11 @@ class IteratorImpl : public RowSelection::Iterator { ~IteratorImpl() override = default; - std::variant - NextRange() override { + std::optional NextRange() override { if (iter_ == end_) { - return RowSelection::End(); + return std::nullopt; } - if (std::holds_alternative(*iter_)) { - return std::get(*iter_); - } - if (std::holds_alternative(*iter_)) { - return std::get(*iter_); - } - arrow::Unreachable("Invalid row ranges type"); + return *iter_++; } private: @@ -55,63 +48,33 @@ std::unique_ptr RowSelection::NewIterator() const { void RowSelection::Validate() const { int64_t last_end = -1; - for (const auto& range : ranges_) { - if (std::holds_alternative(range)) { - const auto& interval = std::get(range); - if (interval.start <= last_end) { - throw ParquetException("Row ranges are not in ascending order"); - } - if (interval.end < interval.start) { - throw ParquetException("Invalid interval range"); - } - last_end = interval.end; - continue; + for (const auto& interval : ranges_) { + if (interval.start <= last_end) { + throw ParquetException("Row ranges are not in ascending order"); } - if (std::holds_alternative(range)) { - const auto& bitmap = std::get(range); - if (bitmap.offset <= last_end) { - throw ParquetException("Row ranges are not in ascending order"); - } - last_end = bitmap.offset + sizeof(bitmap.bitmap) - 1; - continue; + if (interval.end < interval.start) { + throw ParquetException("Invalid interval range"); } - arrow::Unreachable("Invalid row ranges type"); + last_end = interval.end; } } int64_t RowSelection::row_count() const { int64_t count = 0; - for (const auto& range : ranges_) { - if (std::holds_alternative(range)) { - const auto& interval = std::get(range); - count += interval.end - interval.start + 1; - } - if (std::holds_alternative(range)) { - const auto& bitmap = std::get(range); - count += arrow::internal::CountSetBits( - reinterpret_cast(&bitmap.bitmap), 0, sizeof(bitmap.bitmap)); - } - arrow::Unreachable("Invalid row ranges type"); + for (const auto& interval : ranges_) { + count += interval.end - interval.start + 1; } return count; } RowSelection RowSelection::Intersect(const RowSelection& lhs, const RowSelection& rhs) { RowSelection result; - auto lhs_iter = lhs.NewIterator(); - auto rhs_iter = rhs.NewIterator(); - auto lhs_range = lhs_iter->NextRange(); - auto rhs_range = rhs_iter->NextRange(); - - while (!std::holds_alternative(lhs_range) && - !std::holds_alternative(rhs_range)) { - if (!std::holds_alternative(lhs_range) || - !std::holds_alternative(rhs_range)) { - throw ParquetException("Bitmap range is not yet supported"); - } + size_t lhs_idx = 0; + size_t rhs_idx = 0; - auto& left = std::get(lhs_range); - auto& right = std::get(rhs_range); + while (lhs_idx < lhs.ranges_.size() && rhs_idx < rhs.ranges_.size()) { + const auto& left = lhs.ranges_[lhs_idx]; + const auto& right = rhs.ranges_[rhs_idx]; // Find overlapping region int64_t start = std::max(left.start, right.start); @@ -124,9 +87,9 @@ RowSelection RowSelection::Intersect(const RowSelection& lhs, const RowSelection // Advance the iterator with smaller end if (left.end < right.end) { - lhs_range = lhs_iter->NextRange(); + lhs_idx++; } else { - rhs_range = rhs_iter->NextRange(); + rhs_idx++; } } @@ -135,57 +98,45 @@ RowSelection RowSelection::Intersect(const RowSelection& lhs, const RowSelection RowSelection RowSelection::Union(const RowSelection& lhs, const RowSelection& rhs) { RowSelection result; - auto lhs_iter = lhs.NewIterator(); - auto rhs_iter = rhs.NewIterator(); - auto lhs_range = lhs_iter->NextRange(); - auto rhs_range = rhs_iter->NextRange(); - - if (std::holds_alternative(lhs_range)) { + + if (lhs.ranges_.empty()) { return rhs; } - if (std::holds_alternative(rhs_range)) { + if (rhs.ranges_.empty()) { return lhs; } - if (std::holds_alternative(lhs_range)) { - throw ParquetException("Bitmap range is not yet supported"); + size_t lhs_idx = 0; + size_t rhs_idx = 0; + + // Start with whichever range has the smaller start + IntervalRange current; + if (lhs.ranges_[0].start <= rhs.ranges_[0].start) { + current = lhs.ranges_[lhs_idx++]; + } else { + current = rhs.ranges_[rhs_idx++]; } - IntervalRange current = std::get(lhs_range); - lhs_range = lhs_iter->NextRange(); - while (!std::holds_alternative(lhs_range) || - !std::holds_alternative(rhs_range)) { + while (lhs_idx < lhs.ranges_.size() || rhs_idx < rhs.ranges_.size()) { IntervalRange next; - if (std::holds_alternative(rhs_range)) { + if (rhs_idx >= rhs.ranges_.size()) { // Only lhs ranges remain - if (std::holds_alternative(lhs_range)) { - throw ParquetException("Bitmap range is not yet supported"); - } - next = std::get(lhs_range); - lhs_range = lhs_iter->NextRange(); - } else if (std::holds_alternative(lhs_range)) { + next = lhs.ranges_[lhs_idx++]; + } else if (lhs_idx >= lhs.ranges_.size()) { // Only rhs ranges remain - if (std::holds_alternative(rhs_range)) { - throw ParquetException("Bitmap range is not yet supported"); - } - next = std::get(rhs_range); - rhs_range = rhs_iter->NextRange(); + next = rhs.ranges_[rhs_idx++]; } else { - // Both iterators have ranges - pick the one with smaller start - if (std::holds_alternative(lhs_range) || - std::holds_alternative(rhs_range)) { - throw ParquetException("Bitmap range is not yet supported"); - } - const auto& left = std::get(lhs_range); - const auto& right = std::get(rhs_range); + // Both have ranges - pick the one with smaller start + const auto& left = lhs.ranges_[lhs_idx]; + const auto& right = rhs.ranges_[rhs_idx]; if (left.start <= right.start) { next = left; - lhs_range = lhs_iter->NextRange(); + lhs_idx++; } else { next = right; - rhs_range = rhs_iter->NextRange(); + rhs_idx++; } } diff --git a/cpp/src/parquet/row_selection.h b/cpp/src/parquet/row_selection.h index e20d072cef0..e0c1d6dde0f 100644 --- a/cpp/src/parquet/row_selection.h +++ b/cpp/src/parquet/row_selection.h @@ -18,7 +18,7 @@ #pragma once #include -#include +#include #include #include "parquet/platform.h" @@ -36,22 +36,12 @@ class PARQUET_EXPORT RowSelection { int64_t end; }; - /// \brief EXPERIMENTAL: A range of contiguous rows represented by a bitmap. - struct BitmapRange { - /// Start row of the range (inclusive). - int64_t offset; - /// Zero appended if there are less than 64 elements. - uint64_t bitmap; - }; - - /// \brief EXPERIMENTAL: An end marker for the row range iterator. - struct End {}; - /// \brief EXPERIMENTAL: An iterator for accessing row ranges in order. class Iterator { public: virtual ~Iterator() = default; - virtual std::variant NextRange() = 0; + /// \brief Get the next range. Returns std::nullopt when exhausted. + virtual std::optional NextRange() = 0; }; /// \brief EXPERIMENTAL: Create a new iterator for accessing row ranges in order. @@ -82,7 +72,7 @@ class PARQUET_EXPORT RowSelection { private: friend class IteratorImpl; - std::vector> ranges_; + std::vector ranges_; }; } // namespace parquet From 8f9d7a5cac06c0a56259149a66c712557ddc7b8e Mon Sep 17 00:00:00 2001 From: Samuel Wein Date: Mon, 22 Dec 2025 09:59:32 -0500 Subject: [PATCH 07/11] make intervalRange(start,length) --- cpp/src/parquet/row_selection.cc | 28 +-- cpp/src/parquet/row_selection.h | 4 +- cpp/src/parquet/row_selection_test.cc | 240 +++++++++++++------------- 3 files changed, 139 insertions(+), 133 deletions(-) diff --git a/cpp/src/parquet/row_selection.cc b/cpp/src/parquet/row_selection.cc index 66a263f73a6..f61d4d449a3 100644 --- a/cpp/src/parquet/row_selection.cc +++ b/cpp/src/parquet/row_selection.cc @@ -52,17 +52,17 @@ void RowSelection::Validate() const { if (interval.start <= last_end) { throw ParquetException("Row ranges are not in ascending order"); } - if (interval.end < interval.start) { - throw ParquetException("Invalid interval range"); + if (interval.length <= 0) { + throw ParquetException("Invalid interval range: length must be positive"); } - last_end = interval.end; + last_end = interval.start + interval.length - 1; } } int64_t RowSelection::row_count() const { int64_t count = 0; for (const auto& interval : ranges_) { - count += interval.end - interval.start + 1; + count += interval.length; } return count; } @@ -76,17 +76,20 @@ RowSelection RowSelection::Intersect(const RowSelection& lhs, const RowSelection const auto& left = lhs.ranges_[lhs_idx]; const auto& right = rhs.ranges_[rhs_idx]; + int64_t left_end = left.start + left.length - 1; + int64_t right_end = right.start + right.length - 1; + // Find overlapping region int64_t start = std::max(left.start, right.start); - int64_t end = std::min(left.end, right.end); + int64_t end = std::min(left_end, right_end); // If there is an overlap, add it to results if (start <= end) { - result.ranges_.push_back(IntervalRange{start, end}); + result.ranges_.push_back(IntervalRange{start, end - start + 1}); } // Advance the iterator with smaller end - if (left.end < right.end) { + if (left_end < right_end) { lhs_idx++; } else { rhs_idx++; @@ -140,9 +143,12 @@ RowSelection RowSelection::Union(const RowSelection& lhs, const RowSelection& rh } } - if (current.end + 1 >= next.start) { + int64_t current_end = current.start + current.length - 1; + if (current_end + 1 >= next.start) { // Concatenate overlapping or adjacent ranges - current.end = std::max(current.end, next.end); + int64_t next_end = next.start + next.length - 1; + int64_t new_end = std::max(current_end, next_end); + current.length = new_end - current.start + 1; } else { // Gap between current and next range result.ranges_.push_back(current); @@ -156,13 +162,13 @@ RowSelection RowSelection::Union(const RowSelection& lhs, const RowSelection& rh RowSelection RowSelection::MakeSingle(int64_t row_count) { RowSelection rowSelection; - rowSelection.ranges_.push_back(IntervalRange{0, row_count - 1}); + rowSelection.ranges_.push_back(IntervalRange{0, row_count}); return rowSelection; } RowSelection RowSelection::MakeSingle(int64_t start, int64_t end) { RowSelection rowSelection; - rowSelection.ranges_.push_back(IntervalRange{start, end}); + rowSelection.ranges_.push_back(IntervalRange{start, end - start + 1}); return rowSelection; } diff --git a/cpp/src/parquet/row_selection.h b/cpp/src/parquet/row_selection.h index e0c1d6dde0f..1d15da64db2 100644 --- a/cpp/src/parquet/row_selection.h +++ b/cpp/src/parquet/row_selection.h @@ -32,8 +32,8 @@ class PARQUET_EXPORT RowSelection { struct IntervalRange { /// Start row of the range (inclusive). int64_t start; - /// End row of the range (inclusive). - int64_t end; + /// Number of rows in the range. + int64_t length; }; /// \brief EXPERIMENTAL: An iterator for accessing row ranges in order. diff --git a/cpp/src/parquet/row_selection_test.cc b/cpp/src/parquet/row_selection_test.cc index 9ba76468b1f..2530a6e5682 100644 --- a/cpp/src/parquet/row_selection_test.cc +++ b/cpp/src/parquet/row_selection_test.cc @@ -31,15 +31,15 @@ TEST(RowSelection, MakeSingleWithCount) { auto iter = ranges.NewIterator(); auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); + ASSERT_TRUE(range.has_value()); - auto interval = std::get(range); + auto interval = range.value(); EXPECT_EQ(interval.start, 0); - EXPECT_EQ(interval.end, 99); + EXPECT_EQ((interval.start + interval.length - 1), 99); // Should be exhausted range = iter->NextRange(); - EXPECT_TRUE(std::holds_alternative(range)); + EXPECT_TRUE(!range.has_value()); } TEST(RowSelection, MakeSingleWithStartEnd) { @@ -48,21 +48,21 @@ TEST(RowSelection, MakeSingleWithStartEnd) { auto iter = ranges.NewIterator(); auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); + ASSERT_TRUE(range.has_value()); - auto interval = std::get(range); + auto interval = range.value(); EXPECT_EQ(interval.start, 10); - EXPECT_EQ(interval.end, 20); + EXPECT_EQ((interval.start + interval.length - 1), 20); range = iter->NextRange(); - EXPECT_TRUE(std::holds_alternative(range)); + EXPECT_TRUE(!range.has_value()); } TEST(RowSelection, MakeIntervals) { std::vector intervals = { - {0, 10}, - {20, 30}, - {40, 50} + {0, 11}, + {20, 11}, + {40, 11} }; auto ranges = RowSelection::MakeIntervals(intervals); @@ -72,28 +72,28 @@ TEST(RowSelection, MakeIntervals) { // First interval auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - auto interval = std::get(range); + ASSERT_TRUE(range.has_value()); + auto interval = range.value(); EXPECT_EQ(interval.start, 0); - EXPECT_EQ(interval.end, 10); + EXPECT_EQ((interval.start + interval.length - 1), 10); // Second interval range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - interval = std::get(range); + ASSERT_TRUE(range.has_value()); + interval = range.value(); EXPECT_EQ(interval.start, 20); - EXPECT_EQ(interval.end, 30); + EXPECT_EQ((interval.start + interval.length - 1), 30); // Third interval range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - interval = std::get(range); + ASSERT_TRUE(range.has_value()); + interval = range.value(); EXPECT_EQ(interval.start, 40); - EXPECT_EQ(interval.end, 50); + EXPECT_EQ((interval.start + interval.length - 1), 50); // Exhausted range = iter->NextRange(); - EXPECT_TRUE(std::holds_alternative(range)); + EXPECT_TRUE(!range.has_value()); } TEST(RowSelection, EmptyRanges) { @@ -103,15 +103,15 @@ TEST(RowSelection, EmptyRanges) { auto iter = ranges.NewIterator(); auto range = iter->NextRange(); - EXPECT_TRUE(std::holds_alternative(range)); + EXPECT_TRUE(!range.has_value()); } // Test validation TEST(RowSelection, ValidateValidRanges) { std::vector intervals = { - {0, 10}, - {15, 20}, - {25, 30} + {0, 11}, + {15, 6}, + {25, 6} }; auto ranges = RowSelection::MakeIntervals(intervals); @@ -125,8 +125,8 @@ TEST(RowSelection, ValidateSingleRange) { TEST(RowSelection, ValidateOverlappingRanges) { std::vector intervals = { - {0, 10}, - {5, 15} // Overlaps with previous + {0, 11}, + {5, 11} // Overlaps with previous }; auto ranges = RowSelection::MakeIntervals(intervals); @@ -135,8 +135,8 @@ TEST(RowSelection, ValidateOverlappingRanges) { TEST(RowSelection, ValidateAdjacentRanges) { std::vector intervals = { - {0, 10}, - {11, 20} // Adjacent but not overlapping + {0, 11}, + {11, 10} // Adjacent but not overlapping }; auto ranges = RowSelection::MakeIntervals(intervals); @@ -145,8 +145,8 @@ TEST(RowSelection, ValidateAdjacentRanges) { TEST(RowSelection, ValidateInvalidRangeTouching) { std::vector intervals = { - {0, 10}, - {10, 20} // Touches at end/start (overlaps at 10) + {0, 11}, + {10, 11} // Touches at end/start (overlaps at 10) }; auto ranges = RowSelection::MakeIntervals(intervals); @@ -155,8 +155,8 @@ TEST(RowSelection, ValidateInvalidRangeTouching) { TEST(RowSelection, ValidateNotAscendingOrder) { std::vector intervals = { - {20, 30}, - {0, 10} // Not in ascending order + {20, 11}, + {0, 11} // Not in ascending order }; auto ranges = RowSelection::MakeIntervals(intervals); @@ -165,7 +165,7 @@ TEST(RowSelection, ValidateNotAscendingOrder) { TEST(RowSelection, ValidateInvalidInterval) { std::vector intervals = { - {10, 5} // end < start + {10, -4} // end < start }; auto ranges = RowSelection::MakeIntervals(intervals); @@ -180,9 +180,9 @@ TEST(RowSelection, RowCountSingle) { TEST(RowSelection, RowCountMultiple) { std::vector intervals = { - {0, 9}, // 10 rows - {20, 29}, // 10 rows - {50, 54} // 5 rows + {0, 10}, // 10 rows + {20, 10}, // 10 rows + {50, 5} // 5 rows }; auto ranges = RowSelection::MakeIntervals(intervals); @@ -202,16 +202,16 @@ TEST(RowSelection, RowCountSingleRow) { // Test Intersect TEST(RowSelection, IntersectNoOverlap) { - auto lhs = RowSelection::MakeIntervals({{0, 10}, {20, 30}}); - auto rhs = RowSelection::MakeIntervals({{40, 50}, {60, 70}}); + auto lhs = RowSelection::MakeIntervals({{0, 11}, {20, 11}}); + auto rhs = RowSelection::MakeIntervals({{40, 11}, {60, 11}}); auto result = RowSelection::Intersect(lhs, rhs); EXPECT_EQ(result.row_count(), 0); } TEST(RowSelection, IntersectCompleteOverlap) { - auto lhs = RowSelection::MakeIntervals({{0, 100}}); - auto rhs = RowSelection::MakeIntervals({{20, 30}, {40, 50}}); + auto lhs = RowSelection::MakeIntervals({{0, 101}}); + auto rhs = RowSelection::MakeIntervals({{20, 11}, {40, 11}}); auto result = RowSelection::Intersect(lhs, rhs); EXPECT_EQ(result.row_count(), 22); // (30-20+1) + (50-40+1) @@ -219,21 +219,21 @@ TEST(RowSelection, IntersectCompleteOverlap) { auto iter = result.NewIterator(); auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - auto interval = std::get(range); + ASSERT_TRUE(range.has_value()); + auto interval = range.value(); EXPECT_EQ(interval.start, 20); - EXPECT_EQ(interval.end, 30); + EXPECT_EQ((interval.start + interval.length - 1), 30); range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - interval = std::get(range); + ASSERT_TRUE(range.has_value()); + interval = range.value(); EXPECT_EQ(interval.start, 40); - EXPECT_EQ(interval.end, 50); + EXPECT_EQ((interval.start + interval.length - 1), 50); } TEST(RowSelection, IntersectPartialOverlap) { - auto lhs = RowSelection::MakeIntervals({{0, 15}, {20, 35}}); - auto rhs = RowSelection::MakeIntervals({{10, 25}, {40, 50}}); + auto lhs = RowSelection::MakeIntervals({{0, 16}, {20, 16}}); + auto rhs = RowSelection::MakeIntervals({{10, 16}, {40, 11}}); auto result = RowSelection::Intersect(lhs, rhs); EXPECT_EQ(result.row_count(), 12); // (15-10+1) + (25-20+1) @@ -241,28 +241,28 @@ TEST(RowSelection, IntersectPartialOverlap) { auto iter = result.NewIterator(); auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - auto interval = std::get(range); + ASSERT_TRUE(range.has_value()); + auto interval = range.value(); EXPECT_EQ(interval.start, 10); - EXPECT_EQ(interval.end, 15); + EXPECT_EQ((interval.start + interval.length - 1), 15); range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - interval = std::get(range); + ASSERT_TRUE(range.has_value()); + interval = range.value(); EXPECT_EQ(interval.start, 20); - EXPECT_EQ(interval.end, 25); + EXPECT_EQ((interval.start + interval.length - 1), 25); } TEST(RowSelection, IntersectIdentical) { - auto lhs = RowSelection::MakeIntervals({{0, 10}, {20, 30}}); - auto rhs = RowSelection::MakeIntervals({{0, 10}, {20, 30}}); + auto lhs = RowSelection::MakeIntervals({{0, 11}, {20, 11}}); + auto rhs = RowSelection::MakeIntervals({{0, 11}, {20, 11}}); auto result = RowSelection::Intersect(lhs, rhs); EXPECT_EQ(result.row_count(), 22); } TEST(RowSelection, IntersectWithEmpty) { - auto lhs = RowSelection::MakeIntervals({{0, 10}}); + auto lhs = RowSelection::MakeIntervals({{0, 11}}); auto rhs = RowSelection::MakeIntervals({}); auto result = RowSelection::Intersect(lhs, rhs); @@ -270,8 +270,8 @@ TEST(RowSelection, IntersectWithEmpty) { } TEST(RowSelection, IntersectComplex) { - auto lhs = RowSelection::MakeIntervals({{0, 10}, {15, 25}, {30, 40}, {50, 60}}); - auto rhs = RowSelection::MakeIntervals({{5, 12}, {20, 35}, {55, 70}}); + auto lhs = RowSelection::MakeIntervals({{0, 11}, {15, 11}, {30, 11}, {50, 11}}); + auto rhs = RowSelection::MakeIntervals({{5, 8}, {20, 16}, {55, 16}}); auto result = RowSelection::Intersect(lhs, rhs); @@ -285,8 +285,8 @@ TEST(RowSelection, IntersectComplex) { // Test Union TEST(RowSelection, UnionNoOverlap) { - auto lhs = RowSelection::MakeIntervals({{0, 10}, {20, 30}}); - auto rhs = RowSelection::MakeIntervals({{40, 50}, {60, 70}}); + auto lhs = RowSelection::MakeIntervals({{0, 11}, {20, 11}}); + auto rhs = RowSelection::MakeIntervals({{40, 11}, {60, 11}}); auto result = RowSelection::Union(lhs, rhs); EXPECT_EQ(result.row_count(), 44); // 11+11+11+11 @@ -296,31 +296,31 @@ TEST(RowSelection, UnionNoOverlap) { // Should have 4 separate ranges for (int i = 0; i < 4; ++i) { auto range = iter->NextRange(); - EXPECT_TRUE(std::holds_alternative(range)); + EXPECT_TRUE(range.has_value()); } auto range = iter->NextRange(); - EXPECT_TRUE(std::holds_alternative(range)); + EXPECT_TRUE(!range.has_value()); } TEST(RowSelection, UnionWithOverlap) { - auto lhs = RowSelection::MakeIntervals({{0, 15}}); - auto rhs = RowSelection::MakeIntervals({{10, 25}}); + auto lhs = RowSelection::MakeIntervals({{0, 16}}); + auto rhs = RowSelection::MakeIntervals({{10, 16}}); auto result = RowSelection::Union(lhs, rhs); EXPECT_EQ(result.row_count(), 26); // [0, 25] = 26 rows auto iter = result.NewIterator(); auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - auto interval = std::get(range); + ASSERT_TRUE(range.has_value()); + auto interval = range.value(); EXPECT_EQ(interval.start, 0); - EXPECT_EQ(interval.end, 25); + EXPECT_EQ((interval.start + interval.length - 1), 25); } TEST(RowSelection, UnionAdjacent) { - auto lhs = RowSelection::MakeIntervals({{0, 10}}); - auto rhs = RowSelection::MakeIntervals({{11, 20}}); + auto lhs = RowSelection::MakeIntervals({{0, 11}}); + auto rhs = RowSelection::MakeIntervals({{11, 10}}); auto result = RowSelection::Union(lhs, rhs); EXPECT_EQ(result.row_count(), 21); // [0, 20] = 21 rows @@ -328,18 +328,18 @@ TEST(RowSelection, UnionAdjacent) { // Should merge adjacent ranges auto iter = result.NewIterator(); auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - auto interval = std::get(range); + ASSERT_TRUE(range.has_value()); + auto interval = range.value(); EXPECT_EQ(interval.start, 0); - EXPECT_EQ(interval.end, 20); + EXPECT_EQ((interval.start + interval.length - 1), 20); range = iter->NextRange(); - EXPECT_TRUE(std::holds_alternative(range)); + EXPECT_TRUE(!range.has_value()); } TEST(RowSelection, UnionWithGap) { - auto lhs = RowSelection::MakeIntervals({{0, 10}}); - auto rhs = RowSelection::MakeIntervals({{20, 30}}); + auto lhs = RowSelection::MakeIntervals({{0, 11}}); + auto rhs = RowSelection::MakeIntervals({{20, 11}}); auto result = RowSelection::Union(lhs, rhs); EXPECT_EQ(result.row_count(), 22); @@ -348,20 +348,20 @@ TEST(RowSelection, UnionWithGap) { auto iter = result.NewIterator(); auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - auto interval = std::get(range); + ASSERT_TRUE(range.has_value()); + auto interval = range.value(); EXPECT_EQ(interval.start, 0); - EXPECT_EQ(interval.end, 10); + EXPECT_EQ((interval.start + interval.length - 1), 10); range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - interval = std::get(range); + ASSERT_TRUE(range.has_value()); + interval = range.value(); EXPECT_EQ(interval.start, 20); - EXPECT_EQ(interval.end, 30); + EXPECT_EQ((interval.start + interval.length - 1), 30); } TEST(RowSelection, UnionWithEmpty) { - auto lhs = RowSelection::MakeIntervals({{0, 10}}); + auto lhs = RowSelection::MakeIntervals({{0, 11}}); auto rhs = RowSelection::MakeIntervals({}); auto result = RowSelection::Union(lhs, rhs); @@ -370,15 +370,15 @@ TEST(RowSelection, UnionWithEmpty) { TEST(RowSelection, UnionEmptyWithNonEmpty) { auto lhs = RowSelection::MakeIntervals({}); - auto rhs = RowSelection::MakeIntervals({{0, 10}}); + auto rhs = RowSelection::MakeIntervals({{0, 11}}); auto result = RowSelection::Union(lhs, rhs); EXPECT_EQ(result.row_count(), 11); } TEST(RowSelection, UnionIdentical) { - auto lhs = RowSelection::MakeIntervals({{0, 10}, {20, 30}}); - auto rhs = RowSelection::MakeIntervals({{0, 10}, {20, 30}}); + auto lhs = RowSelection::MakeIntervals({{0, 11}, {20, 11}}); + auto rhs = RowSelection::MakeIntervals({{0, 11}, {20, 11}}); auto result = RowSelection::Union(lhs, rhs); EXPECT_EQ(result.row_count(), 22); @@ -387,18 +387,18 @@ TEST(RowSelection, UnionIdentical) { auto iter = result.NewIterator(); auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); + ASSERT_TRUE(range.has_value()); range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); + ASSERT_TRUE(range.has_value()); range = iter->NextRange(); - EXPECT_TRUE(std::holds_alternative(range)); + EXPECT_TRUE(!range.has_value()); } TEST(RowSelection, UnionComplex) { - auto lhs = RowSelection::MakeIntervals({{0, 10}, {20, 30}, {50, 60}}); - auto rhs = RowSelection::MakeIntervals({{5, 15}, {25, 35}, {45, 55}}); + auto lhs = RowSelection::MakeIntervals({{0, 11}, {20, 11}, {50, 11}}); + auto rhs = RowSelection::MakeIntervals({{5, 11}, {25, 11}, {45, 11}}); auto result = RowSelection::Union(lhs, rhs); @@ -408,42 +408,42 @@ TEST(RowSelection, UnionComplex) { auto iter = result.NewIterator(); auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - auto interval = std::get(range); + ASSERT_TRUE(range.has_value()); + auto interval = range.value(); EXPECT_EQ(interval.start, 0); - EXPECT_EQ(interval.end, 15); + EXPECT_EQ((interval.start + interval.length - 1), 15); range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - interval = std::get(range); + ASSERT_TRUE(range.has_value()); + interval = range.value(); EXPECT_EQ(interval.start, 20); - EXPECT_EQ(interval.end, 35); + EXPECT_EQ((interval.start + interval.length - 1), 35); range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - interval = std::get(range); + ASSERT_TRUE(range.has_value()); + interval = range.value(); EXPECT_EQ(interval.start, 45); - EXPECT_EQ(interval.end, 60); + EXPECT_EQ((interval.start + interval.length - 1), 60); } TEST(RowSelection, UnionManyOverlapping) { - auto lhs = RowSelection::MakeIntervals({{0, 100}}); - auto rhs = RowSelection::MakeIntervals({{50, 150}}); + auto lhs = RowSelection::MakeIntervals({{0, 101}}); + auto rhs = RowSelection::MakeIntervals({{50, 101}}); auto result = RowSelection::Union(lhs, rhs); EXPECT_EQ(result.row_count(), 151); // [0, 150] auto iter = result.NewIterator(); auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - auto interval = std::get(range); + ASSERT_TRUE(range.has_value()); + auto interval = range.value(); EXPECT_EQ(interval.start, 0); - EXPECT_EQ(interval.end, 150); + EXPECT_EQ((interval.start + interval.length - 1), 150); } // Test iterator behavior TEST(RowSelection, IteratorMultipleIterators) { - auto ranges = RowSelection::MakeIntervals({{0, 10}, {20, 30}}); + auto ranges = RowSelection::MakeIntervals({{0, 11}, {20, 11}}); auto iter1 = ranges.NewIterator(); auto iter2 = ranges.NewIterator(); @@ -452,14 +452,14 @@ TEST(RowSelection, IteratorMultipleIterators) { auto range1 = iter1->NextRange(); auto range2 = iter2->NextRange(); - ASSERT_TRUE(std::holds_alternative(range1)); - ASSERT_TRUE(std::holds_alternative(range2)); + ASSERT_TRUE(range1.has_value()); + ASSERT_TRUE(range2.has_value()); - auto interval1 = std::get(range1); - auto interval2 = std::get(range2); + auto interval1 = range1.value(); + auto interval2 = range2.value(); EXPECT_EQ(interval1.start, interval2.start); - EXPECT_EQ(interval1.end, interval2.end); + EXPECT_EQ((interval1.start + interval1.length - 1), (interval2.start + interval2.length - 1)); } TEST(RowSelection, IteratorExhaustion) { @@ -468,14 +468,14 @@ TEST(RowSelection, IteratorExhaustion) { // First call returns the range auto range = iter->NextRange(); - EXPECT_TRUE(std::holds_alternative(range)); + EXPECT_TRUE(range.has_value()); // Subsequent calls should return End range = iter->NextRange(); - EXPECT_TRUE(std::holds_alternative(range)); + EXPECT_TRUE(!range.has_value()); range = iter->NextRange(); - EXPECT_TRUE(std::holds_alternative(range)); + EXPECT_TRUE(!range.has_value()); } // Test edge cases @@ -491,15 +491,15 @@ TEST(RowSelection, ZeroStartRange) { auto iter = ranges.NewIterator(); auto range = iter->NextRange(); - ASSERT_TRUE(std::holds_alternative(range)); - auto interval = std::get(range); + ASSERT_TRUE(range.has_value()); + auto interval = range.value(); EXPECT_EQ(interval.start, 0); - EXPECT_EQ(interval.end, 0); + EXPECT_EQ((interval.start + interval.length - 1), 0); } TEST(RowSelection, IntersectAndUnionCommutative) { - auto lhs = RowSelection::MakeIntervals({{0, 10}, {20, 30}}); - auto rhs = RowSelection::MakeIntervals({{5, 15}, {25, 35}}); + auto lhs = RowSelection::MakeIntervals({{0, 11}, {20, 11}}); + auto rhs = RowSelection::MakeIntervals({{5, 11}, {25, 11}}); // Intersect should be commutative auto intersect1 = RowSelection::Intersect(lhs, rhs); From 44bece9462c014abfcd9862c891a96a58cfeb6cd Mon Sep 17 00:00:00 2001 From: Samuel Wein Date: Mon, 22 Dec 2025 10:55:01 -0500 Subject: [PATCH 08/11] use span for iterator api --- cpp/src/parquet/row_selection.cc | 23 ++-- cpp/src/parquet/row_selection.h | 9 +- cpp/src/parquet/row_selection_test.cc | 172 +++++++++++++------------- 3 files changed, 106 insertions(+), 98 deletions(-) diff --git a/cpp/src/parquet/row_selection.cc b/cpp/src/parquet/row_selection.cc index f61d4d449a3..30985b6d2cf 100644 --- a/cpp/src/parquet/row_selection.cc +++ b/cpp/src/parquet/row_selection.cc @@ -25,21 +25,28 @@ namespace parquet { class IteratorImpl : public RowSelection::Iterator { public: - explicit IteratorImpl(const RowSelection& ranges) - : iter_(ranges.ranges_.cbegin()), end_(ranges.ranges_.cend()) {} + explicit IteratorImpl(const RowSelection& ranges, size_t batch_size = 1) + : ranges_(ranges.ranges_), index_(0), batch_size_(batch_size) {} ~IteratorImpl() override = default; - std::optional NextRange() override { - if (iter_ == end_) { - return std::nullopt; + ::arrow::util::span NextRanges() override { + if (index_ >= ranges_.size()) { + return {}; } - return *iter_++; + // Return up to batch_size_ ranges + size_t remaining = ranges_.size() - index_; + size_t count = std::min(batch_size_, remaining); + auto result = ::arrow::util::span( + ranges_.data() + index_, count); + index_ += count; + return result; } private: - decltype(RowSelection::ranges_.cbegin()) iter_; - decltype(RowSelection::ranges_.cend()) end_; + const std::vector& ranges_; + size_t index_; + size_t batch_size_; }; std::unique_ptr RowSelection::NewIterator() const { diff --git a/cpp/src/parquet/row_selection.h b/cpp/src/parquet/row_selection.h index 1d15da64db2..dcaf0796251 100644 --- a/cpp/src/parquet/row_selection.h +++ b/cpp/src/parquet/row_selection.h @@ -18,9 +18,9 @@ #pragma once #include -#include #include +#include "arrow/util/span.h" #include "parquet/platform.h" namespace parquet { @@ -36,12 +36,13 @@ class PARQUET_EXPORT RowSelection { int64_t length; }; - /// \brief EXPERIMENTAL: An iterator for accessing row ranges in order. + /// \brief EXPERIMENTAL: An iterator for accessing row ranges in batches. class Iterator { public: virtual ~Iterator() = default; - /// \brief Get the next range. Returns std::nullopt when exhausted. - virtual std::optional NextRange() = 0; + /// \brief Get the next batch of ranges. + /// Returns an empty span when exhausted. + virtual ::arrow::util::span NextRanges() = 0; }; /// \brief EXPERIMENTAL: Create a new iterator for accessing row ranges in order. diff --git a/cpp/src/parquet/row_selection_test.cc b/cpp/src/parquet/row_selection_test.cc index 2530a6e5682..255f3a5b1fd 100644 --- a/cpp/src/parquet/row_selection_test.cc +++ b/cpp/src/parquet/row_selection_test.cc @@ -30,16 +30,16 @@ TEST(RowSelection, MakeSingleWithCount) { ASSERT_EQ(ranges.row_count(), 100); auto iter = ranges.NewIterator(); - auto range = iter->NextRange(); - ASSERT_TRUE(range.has_value()); + auto batch = iter->NextRanges(); + ASSERT_FALSE(batch.empty()); - auto interval = range.value(); + auto interval = batch[0]; EXPECT_EQ(interval.start, 0); EXPECT_EQ((interval.start + interval.length - 1), 99); // Should be exhausted - range = iter->NextRange(); - EXPECT_TRUE(!range.has_value()); + batch = iter->NextRanges(); + EXPECT_TRUE(batch.empty()); } TEST(RowSelection, MakeSingleWithStartEnd) { @@ -47,15 +47,15 @@ TEST(RowSelection, MakeSingleWithStartEnd) { ASSERT_EQ(ranges.row_count(), 11); auto iter = ranges.NewIterator(); - auto range = iter->NextRange(); - ASSERT_TRUE(range.has_value()); + auto batch = iter->NextRanges(); + ASSERT_FALSE(batch.empty()); - auto interval = range.value(); + auto interval = batch[0]; EXPECT_EQ(interval.start, 10); EXPECT_EQ((interval.start + interval.length - 1), 20); - range = iter->NextRange(); - EXPECT_TRUE(!range.has_value()); + batch = iter->NextRanges(); + EXPECT_TRUE(batch.empty()); } TEST(RowSelection, MakeIntervals) { @@ -71,29 +71,29 @@ TEST(RowSelection, MakeIntervals) { auto iter = ranges.NewIterator(); // First interval - auto range = iter->NextRange(); - ASSERT_TRUE(range.has_value()); - auto interval = range.value(); + auto batch = iter->NextRanges(); + ASSERT_FALSE(batch.empty()); + auto interval = batch[0]; EXPECT_EQ(interval.start, 0); EXPECT_EQ((interval.start + interval.length - 1), 10); // Second interval - range = iter->NextRange(); - ASSERT_TRUE(range.has_value()); - interval = range.value(); + batch = iter->NextRanges(); + ASSERT_FALSE(batch.empty()); + interval = batch[0]; EXPECT_EQ(interval.start, 20); EXPECT_EQ((interval.start + interval.length - 1), 30); // Third interval - range = iter->NextRange(); - ASSERT_TRUE(range.has_value()); - interval = range.value(); + batch = iter->NextRanges(); + ASSERT_FALSE(batch.empty()); + interval = batch[0]; EXPECT_EQ(interval.start, 40); EXPECT_EQ((interval.start + interval.length - 1), 50); // Exhausted - range = iter->NextRange(); - EXPECT_TRUE(!range.has_value()); + batch = iter->NextRanges(); + EXPECT_TRUE(batch.empty()); } TEST(RowSelection, EmptyRanges) { @@ -102,8 +102,8 @@ TEST(RowSelection, EmptyRanges) { ASSERT_EQ(ranges.row_count(), 0); auto iter = ranges.NewIterator(); - auto range = iter->NextRange(); - EXPECT_TRUE(!range.has_value()); + auto batch = iter->NextRanges(); + EXPECT_TRUE(batch.empty()); } // Test validation @@ -218,15 +218,15 @@ TEST(RowSelection, IntersectCompleteOverlap) { auto iter = result.NewIterator(); - auto range = iter->NextRange(); - ASSERT_TRUE(range.has_value()); - auto interval = range.value(); + auto batch = iter->NextRanges(); + ASSERT_FALSE(batch.empty()); + auto interval = batch[0]; EXPECT_EQ(interval.start, 20); EXPECT_EQ((interval.start + interval.length - 1), 30); - range = iter->NextRange(); - ASSERT_TRUE(range.has_value()); - interval = range.value(); + batch = iter->NextRanges(); + ASSERT_FALSE(batch.empty()); + interval = batch[0]; EXPECT_EQ(interval.start, 40); EXPECT_EQ((interval.start + interval.length - 1), 50); } @@ -240,15 +240,15 @@ TEST(RowSelection, IntersectPartialOverlap) { auto iter = result.NewIterator(); - auto range = iter->NextRange(); - ASSERT_TRUE(range.has_value()); - auto interval = range.value(); + auto batch = iter->NextRanges(); + ASSERT_FALSE(batch.empty()); + auto interval = batch[0]; EXPECT_EQ(interval.start, 10); EXPECT_EQ((interval.start + interval.length - 1), 15); - range = iter->NextRange(); - ASSERT_TRUE(range.has_value()); - interval = range.value(); + batch = iter->NextRanges(); + ASSERT_FALSE(batch.empty()); + interval = batch[0]; EXPECT_EQ(interval.start, 20); EXPECT_EQ((interval.start + interval.length - 1), 25); } @@ -295,12 +295,12 @@ TEST(RowSelection, UnionNoOverlap) { // Should have 4 separate ranges for (int i = 0; i < 4; ++i) { - auto range = iter->NextRange(); - EXPECT_TRUE(range.has_value()); + auto batch = iter->NextRanges(); + EXPECT_TRUE(batch.size() > 0); } - auto range = iter->NextRange(); - EXPECT_TRUE(!range.has_value()); + auto batch = iter->NextRanges(); + EXPECT_TRUE(batch.empty()); } TEST(RowSelection, UnionWithOverlap) { @@ -311,9 +311,9 @@ TEST(RowSelection, UnionWithOverlap) { EXPECT_EQ(result.row_count(), 26); // [0, 25] = 26 rows auto iter = result.NewIterator(); - auto range = iter->NextRange(); - ASSERT_TRUE(range.has_value()); - auto interval = range.value(); + auto batch = iter->NextRanges(); + ASSERT_FALSE(batch.empty()); + auto interval = batch[0]; EXPECT_EQ(interval.start, 0); EXPECT_EQ((interval.start + interval.length - 1), 25); } @@ -327,14 +327,14 @@ TEST(RowSelection, UnionAdjacent) { // Should merge adjacent ranges auto iter = result.NewIterator(); - auto range = iter->NextRange(); - ASSERT_TRUE(range.has_value()); - auto interval = range.value(); + auto batch = iter->NextRanges(); + ASSERT_FALSE(batch.empty()); + auto interval = batch[0]; EXPECT_EQ(interval.start, 0); EXPECT_EQ((interval.start + interval.length - 1), 20); - range = iter->NextRange(); - EXPECT_TRUE(!range.has_value()); + batch = iter->NextRanges(); + EXPECT_TRUE(batch.empty()); } TEST(RowSelection, UnionWithGap) { @@ -347,15 +347,15 @@ TEST(RowSelection, UnionWithGap) { // Should have 2 ranges auto iter = result.NewIterator(); - auto range = iter->NextRange(); - ASSERT_TRUE(range.has_value()); - auto interval = range.value(); + auto batch = iter->NextRanges(); + ASSERT_FALSE(batch.empty()); + auto interval = batch[0]; EXPECT_EQ(interval.start, 0); EXPECT_EQ((interval.start + interval.length - 1), 10); - range = iter->NextRange(); - ASSERT_TRUE(range.has_value()); - interval = range.value(); + batch = iter->NextRanges(); + ASSERT_FALSE(batch.empty()); + interval = batch[0]; EXPECT_EQ(interval.start, 20); EXPECT_EQ((interval.start + interval.length - 1), 30); } @@ -386,14 +386,14 @@ TEST(RowSelection, UnionIdentical) { // Should still have 2 ranges (merged) auto iter = result.NewIterator(); - auto range = iter->NextRange(); - ASSERT_TRUE(range.has_value()); + auto batch = iter->NextRanges(); + ASSERT_FALSE(batch.empty()); - range = iter->NextRange(); - ASSERT_TRUE(range.has_value()); + batch = iter->NextRanges(); + ASSERT_FALSE(batch.empty()); - range = iter->NextRange(); - EXPECT_TRUE(!range.has_value()); + batch = iter->NextRanges(); + EXPECT_TRUE(batch.empty()); } TEST(RowSelection, UnionComplex) { @@ -407,21 +407,21 @@ TEST(RowSelection, UnionComplex) { auto iter = result.NewIterator(); - auto range = iter->NextRange(); - ASSERT_TRUE(range.has_value()); - auto interval = range.value(); + auto batch = iter->NextRanges(); + ASSERT_FALSE(batch.empty()); + auto interval = batch[0]; EXPECT_EQ(interval.start, 0); EXPECT_EQ((interval.start + interval.length - 1), 15); - range = iter->NextRange(); - ASSERT_TRUE(range.has_value()); - interval = range.value(); + batch = iter->NextRanges(); + ASSERT_FALSE(batch.empty()); + interval = batch[0]; EXPECT_EQ(interval.start, 20); EXPECT_EQ((interval.start + interval.length - 1), 35); - range = iter->NextRange(); - ASSERT_TRUE(range.has_value()); - interval = range.value(); + batch = iter->NextRanges(); + ASSERT_FALSE(batch.empty()); + interval = batch[0]; EXPECT_EQ(interval.start, 45); EXPECT_EQ((interval.start + interval.length - 1), 60); } @@ -434,9 +434,9 @@ TEST(RowSelection, UnionManyOverlapping) { EXPECT_EQ(result.row_count(), 151); // [0, 150] auto iter = result.NewIterator(); - auto range = iter->NextRange(); - ASSERT_TRUE(range.has_value()); - auto interval = range.value(); + auto batch = iter->NextRanges(); + ASSERT_FALSE(batch.empty()); + auto interval = batch[0]; EXPECT_EQ(interval.start, 0); EXPECT_EQ((interval.start + interval.length - 1), 150); } @@ -449,14 +449,14 @@ TEST(RowSelection, IteratorMultipleIterators) { auto iter2 = ranges.NewIterator(); // Both iterators should work independently - auto range1 = iter1->NextRange(); - auto range2 = iter2->NextRange(); + auto batch1 = iter1->NextRanges(); + auto batch2 = iter2->NextRanges(); - ASSERT_TRUE(range1.has_value()); - ASSERT_TRUE(range2.has_value()); + ASSERT_FALSE(batch1.empty()); + ASSERT_FALSE(batch2.empty()); - auto interval1 = range1.value(); - auto interval2 = range2.value(); + auto interval1 = batch1[0]; + auto interval2 = batch2[0]; EXPECT_EQ(interval1.start, interval2.start); EXPECT_EQ((interval1.start + interval1.length - 1), (interval2.start + interval2.length - 1)); @@ -467,15 +467,15 @@ TEST(RowSelection, IteratorExhaustion) { auto iter = ranges.NewIterator(); // First call returns the range - auto range = iter->NextRange(); - EXPECT_TRUE(range.has_value()); + auto batch = iter->NextRanges(); + EXPECT_TRUE(batch.size() > 0); // Subsequent calls should return End - range = iter->NextRange(); - EXPECT_TRUE(!range.has_value()); + batch = iter->NextRanges(); + EXPECT_TRUE(batch.empty()); - range = iter->NextRange(); - EXPECT_TRUE(!range.has_value()); + batch = iter->NextRanges(); + EXPECT_TRUE(batch.empty()); } // Test edge cases @@ -490,9 +490,9 @@ TEST(RowSelection, ZeroStartRange) { EXPECT_EQ(ranges.row_count(), 1); auto iter = ranges.NewIterator(); - auto range = iter->NextRange(); - ASSERT_TRUE(range.has_value()); - auto interval = range.value(); + auto batch = iter->NextRanges(); + ASSERT_FALSE(batch.empty()); + auto interval = batch[0]; EXPECT_EQ(interval.start, 0); EXPECT_EQ((interval.start + interval.length - 1), 0); } From ace365382f4db8dd704b2d806a9133ee446efb95 Mon Sep 17 00:00:00 2001 From: Samuel Wein Date: Mon, 22 Dec 2025 11:02:08 -0500 Subject: [PATCH 09/11] Rename MakeIntervals to FromIntervals --- cpp/src/parquet/row_selection.cc | 8 ++- cpp/src/parquet/row_selection.h | 5 +- cpp/src/parquet/row_selection_test.cc | 88 +++++++++++++-------------- 3 files changed, 54 insertions(+), 47 deletions(-) diff --git a/cpp/src/parquet/row_selection.cc b/cpp/src/parquet/row_selection.cc index 30985b6d2cf..a7648a27516 100644 --- a/cpp/src/parquet/row_selection.cc +++ b/cpp/src/parquet/row_selection.cc @@ -179,11 +179,15 @@ RowSelection RowSelection::MakeSingle(int64_t start, int64_t end) { return rowSelection; } -RowSelection RowSelection::MakeIntervals(const std::vector& intervals) { +RowSelection RowSelection::FromIntervals(::arrow::util::span intervals) { RowSelection rowSelection; rowSelection.ranges_.reserve(intervals.size()); - rowSelection.ranges_.insert(rowSelection.ranges_.end(), intervals.cbegin(), intervals.cend()); + rowSelection.ranges_.insert(rowSelection.ranges_.end(), intervals.begin(), intervals.end()); return rowSelection; } +RowSelection RowSelection::FromIntervals(const std::vector& intervals) { + return FromIntervals(::arrow::util::span(intervals)); +} + } // namespace parquet \ No newline at end of file diff --git a/cpp/src/parquet/row_selection.h b/cpp/src/parquet/row_selection.h index dcaf0796251..539d38a7743 100644 --- a/cpp/src/parquet/row_selection.h +++ b/cpp/src/parquet/row_selection.h @@ -69,7 +69,10 @@ class PARQUET_EXPORT RowSelection { static RowSelection MakeSingle(int64_t start, int64_t end); /// \brief EXPERIMENTAL: Make a row range from a list of intervals. - static RowSelection MakeIntervals(const std::vector& intervals); + static RowSelection FromIntervals(::arrow::util::span intervals); + + /// \brief EXPERIMENTAL: Make a row range from a vector of intervals. + static RowSelection FromIntervals(const std::vector& intervals); private: friend class IteratorImpl; diff --git a/cpp/src/parquet/row_selection_test.cc b/cpp/src/parquet/row_selection_test.cc index 255f3a5b1fd..1ea4917a670 100644 --- a/cpp/src/parquet/row_selection_test.cc +++ b/cpp/src/parquet/row_selection_test.cc @@ -58,14 +58,14 @@ TEST(RowSelection, MakeSingleWithStartEnd) { EXPECT_TRUE(batch.empty()); } -TEST(RowSelection, MakeIntervals) { +TEST(RowSelection, FromIntervals) { std::vector intervals = { {0, 11}, {20, 11}, {40, 11} }; - auto ranges = RowSelection::MakeIntervals(intervals); + auto ranges = RowSelection::FromIntervals(intervals); ASSERT_EQ(ranges.row_count(), 33); // 11 + 11 + 11 auto iter = ranges.NewIterator(); @@ -98,7 +98,7 @@ TEST(RowSelection, MakeIntervals) { TEST(RowSelection, EmptyRanges) { std::vector intervals; - auto ranges = RowSelection::MakeIntervals(intervals); + auto ranges = RowSelection::FromIntervals(intervals); ASSERT_EQ(ranges.row_count(), 0); auto iter = ranges.NewIterator(); @@ -114,7 +114,7 @@ TEST(RowSelection, ValidateValidRanges) { {25, 6} }; - auto ranges = RowSelection::MakeIntervals(intervals); + auto ranges = RowSelection::FromIntervals(intervals); EXPECT_NO_THROW(ranges.Validate()); } @@ -129,7 +129,7 @@ TEST(RowSelection, ValidateOverlappingRanges) { {5, 11} // Overlaps with previous }; - auto ranges = RowSelection::MakeIntervals(intervals); + auto ranges = RowSelection::FromIntervals(intervals); EXPECT_THROW(ranges.Validate(), ParquetException); } @@ -139,7 +139,7 @@ TEST(RowSelection, ValidateAdjacentRanges) { {11, 10} // Adjacent but not overlapping }; - auto ranges = RowSelection::MakeIntervals(intervals); + auto ranges = RowSelection::FromIntervals(intervals); EXPECT_NO_THROW(ranges.Validate()); } @@ -149,7 +149,7 @@ TEST(RowSelection, ValidateInvalidRangeTouching) { {10, 11} // Touches at end/start (overlaps at 10) }; - auto ranges = RowSelection::MakeIntervals(intervals); + auto ranges = RowSelection::FromIntervals(intervals); EXPECT_THROW(ranges.Validate(), ParquetException); } @@ -159,7 +159,7 @@ TEST(RowSelection, ValidateNotAscendingOrder) { {0, 11} // Not in ascending order }; - auto ranges = RowSelection::MakeIntervals(intervals); + auto ranges = RowSelection::FromIntervals(intervals); EXPECT_THROW(ranges.Validate(), ParquetException); } @@ -168,7 +168,7 @@ TEST(RowSelection, ValidateInvalidInterval) { {10, -4} // end < start }; - auto ranges = RowSelection::MakeIntervals(intervals); + auto ranges = RowSelection::FromIntervals(intervals); EXPECT_THROW(ranges.Validate(), ParquetException); } @@ -185,13 +185,13 @@ TEST(RowSelection, RowCountMultiple) { {50, 5} // 5 rows }; - auto ranges = RowSelection::MakeIntervals(intervals); + auto ranges = RowSelection::FromIntervals(intervals); EXPECT_EQ(ranges.row_count(), 25); } TEST(RowSelection, RowCountEmpty) { std::vector intervals; - auto ranges = RowSelection::MakeIntervals(intervals); + auto ranges = RowSelection::FromIntervals(intervals); EXPECT_EQ(ranges.row_count(), 0); } @@ -202,16 +202,16 @@ TEST(RowSelection, RowCountSingleRow) { // Test Intersect TEST(RowSelection, IntersectNoOverlap) { - auto lhs = RowSelection::MakeIntervals({{0, 11}, {20, 11}}); - auto rhs = RowSelection::MakeIntervals({{40, 11}, {60, 11}}); + auto lhs = RowSelection::FromIntervals({{0, 11}, {20, 11}}); + auto rhs = RowSelection::FromIntervals({{40, 11}, {60, 11}}); auto result = RowSelection::Intersect(lhs, rhs); EXPECT_EQ(result.row_count(), 0); } TEST(RowSelection, IntersectCompleteOverlap) { - auto lhs = RowSelection::MakeIntervals({{0, 101}}); - auto rhs = RowSelection::MakeIntervals({{20, 11}, {40, 11}}); + auto lhs = RowSelection::FromIntervals({{0, 101}}); + auto rhs = RowSelection::FromIntervals({{20, 11}, {40, 11}}); auto result = RowSelection::Intersect(lhs, rhs); EXPECT_EQ(result.row_count(), 22); // (30-20+1) + (50-40+1) @@ -232,8 +232,8 @@ TEST(RowSelection, IntersectCompleteOverlap) { } TEST(RowSelection, IntersectPartialOverlap) { - auto lhs = RowSelection::MakeIntervals({{0, 16}, {20, 16}}); - auto rhs = RowSelection::MakeIntervals({{10, 16}, {40, 11}}); + auto lhs = RowSelection::FromIntervals({{0, 16}, {20, 16}}); + auto rhs = RowSelection::FromIntervals({{10, 16}, {40, 11}}); auto result = RowSelection::Intersect(lhs, rhs); EXPECT_EQ(result.row_count(), 12); // (15-10+1) + (25-20+1) @@ -254,24 +254,24 @@ TEST(RowSelection, IntersectPartialOverlap) { } TEST(RowSelection, IntersectIdentical) { - auto lhs = RowSelection::MakeIntervals({{0, 11}, {20, 11}}); - auto rhs = RowSelection::MakeIntervals({{0, 11}, {20, 11}}); + auto lhs = RowSelection::FromIntervals({{0, 11}, {20, 11}}); + auto rhs = RowSelection::FromIntervals({{0, 11}, {20, 11}}); auto result = RowSelection::Intersect(lhs, rhs); EXPECT_EQ(result.row_count(), 22); } TEST(RowSelection, IntersectWithEmpty) { - auto lhs = RowSelection::MakeIntervals({{0, 11}}); - auto rhs = RowSelection::MakeIntervals({}); + auto lhs = RowSelection::FromIntervals({{0, 11}}); + auto rhs = RowSelection::FromIntervals(std::vector{}); auto result = RowSelection::Intersect(lhs, rhs); EXPECT_EQ(result.row_count(), 0); } TEST(RowSelection, IntersectComplex) { - auto lhs = RowSelection::MakeIntervals({{0, 11}, {15, 11}, {30, 11}, {50, 11}}); - auto rhs = RowSelection::MakeIntervals({{5, 8}, {20, 16}, {55, 16}}); + auto lhs = RowSelection::FromIntervals({{0, 11}, {15, 11}, {30, 11}, {50, 11}}); + auto rhs = RowSelection::FromIntervals({{5, 8}, {20, 16}, {55, 16}}); auto result = RowSelection::Intersect(lhs, rhs); @@ -285,8 +285,8 @@ TEST(RowSelection, IntersectComplex) { // Test Union TEST(RowSelection, UnionNoOverlap) { - auto lhs = RowSelection::MakeIntervals({{0, 11}, {20, 11}}); - auto rhs = RowSelection::MakeIntervals({{40, 11}, {60, 11}}); + auto lhs = RowSelection::FromIntervals({{0, 11}, {20, 11}}); + auto rhs = RowSelection::FromIntervals({{40, 11}, {60, 11}}); auto result = RowSelection::Union(lhs, rhs); EXPECT_EQ(result.row_count(), 44); // 11+11+11+11 @@ -304,8 +304,8 @@ TEST(RowSelection, UnionNoOverlap) { } TEST(RowSelection, UnionWithOverlap) { - auto lhs = RowSelection::MakeIntervals({{0, 16}}); - auto rhs = RowSelection::MakeIntervals({{10, 16}}); + auto lhs = RowSelection::FromIntervals({{0, 16}}); + auto rhs = RowSelection::FromIntervals({{10, 16}}); auto result = RowSelection::Union(lhs, rhs); EXPECT_EQ(result.row_count(), 26); // [0, 25] = 26 rows @@ -319,8 +319,8 @@ TEST(RowSelection, UnionWithOverlap) { } TEST(RowSelection, UnionAdjacent) { - auto lhs = RowSelection::MakeIntervals({{0, 11}}); - auto rhs = RowSelection::MakeIntervals({{11, 10}}); + auto lhs = RowSelection::FromIntervals({{0, 11}}); + auto rhs = RowSelection::FromIntervals({{11, 10}}); auto result = RowSelection::Union(lhs, rhs); EXPECT_EQ(result.row_count(), 21); // [0, 20] = 21 rows @@ -338,8 +338,8 @@ TEST(RowSelection, UnionAdjacent) { } TEST(RowSelection, UnionWithGap) { - auto lhs = RowSelection::MakeIntervals({{0, 11}}); - auto rhs = RowSelection::MakeIntervals({{20, 11}}); + auto lhs = RowSelection::FromIntervals({{0, 11}}); + auto rhs = RowSelection::FromIntervals({{20, 11}}); auto result = RowSelection::Union(lhs, rhs); EXPECT_EQ(result.row_count(), 22); @@ -361,24 +361,24 @@ TEST(RowSelection, UnionWithGap) { } TEST(RowSelection, UnionWithEmpty) { - auto lhs = RowSelection::MakeIntervals({{0, 11}}); - auto rhs = RowSelection::MakeIntervals({}); + auto lhs = RowSelection::FromIntervals({{0, 11}}); + auto rhs = RowSelection::FromIntervals(std::vector{}); auto result = RowSelection::Union(lhs, rhs); EXPECT_EQ(result.row_count(), 11); } TEST(RowSelection, UnionEmptyWithNonEmpty) { - auto lhs = RowSelection::MakeIntervals({}); - auto rhs = RowSelection::MakeIntervals({{0, 11}}); + auto lhs = RowSelection::FromIntervals(std::vector{}); + auto rhs = RowSelection::FromIntervals({{0, 11}}); auto result = RowSelection::Union(lhs, rhs); EXPECT_EQ(result.row_count(), 11); } TEST(RowSelection, UnionIdentical) { - auto lhs = RowSelection::MakeIntervals({{0, 11}, {20, 11}}); - auto rhs = RowSelection::MakeIntervals({{0, 11}, {20, 11}}); + auto lhs = RowSelection::FromIntervals({{0, 11}, {20, 11}}); + auto rhs = RowSelection::FromIntervals({{0, 11}, {20, 11}}); auto result = RowSelection::Union(lhs, rhs); EXPECT_EQ(result.row_count(), 22); @@ -397,8 +397,8 @@ TEST(RowSelection, UnionIdentical) { } TEST(RowSelection, UnionComplex) { - auto lhs = RowSelection::MakeIntervals({{0, 11}, {20, 11}, {50, 11}}); - auto rhs = RowSelection::MakeIntervals({{5, 11}, {25, 11}, {45, 11}}); + auto lhs = RowSelection::FromIntervals({{0, 11}, {20, 11}, {50, 11}}); + auto rhs = RowSelection::FromIntervals({{5, 11}, {25, 11}, {45, 11}}); auto result = RowSelection::Union(lhs, rhs); @@ -427,8 +427,8 @@ TEST(RowSelection, UnionComplex) { } TEST(RowSelection, UnionManyOverlapping) { - auto lhs = RowSelection::MakeIntervals({{0, 101}}); - auto rhs = RowSelection::MakeIntervals({{50, 101}}); + auto lhs = RowSelection::FromIntervals({{0, 101}}); + auto rhs = RowSelection::FromIntervals({{50, 101}}); auto result = RowSelection::Union(lhs, rhs); EXPECT_EQ(result.row_count(), 151); // [0, 150] @@ -443,7 +443,7 @@ TEST(RowSelection, UnionManyOverlapping) { // Test iterator behavior TEST(RowSelection, IteratorMultipleIterators) { - auto ranges = RowSelection::MakeIntervals({{0, 11}, {20, 11}}); + auto ranges = RowSelection::FromIntervals({{0, 11}, {20, 11}}); auto iter1 = ranges.NewIterator(); auto iter2 = ranges.NewIterator(); @@ -498,8 +498,8 @@ TEST(RowSelection, ZeroStartRange) { } TEST(RowSelection, IntersectAndUnionCommutative) { - auto lhs = RowSelection::MakeIntervals({{0, 11}, {20, 11}}); - auto rhs = RowSelection::MakeIntervals({{5, 11}, {25, 11}}); + auto lhs = RowSelection::FromIntervals({{0, 11}, {20, 11}}); + auto rhs = RowSelection::FromIntervals({{5, 11}, {25, 11}}); // Intersect should be commutative auto intersect1 = RowSelection::Intersect(lhs, rhs); From a979901124eafacf09da2a4b25b20596d9d459be Mon Sep 17 00:00:00 2001 From: Samuel Wein Date: Mon, 22 Dec 2025 11:07:21 -0500 Subject: [PATCH 10/11] remove duplicate MakeSingle --- cpp/src/parquet/row_selection.cc | 6 ------ cpp/src/parquet/row_selection.h | 3 --- cpp/src/parquet/row_selection_test.cc | 8 ++++---- 3 files changed, 4 insertions(+), 13 deletions(-) diff --git a/cpp/src/parquet/row_selection.cc b/cpp/src/parquet/row_selection.cc index a7648a27516..0228ba2b385 100644 --- a/cpp/src/parquet/row_selection.cc +++ b/cpp/src/parquet/row_selection.cc @@ -167,12 +167,6 @@ RowSelection RowSelection::Union(const RowSelection& lhs, const RowSelection& rh return result; } -RowSelection RowSelection::MakeSingle(int64_t row_count) { - RowSelection rowSelection; - rowSelection.ranges_.push_back(IntervalRange{0, row_count}); - return rowSelection; -} - RowSelection RowSelection::MakeSingle(int64_t start, int64_t end) { RowSelection rowSelection; rowSelection.ranges_.push_back(IntervalRange{start, end - start + 1}); diff --git a/cpp/src/parquet/row_selection.h b/cpp/src/parquet/row_selection.h index 539d38a7743..14df8b97071 100644 --- a/cpp/src/parquet/row_selection.h +++ b/cpp/src/parquet/row_selection.h @@ -62,9 +62,6 @@ class PARQUET_EXPORT RowSelection { /// \brief EXPERIMENTAL: Compute the union of two row ranges. static RowSelection Union(const RowSelection& lhs, const RowSelection& rhs); - /// \brief EXPERIMENTAL: Make a single row range of [0, row_count - 1]. - static RowSelection MakeSingle(int64_t row_count); - /// \brief EXPERIMENTAL: Make a single row range of [start, end]. static RowSelection MakeSingle(int64_t start, int64_t end); diff --git a/cpp/src/parquet/row_selection_test.cc b/cpp/src/parquet/row_selection_test.cc index 1ea4917a670..ecfb4b5ccb8 100644 --- a/cpp/src/parquet/row_selection_test.cc +++ b/cpp/src/parquet/row_selection_test.cc @@ -26,7 +26,7 @@ namespace parquet { // Test factory methods TEST(RowSelection, MakeSingleWithCount) { - auto ranges = RowSelection::MakeSingle(100); + auto ranges = RowSelection::MakeSingle(0, 99); ASSERT_EQ(ranges.row_count(), 100); auto iter = ranges.NewIterator(); @@ -119,7 +119,7 @@ TEST(RowSelection, ValidateValidRanges) { } TEST(RowSelection, ValidateSingleRange) { - auto ranges = RowSelection::MakeSingle(100); + auto ranges = RowSelection::MakeSingle(0, 99); EXPECT_NO_THROW(ranges.Validate()); } @@ -174,7 +174,7 @@ TEST(RowSelection, ValidateInvalidInterval) { // Test row_count TEST(RowSelection, RowCountSingle) { - auto ranges = RowSelection::MakeSingle(50); + auto ranges = RowSelection::MakeSingle(0, 49); EXPECT_EQ(ranges.row_count(), 50); } @@ -463,7 +463,7 @@ TEST(RowSelection, IteratorMultipleIterators) { } TEST(RowSelection, IteratorExhaustion) { - auto ranges = RowSelection::MakeSingle(10); + auto ranges = RowSelection::MakeSingle(0, 9); auto iter = ranges.NewIterator(); // First call returns the range From 24e815af7906b79a371117817bccbac99513bfd1 Mon Sep 17 00:00:00 2001 From: Samuel Wein Date: Mon, 22 Dec 2025 11:11:55 -0500 Subject: [PATCH 11/11] use batched iteration for intersect and union --- cpp/.gitignore | 1 + cpp/src/parquet/row_selection.cc | 74 ++++++++++++++++++++++++++------ 2 files changed, 61 insertions(+), 14 deletions(-) diff --git a/cpp/.gitignore b/cpp/.gitignore index 134bd4f3ee9..64c147d2868 100644 --- a/cpp/.gitignore +++ b/cpp/.gitignore @@ -27,6 +27,7 @@ build/ Testing/ build-support/boost_* vcpkg_installed/ +_deps/ # Build directories created by Clion cmake-build-*/ diff --git a/cpp/src/parquet/row_selection.cc b/cpp/src/parquet/row_selection.cc index 0228ba2b385..692942c10e1 100644 --- a/cpp/src/parquet/row_selection.cc +++ b/cpp/src/parquet/row_selection.cc @@ -76,12 +76,20 @@ int64_t RowSelection::row_count() const { RowSelection RowSelection::Intersect(const RowSelection& lhs, const RowSelection& rhs) { RowSelection result; + + // Use iterators to get batches + auto lhs_iter = lhs.NewIterator(); + auto rhs_iter = rhs.NewIterator(); + + auto lhs_batch = lhs_iter->NextRanges(); + auto rhs_batch = rhs_iter->NextRanges(); size_t lhs_idx = 0; size_t rhs_idx = 0; - while (lhs_idx < lhs.ranges_.size() && rhs_idx < rhs.ranges_.size()) { - const auto& left = lhs.ranges_[lhs_idx]; - const auto& right = rhs.ranges_[rhs_idx]; + while (!lhs_batch.empty() && !rhs_batch.empty()) { + // Get current ranges from batches + const auto& left = lhs_batch[lhs_idx]; + const auto& right = rhs_batch[rhs_idx]; int64_t left_end = left.start + left.length - 1; int64_t right_end = right.start + right.length - 1; @@ -95,11 +103,19 @@ RowSelection RowSelection::Intersect(const RowSelection& lhs, const RowSelection result.ranges_.push_back(IntervalRange{start, end - start + 1}); } - // Advance the iterator with smaller end + // Advance the index with smaller end if (left_end < right_end) { lhs_idx++; + if (lhs_idx >= lhs_batch.size()) { + lhs_batch = lhs_iter->NextRanges(); + lhs_idx = 0; + } } else { rhs_idx++; + if (rhs_idx >= rhs_batch.size()) { + rhs_batch = rhs_iter->NextRanges(); + rhs_idx = 0; + } } } @@ -116,37 +132,67 @@ RowSelection RowSelection::Union(const RowSelection& lhs, const RowSelection& rh return lhs; } + // Use iterators to get batches + auto lhs_iter = lhs.NewIterator(); + auto rhs_iter = rhs.NewIterator(); + + auto lhs_batch = lhs_iter->NextRanges(); + auto rhs_batch = rhs_iter->NextRanges(); size_t lhs_idx = 0; size_t rhs_idx = 0; // Start with whichever range has the smaller start IntervalRange current; - if (lhs.ranges_[0].start <= rhs.ranges_[0].start) { - current = lhs.ranges_[lhs_idx++]; + if (lhs_batch[0].start <= rhs_batch[0].start) { + current = lhs_batch[lhs_idx++]; + if (lhs_idx >= lhs_batch.size()) { + lhs_batch = lhs_iter->NextRanges(); + lhs_idx = 0; + } } else { - current = rhs.ranges_[rhs_idx++]; + current = rhs_batch[rhs_idx++]; + if (rhs_idx >= rhs_batch.size()) { + rhs_batch = rhs_iter->NextRanges(); + rhs_idx = 0; + } } - while (lhs_idx < lhs.ranges_.size() || rhs_idx < rhs.ranges_.size()) { + while (!lhs_batch.empty() || !rhs_batch.empty()) { IntervalRange next; - if (rhs_idx >= rhs.ranges_.size()) { + if (rhs_batch.empty()) { // Only lhs ranges remain - next = lhs.ranges_[lhs_idx++]; - } else if (lhs_idx >= lhs.ranges_.size()) { + next = lhs_batch[lhs_idx++]; + if (lhs_idx >= lhs_batch.size()) { + lhs_batch = lhs_iter->NextRanges(); + lhs_idx = 0; + } + } else if (lhs_batch.empty()) { // Only rhs ranges remain - next = rhs.ranges_[rhs_idx++]; + next = rhs_batch[rhs_idx++]; + if (rhs_idx >= rhs_batch.size()) { + rhs_batch = rhs_iter->NextRanges(); + rhs_idx = 0; + } } else { // Both have ranges - pick the one with smaller start - const auto& left = lhs.ranges_[lhs_idx]; - const auto& right = rhs.ranges_[rhs_idx]; + const auto& left = lhs_batch[lhs_idx]; + const auto& right = rhs_batch[rhs_idx]; if (left.start <= right.start) { next = left; lhs_idx++; + if (lhs_idx >= lhs_batch.size()) { + lhs_batch = lhs_iter->NextRanges(); + lhs_idx = 0; + } } else { next = right; rhs_idx++; + if (rhs_idx >= rhs_batch.size()) { + rhs_batch = rhs_iter->NextRanges(); + rhs_idx = 0; + } } }