From 25209fb462a2894fc3ab0a81040048ebb5de375b Mon Sep 17 00:00:00 2001 From: taniabogatsch <44262898+taniabogatsch@users.noreply.github.com> Date: Mon, 20 Jan 2025 18:19:27 +0100 Subject: [PATCH 001/142] wip --- .../catalog_entry/duck_table_entry.cpp | 6 +- src/include/duckdb/storage/data_table.hpp | 2 +- .../duckdb/storage/table/row_group.hpp | 2 +- .../storage/table/row_group_collection.hpp | 2 +- .../duckdb/transaction/local_storage.hpp | 6 +- src/storage/data_table.cpp | 4 +- src/storage/local_storage.cpp | 24 +++++--- src/storage/partial_block_manager.cpp | 6 +- src/storage/single_file_block_manager.cpp | 15 +++++ src/storage/table/row_group.cpp | 5 +- src/storage/table/row_group_collection.cpp | 4 +- .../optimistic_write_alter_type.test_slow | 4 -- .../optimistic_write_delete.test | 1 - .../optimistic_write_drop_column.test_slow | 22 +++---- ...der_preserving_odd_sized_batches.test_slow | 3 - ...ace_insert_unique_idx_optimistic.test_slow | 59 ++----------------- ...aim_space_primary_key_optimistic.test_slow | 28 ++------- 17 files changed, 71 insertions(+), 122 deletions(-) diff --git a/src/catalog/catalog_entry/duck_table_entry.cpp b/src/catalog/catalog_entry/duck_table_entry.cpp index 4983710d9a99..b58391d2ab4f 100644 --- a/src/catalog/catalog_entry/duck_table_entry.cpp +++ b/src/catalog/catalog_entry/duck_table_entry.cpp @@ -22,6 +22,7 @@ #include "duckdb/planner/parsed_data/bound_create_table_info.hpp" #include "duckdb/storage/storage_manager.hpp" #include "duckdb/storage/table_storage_info.hpp" +#include "duckdb/transaction/duck_transaction.hpp" namespace duckdb { @@ -885,7 +886,10 @@ void DuckTableEntry::CommitAlter(string &column_name) { break; } } - storage->CommitDropColumn(columns.LogicalToPhysical(LogicalIndex(removed_index.GetIndex())).index); + + auto logical_column_index = LogicalIndex(removed_index.GetIndex()); + auto column_index = columns.LogicalToPhysical(logical_column_index).index; + storage->CommitDropColumn(column_index); } void 
DuckTableEntry::CommitDrop() { diff --git a/src/include/duckdb/storage/data_table.hpp b/src/include/duckdb/storage/data_table.hpp index 39795ed1b907..bbc42ed3ffc9 100644 --- a/src/include/duckdb/storage/data_table.hpp +++ b/src/include/duckdb/storage/data_table.hpp @@ -200,7 +200,7 @@ class DataTable { //! Checkpoint the table to the specified table data writer void Checkpoint(TableDataWriter &writer, Serializer &serializer); void CommitDropTable(); - void CommitDropColumn(idx_t index); + void CommitDropColumn(const idx_t column_index); idx_t ColumnCount() const; idx_t GetTotalRows() const; diff --git a/src/include/duckdb/storage/table/row_group.hpp b/src/include/duckdb/storage/table/row_group.hpp index 16a535a4e4f9..40d0873a2ed7 100644 --- a/src/include/duckdb/storage/table/row_group.hpp +++ b/src/include/duckdb/storage/table/row_group.hpp @@ -102,7 +102,7 @@ class RowGroup : public SegmentBase { unique_ptr RemoveColumn(RowGroupCollection &collection, idx_t removed_column); void CommitDrop(); - void CommitDropColumn(idx_t index); + void CommitDropColumn(const idx_t index); void InitializeEmpty(const vector &types); diff --git a/src/include/duckdb/storage/table/row_group_collection.hpp b/src/include/duckdb/storage/table/row_group_collection.hpp index 19aa6452038c..412d8dcdaa61 100644 --- a/src/include/duckdb/storage/table/row_group_collection.hpp +++ b/src/include/duckdb/storage/table/row_group_collection.hpp @@ -108,7 +108,7 @@ class RowGroupCollection { bool schedule_vacuum); unique_ptr GetCheckpointTask(CollectionCheckpointState &checkpoint_state, idx_t segment_idx); - void CommitDropColumn(idx_t index); + void CommitDropColumn(const idx_t index); void CommitDropTable(); vector GetPartitionStats() const; diff --git a/src/include/duckdb/transaction/local_storage.hpp b/src/include/duckdb/transaction/local_storage.hpp index 453a7ce440ab..20ce212b8639 100644 --- a/src/include/duckdb/transaction/local_storage.hpp +++ b/src/include/duckdb/transaction/local_storage.hpp 
@@ -32,8 +32,8 @@ class LocalTableStorage : public enable_shared_from_this { // Create a LocalTableStorage from an ALTER TYPE LocalTableStorage(ClientContext &context, DataTable &table, LocalTableStorage &parent, idx_t changed_idx, const LogicalType &target_type, const vector &bound_columns, Expression &cast_expr); - // Create a LocalTableStorage from a DROP COLUMN - LocalTableStorage(DataTable &table, LocalTableStorage &parent, idx_t drop_idx); + //! Create a LocalTableStorage from a DROP COLUMN. + LocalTableStorage(DataTable &new_data_table, LocalTableStorage &parent, const idx_t drop_column_index); // Create a LocalTableStorage from an ADD COLUMN LocalTableStorage(ClientContext &context, DataTable &table, LocalTableStorage &parent, ColumnDefinition &new_column, ExpressionExecutor &default_executor); @@ -153,7 +153,7 @@ class LocalStorage { void AddColumn(DataTable &old_dt, DataTable &new_dt, ColumnDefinition &new_column, ExpressionExecutor &default_executor); - void DropColumn(DataTable &old_dt, DataTable &new_dt, idx_t removed_column); + void DropColumn(DataTable &old_dt, DataTable &new_dt, const idx_t drop_column_index); void ChangeType(DataTable &old_dt, DataTable &new_dt, idx_t changed_idx, const LogicalType &target_type, const vector &bound_columns, Expression &cast_expr); diff --git a/src/storage/data_table.cpp b/src/storage/data_table.cpp index 81bf50100b13..630b081eebf5 100644 --- a/src/storage/data_table.cpp +++ b/src/storage/data_table.cpp @@ -1546,8 +1546,8 @@ void DataTable::Checkpoint(TableDataWriter &writer, Serializer &serializer) { writer.FinalizeTable(global_stats, info.get(), serializer); } -void DataTable::CommitDropColumn(idx_t index) { - row_groups->CommitDropColumn(index); +void DataTable::CommitDropColumn(const idx_t column_index) { + row_groups->CommitDropColumn(column_index); } idx_t DataTable::ColumnCount() const { diff --git a/src/storage/local_storage.cpp b/src/storage/local_storage.cpp index 71b20bd10059..239596b3800c 100644 --- 
a/src/storage/local_storage.cpp +++ b/src/storage/local_storage.cpp @@ -62,12 +62,17 @@ LocalTableStorage::LocalTableStorage(ClientContext &context, DataTable &new_dt, append_indexes.Move(parent.append_indexes); } -LocalTableStorage::LocalTableStorage(DataTable &new_dt, LocalTableStorage &parent, idx_t drop_idx) - : table_ref(new_dt), allocator(Allocator::Get(new_dt.db)), deleted_rows(parent.deleted_rows), - optimistic_writer(new_dt, parent.optimistic_writer), optimistic_writers(std::move(parent.optimistic_writers)), - merged_storage(parent.merged_storage) { - row_groups = parent.row_groups->RemoveColumn(drop_idx); +LocalTableStorage::LocalTableStorage(DataTable &new_data_table, LocalTableStorage &parent, + const idx_t drop_column_index) + : table_ref(new_data_table), allocator(Allocator::Get(new_data_table.db)), deleted_rows(parent.deleted_rows), + optimistic_writer(new_data_table, parent.optimistic_writer), + optimistic_writers(std::move(parent.optimistic_writers)), merged_storage(parent.merged_storage) { + + // Remove the column from the previous local table storage. + row_groups = parent.row_groups->RemoveColumn(drop_column_index); + parent.row_groups->CommitDropColumn(drop_column_index); parent.row_groups.reset(); + append_indexes.Move(parent.append_indexes); } @@ -82,6 +87,7 @@ LocalTableStorage::LocalTableStorage(ClientContext &context, DataTable &new_dt, } LocalTableStorage::~LocalTableStorage() { + D_ASSERT(1); } void LocalTableStorage::InitializeScan(CollectionScanState &state, optional_ptr table_filters) { @@ -248,6 +254,9 @@ void LocalTableStorage::Rollback() { } optimistic_writers.clear(); optimistic_writer.Rollback(); + + // Drop any optimistically written local changes. 
+ row_groups->CommitDropTable(); } //===--------------------------------------------------------------------===// @@ -547,7 +556,6 @@ void LocalStorage::Rollback() { continue; } storage->Rollback(); - entry.second.reset(); } } @@ -598,13 +606,13 @@ void LocalStorage::AddColumn(DataTable &old_dt, DataTable &new_dt, ColumnDefinit table_manager.InsertEntry(new_dt, std::move(new_storage)); } -void LocalStorage::DropColumn(DataTable &old_dt, DataTable &new_dt, idx_t removed_column) { +void LocalStorage::DropColumn(DataTable &old_dt, DataTable &new_dt, const idx_t drop_column_index) { // check if there are any pending appends for the old version of the table auto storage = table_manager.MoveEntry(old_dt); if (!storage) { return; } - auto new_storage = make_shared_ptr(new_dt, *storage, removed_column); + auto new_storage = make_shared_ptr(new_dt, *storage, drop_column_index); table_manager.InsertEntry(new_dt, std::move(new_storage)); } diff --git a/src/storage/partial_block_manager.cpp b/src/storage/partial_block_manager.cpp index 3dbf89760591..79bc4c813fbe 100644 --- a/src/storage/partial_block_manager.cpp +++ b/src/storage/partial_block_manager.cpp @@ -196,9 +196,9 @@ BlockManager &PartialBlockManager::GetBlockManager() const { void PartialBlockManager::Rollback() { ClearBlocks(); - for (auto &block_id : written_blocks) { - block_manager.MarkBlockAsFree(block_id); - } + // for (auto &block_id : written_blocks) { + // block_manager.MarkBlockAsFree(block_id); + // } } } // namespace duckdb diff --git a/src/storage/single_file_block_manager.cpp b/src/storage/single_file_block_manager.cpp index f00d93040686..111ba02a55c9 100644 --- a/src/storage/single_file_block_manager.cpp +++ b/src/storage/single_file_block_manager.cpp @@ -313,6 +313,9 @@ void SingleFileBlockManager::LoadFreeList() { free_list.clear(); for (idx_t i = 0; i < free_list_count; i++) { auto block = reader.Read(); + if (block == 1) { + D_ASSERT(1); + } free_list.insert(block); newly_freed_list.insert(block); } 
@@ -363,6 +366,9 @@ void SingleFileBlockManager::MarkBlockAsFree(block_id_t block_id) { throw InternalException("MarkBlockAsFree called but block %llu was already freed!", block_id); } multi_use_blocks.erase(block_id); + if (block_id == 1) { + D_ASSERT(1); + } free_list.insert(block_id); newly_freed_list.insert(block_id); } @@ -377,6 +383,9 @@ void SingleFileBlockManager::MarkBlockAsUsed(block_id_t block_id) { // i.e. if max_block = 0, and block_id = 3, we need to add blocks 1 and 2 to the free list while (max_block < block_id) { free_list.insert(max_block); + if (max_block == 1) { + D_ASSERT(1); + } max_block++; } max_block++; @@ -410,6 +419,9 @@ void SingleFileBlockManager::MarkBlockAsModified(block_id_t block_id) { // Check for multi-free // TODO: Fix the bug that causes this assert to fire, then uncomment it. // D_ASSERT(modified_blocks.find(block_id) == modified_blocks.end()); + if (block_id == 1) { + D_ASSERT(1); + } D_ASSERT(free_list.find(block_id) == free_list.end()); modified_blocks.insert(block_id); } @@ -640,6 +652,9 @@ void SingleFileBlockManager::WriteHeader(DatabaseHeader header) { for (auto &block : modified_blocks) { free_list.insert(block); + if (block == 1) { + D_ASSERT(1); + } newly_freed_list.insert(block); } modified_blocks.clear(); diff --git a/src/storage/table/row_group.cpp b/src/storage/table/row_group.cpp index d5250387362b..ee07f838c14c 100644 --- a/src/storage/table/row_group.cpp +++ b/src/storage/table/row_group.cpp @@ -385,8 +385,9 @@ void RowGroup::CommitDrop() { } } -void RowGroup::CommitDropColumn(idx_t column_idx) { - GetColumn(column_idx).CommitDropColumn(); +void RowGroup::CommitDropColumn(const idx_t column_idx) { + auto &column = GetColumn(column_idx); + column.CommitDropColumn(); } void RowGroup::NextVector(CollectionScanState &state) { diff --git a/src/storage/table/row_group_collection.cpp b/src/storage/table/row_group_collection.cpp index a167644193fd..f775add2dca8 100644 --- a/src/storage/table/row_group_collection.cpp 
+++ b/src/storage/table/row_group_collection.cpp @@ -1110,9 +1110,9 @@ void RowGroupCollection::Checkpoint(TableDataWriter &writer, TableStatistics &gl //===--------------------------------------------------------------------===// // CommitDrop //===--------------------------------------------------------------------===// -void RowGroupCollection::CommitDropColumn(idx_t index) { +void RowGroupCollection::CommitDropColumn(const idx_t column_index) { for (auto &row_group : row_groups->Segments()) { - row_group.CommitDropColumn(index); + row_group.CommitDropColumn(column_index); } } diff --git a/test/sql/storage/optimistic_write/optimistic_write_alter_type.test_slow b/test/sql/storage/optimistic_write/optimistic_write_alter_type.test_slow index dbebdcece844..fa7847641795 100644 --- a/test/sql/storage/optimistic_write/optimistic_write_alter_type.test_slow +++ b/test/sql/storage/optimistic_write/optimistic_write_alter_type.test_slow @@ -2,10 +2,6 @@ # description: Test optimistic write with alter type in transaction-local storage # group: [optimistic_write] -# FIXME: for smaller block sizes (16KB) the database size does not stabilize in the loop, instead, -# FIXME: it grows very slowly (only investigated up to 40 iterations) -require block_size 262144 - load __TEST_DIR__/optimistic_write_alter_type.db statement ok diff --git a/test/sql/storage/optimistic_write/optimistic_write_delete.test b/test/sql/storage/optimistic_write/optimistic_write_delete.test index 5c71995862c8..b96d893fb865 100644 --- a/test/sql/storage/optimistic_write/optimistic_write_delete.test +++ b/test/sql/storage/optimistic_write/optimistic_write_delete.test @@ -2,7 +2,6 @@ # description: Test optimistic write with deletes in transaction-local storage # group: [optimistic_write] -# load the DB from disk load __TEST_DIR__/optimistic_write_delete.db statement ok diff --git a/test/sql/storage/optimistic_write/optimistic_write_drop_column.test_slow 
b/test/sql/storage/optimistic_write/optimistic_write_drop_column.test_slow index c91637cdc596..0427213938c0 100644 --- a/test/sql/storage/optimistic_write/optimistic_write_drop_column.test_slow +++ b/test/sql/storage/optimistic_write/optimistic_write_drop_column.test_slow @@ -2,10 +2,6 @@ # description: Test optimistic write with drop column in transaction-local storage # group: [optimistic_write] -# FIXME: for smaller block sizes (16KB) the database size does not stabilize in the loop, instead, -# FIXME: it grows very slowly (only investigated up to 40 iterations) -require block_size 262144 - load __TEST_DIR__/optimistic_write_drop.db statement ok @@ -15,7 +11,7 @@ statement ok BEGIN TRANSACTION statement ok -INSERT INTO test SELECT i, i+1, i+2 FROM range(1000000) tbl(i) +INSERT INTO test SELECT i, i + 1, i + 2 FROM range(1000000) tbl(i) statement ok ALTER TABLE test DROP COLUMN c @@ -52,12 +48,12 @@ SELECT SUM(a), SUM(b) FROM test ---- 499999500000 500000500000 -require skip_reload +# Ensure that we reclaim space correctly. -# ensure the drop column does not result in leaking blocks +require skip_reload -# for smaller block sizes (16KB) the total blocks alternate between a few values in the loop, -# therefore, we need to compare to a range of total block counts +# For smaller block sizes (16KB) the total blocks alternate between a few values in the loop. +# Therefore, we compare to a range of total block counts. 
statement ok CREATE TABLE total_blocks_tbl AS SELECT total_blocks FROM pragma_database_size(); @@ -73,7 +69,7 @@ statement ok BEGIN TRANSACTION statement ok -INSERT INTO test SELECT i, i+1, i+2 FROM range(1000000) tbl(i) +INSERT INTO test SELECT i, i + 1, i + 2 FROM range(1000000) tbl(i) statement ok ALTER TABLE test DROP COLUMN c @@ -86,8 +82,8 @@ SELECT SUM(a), SUM(b) FROM test ---- 499999500000 500000500000 -# ensure that the total blocks don't exceed the total blocks after the first iteration -# by more than 1.2 +# Ensure that the total blocks don't exceed the total blocks after the first iteration +# by more than 1.2. query I SELECT CASE WHEN ${i} = 0 THEN True @@ -97,7 +93,7 @@ FROM pragma_database_size() AS current, total_blocks_tbl; ---- 1 -# adjust total_blocks_tbl once to the count after the first iteration +# Adjust total_blocks_tbl once to the count after the first iteration. statement ok UPDATE total_blocks_tbl SET total_blocks = ( diff --git a/test/sql/storage/parallel/insert_order_preserving_odd_sized_batches.test_slow b/test/sql/storage/parallel/insert_order_preserving_odd_sized_batches.test_slow index 3474b2e0621c..941885e4dcbf 100644 --- a/test/sql/storage/parallel/insert_order_preserving_odd_sized_batches.test_slow +++ b/test/sql/storage/parallel/insert_order_preserving_odd_sized_batches.test_slow @@ -2,9 +2,6 @@ # description: Test parallel order-preserving insert # group: [parallel] -# FIXME: see internal issue 3931. -mode skip - # There are different numbers of distinct blocks for smaller block sizes, # because the segment size is bound by the block size. 
require block_size 262144 diff --git a/test/sql/storage/parallel/reclaim_space_insert_unique_idx_optimistic.test_slow b/test/sql/storage/parallel/reclaim_space_insert_unique_idx_optimistic.test_slow index c09499562f34..7cfbd2fa6a56 100644 --- a/test/sql/storage/parallel/reclaim_space_insert_unique_idx_optimistic.test_slow +++ b/test/sql/storage/parallel/reclaim_space_insert_unique_idx_optimistic.test_slow @@ -2,42 +2,20 @@ # description: Test space reclamation of optimistic writing with a UNIQUE constraint violation. # group: [parallel] -# FIXME: see internal issue 3931. -mode skip - load __TEST_DIR__/reclaim_space_unique_index.db statement ok SET preserve_insertion_order=false; statement ok -CREATE TABLE integers AS SELECT * FROM range(10000000) t(i); +CREATE TABLE integers AS SELECT * FROM range(1_000_000) t(i); statement ok CREATE TABLE integers2 (i INTEGER); -statement ok -INSERT INTO integers2 VALUES (9999999); - -statement ok -CREATE UNIQUE INDEX idx ON integers2(i); - -# For smaller block sizes (16KB) the total blocks increase (to twice the original amount) in the first -# iteration, and then stay constant. statement ok CREATE TABLE total_blocks_tbl AS SELECT total_blocks FROM pragma_database_size(); -statement ok -CREATE TYPE test_result AS UNION ( - ok BOOL, - err STRUCT( - old BIGINT, - allowed_max DECIMAL(21,1), - actual BIGINT) -); - -loop i 0 10 - statement ok BEGIN; @@ -45,41 +23,14 @@ statement ok CHECKPOINT; statement ok -INSERT INTO integers2 VALUES (9999998); - -# Invalidate the transaction. +INSERT INTO integers2 VALUES (999_998); -statement error -INSERT INTO integers2 SELECT * FROM integers WHERE i <= 9999998; ----- -:Constraint Error.*PRIMARY KEY or UNIQUE constraint violation.* +statement ok +INSERT INTO integers2 SELECT * FROM integers WHERE i <= 999_998; statement ok ROLLBACK -# Ensure that the total blocks don't exceed the total blocks after the first iteration by more than 1.2. 
- -query I -SELECT - CASE WHEN ${i} = 0 THEN True::test_result - WHEN current.total_blocks <= total_blocks_tbl.total_blocks * 1.4 THEN True::test_result - ELSE { - 'old': total_blocks_tbl.total_blocks, - 'allowed_max': total_blocks_tbl.total_blocks * 1.4, - 'actual': current.total_blocks - }::test_result - END -FROM pragma_database_size() AS current, total_blocks_tbl; ----- -true - -# Adjust the total_blocks_tbl once to the count after the first iteration. - statement ok UPDATE total_blocks_tbl SET total_blocks = ( - SELECT - CASE WHEN ${i} = 0 THEN (SELECT current.total_blocks FROM pragma_database_size() AS current) - ELSE (total_blocks)END - ); - -endloop +SELECT current.total_blocks FROM pragma_database_size() AS current); diff --git a/test/sql/storage/parallel/reclaim_space_primary_key_optimistic.test_slow b/test/sql/storage/parallel/reclaim_space_primary_key_optimistic.test_slow index 27753dd7877a..6f97e21dbbee 100644 --- a/test/sql/storage/parallel/reclaim_space_primary_key_optimistic.test_slow +++ b/test/sql/storage/parallel/reclaim_space_primary_key_optimistic.test_slow @@ -2,48 +2,30 @@ # description: Test space reclamation of optimistic writing with a PK constraint violation. # group: [parallel] -# FIXME: see internal issue 3931. 
-mode skip - load __TEST_DIR__/reclaim_space_primary_key.db statement ok SET preserve_insertion_order=false; statement ok -CREATE TABLE integers AS SELECT * FROM range(10000000) t(i); +CREATE TABLE integers AS SELECT * FROM range(1000000) t(i); statement ok -CREATE TABLE integers2 (i INTEGER PRIMARY KEY); +CREATE TABLE integers2 (i INTEGER); statement ok -INSERT INTO integers2 VALUES (9999999); - -statement error -INSERT INTO integers2 SELECT * FROM integers; ----- -:Constraint Error.*violates primary key constraint.* +INSERT INTO integers2 VALUES (999999); statement ok CREATE TABLE block_count (count INT); loop i 0 10 -statement error -INSERT INTO integers2 SELECT * FROM integers; ----- -:Constraint Error.*violates primary key constraint.* - statement ok BEGIN; statement ok -INSERT INTO integers2 VALUES (9999998); - -statement error -INSERT INTO integers2 SELECT * FROM integers WHERE i <= 9999998; ----- -:Constraint Error.*PRIMARY KEY or UNIQUE constraint violation.* +INSERT INTO integers2 VALUES (999998); statement ok ROLLBACK @@ -54,7 +36,7 @@ SELECT COUNT(*) - ${i} FROM integers2; 1 statement ok -INSERT INTO integers2 VALUES (10000000 + ${i}); +INSERT INTO integers2 VALUES (1000000 + ${i}); statement ok CHECKPOINT; From cd9b27ad4cb96092b8d553e3552dcc2e8f0125da Mon Sep 17 00:00:00 2001 From: taniabogatsch <44262898+taniabogatsch@users.noreply.github.com> Date: Tue, 21 Jan 2025 12:10:21 +0100 Subject: [PATCH 002/142] some tidying --- .../catalog_entry/duck_table_entry.cpp | 1 - .../duckdb/storage/table/row_group.hpp | 2 +- .../storage/table/row_group_collection.hpp | 2 +- .../duckdb/transaction/local_storage.hpp | 7 +-- src/storage/local_storage.cpp | 19 ++++--- src/storage/partial_block_manager.cpp | 3 -- src/storage/single_file_block_manager.cpp | 15 ------ src/storage/table/row_group.cpp | 4 +- .../optimistic_write_alter_type.test_slow | 18 +++---- ...der_preserving_odd_sized_batches.test_slow | 2 + ...ace_insert_unique_idx_optimistic.test_slow | 51 
++++++++++++++++++- ...aim_space_primary_key_optimistic.test_slow | 27 ++++++++-- test/temp.test | 24 +++++++++ 13 files changed, 125 insertions(+), 50 deletions(-) create mode 100644 test/temp.test diff --git a/src/catalog/catalog_entry/duck_table_entry.cpp b/src/catalog/catalog_entry/duck_table_entry.cpp index b58391d2ab4f..d12bc557e7aa 100644 --- a/src/catalog/catalog_entry/duck_table_entry.cpp +++ b/src/catalog/catalog_entry/duck_table_entry.cpp @@ -22,7 +22,6 @@ #include "duckdb/planner/parsed_data/bound_create_table_info.hpp" #include "duckdb/storage/storage_manager.hpp" #include "duckdb/storage/table_storage_info.hpp" -#include "duckdb/transaction/duck_transaction.hpp" namespace duckdb { diff --git a/src/include/duckdb/storage/table/row_group.hpp b/src/include/duckdb/storage/table/row_group.hpp index 40d0873a2ed7..8ceea68a3615 100644 --- a/src/include/duckdb/storage/table/row_group.hpp +++ b/src/include/duckdb/storage/table/row_group.hpp @@ -102,7 +102,7 @@ class RowGroup : public SegmentBase { unique_ptr RemoveColumn(RowGroupCollection &collection, idx_t removed_column); void CommitDrop(); - void CommitDropColumn(const idx_t index); + void CommitDropColumn(const idx_t column_index); void InitializeEmpty(const vector &types); diff --git a/src/include/duckdb/storage/table/row_group_collection.hpp b/src/include/duckdb/storage/table/row_group_collection.hpp index 412d8dcdaa61..9940d80f45c0 100644 --- a/src/include/duckdb/storage/table/row_group_collection.hpp +++ b/src/include/duckdb/storage/table/row_group_collection.hpp @@ -108,7 +108,7 @@ class RowGroupCollection { bool schedule_vacuum); unique_ptr GetCheckpointTask(CollectionCheckpointState &checkpoint_state, idx_t segment_idx); - void CommitDropColumn(const idx_t index); + void CommitDropColumn(const idx_t column_index); void CommitDropTable(); vector GetPartitionStats() const; diff --git a/src/include/duckdb/transaction/local_storage.hpp b/src/include/duckdb/transaction/local_storage.hpp index 
20ce212b8639..7516adeced72 100644 --- a/src/include/duckdb/transaction/local_storage.hpp +++ b/src/include/duckdb/transaction/local_storage.hpp @@ -29,9 +29,10 @@ class LocalTableStorage : public enable_shared_from_this { public: // Create a new LocalTableStorage explicit LocalTableStorage(ClientContext &context, DataTable &table); - // Create a LocalTableStorage from an ALTER TYPE - LocalTableStorage(ClientContext &context, DataTable &table, LocalTableStorage &parent, idx_t changed_idx, - const LogicalType &target_type, const vector &bound_columns, Expression &cast_expr); + //! Create a LocalTableStorage from an ALTER TYPE. + LocalTableStorage(ClientContext &context, DataTable &new_data_table, LocalTableStorage &parent, + const idx_t alter_column_index, const LogicalType &target_type, + const vector &bound_columns, Expression &cast_expr); //! Create a LocalTableStorage from a DROP COLUMN. LocalTableStorage(DataTable &new_data_table, LocalTableStorage &parent, const idx_t drop_column_index); // Create a LocalTableStorage from an ADD COLUMN diff --git a/src/storage/local_storage.cpp b/src/storage/local_storage.cpp index 239596b3800c..20334291e229 100644 --- a/src/storage/local_storage.cpp +++ b/src/storage/local_storage.cpp @@ -51,14 +51,18 @@ LocalTableStorage::LocalTableStorage(ClientContext &context, DataTable &table) }); } -LocalTableStorage::LocalTableStorage(ClientContext &context, DataTable &new_dt, LocalTableStorage &parent, - idx_t changed_idx, const LogicalType &target_type, +LocalTableStorage::LocalTableStorage(ClientContext &context, DataTable &new_data_table, LocalTableStorage &parent, + const idx_t alter_column_index, const LogicalType &target_type, const vector &bound_columns, Expression &cast_expr) - : table_ref(new_dt), allocator(Allocator::Get(new_dt.db)), deleted_rows(parent.deleted_rows), - optimistic_writer(new_dt, parent.optimistic_writer), optimistic_writers(std::move(parent.optimistic_writers)), - merged_storage(parent.merged_storage) { - 
row_groups = parent.row_groups->AlterType(context, changed_idx, target_type, bound_columns, cast_expr); + : table_ref(new_data_table), allocator(Allocator::Get(new_data_table.db)), deleted_rows(parent.deleted_rows), + optimistic_writer(new_data_table, parent.optimistic_writer), + optimistic_writers(std::move(parent.optimistic_writers)), merged_storage(parent.merged_storage) { + + // Alter the column type. + row_groups = parent.row_groups->AlterType(context, alter_column_index, target_type, bound_columns, cast_expr); + parent.row_groups->CommitDropColumn(alter_column_index); parent.row_groups.reset(); + append_indexes.Move(parent.append_indexes); } @@ -68,7 +72,7 @@ LocalTableStorage::LocalTableStorage(DataTable &new_data_table, LocalTableStorag optimistic_writer(new_data_table, parent.optimistic_writer), optimistic_writers(std::move(parent.optimistic_writers)), merged_storage(parent.merged_storage) { - // Remove the column from the previous local table storage. + // Remove the column from the previous table storage. 
row_groups = parent.row_groups->RemoveColumn(drop_column_index); parent.row_groups->CommitDropColumn(drop_column_index); parent.row_groups.reset(); @@ -87,7 +91,6 @@ LocalTableStorage::LocalTableStorage(ClientContext &context, DataTable &new_dt, } LocalTableStorage::~LocalTableStorage() { - D_ASSERT(1); } void LocalTableStorage::InitializeScan(CollectionScanState &state, optional_ptr table_filters) { diff --git a/src/storage/partial_block_manager.cpp b/src/storage/partial_block_manager.cpp index 79bc4c813fbe..991e5fcd4cf5 100644 --- a/src/storage/partial_block_manager.cpp +++ b/src/storage/partial_block_manager.cpp @@ -196,9 +196,6 @@ BlockManager &PartialBlockManager::GetBlockManager() const { void PartialBlockManager::Rollback() { ClearBlocks(); - // for (auto &block_id : written_blocks) { - // block_manager.MarkBlockAsFree(block_id); - // } } } // namespace duckdb diff --git a/src/storage/single_file_block_manager.cpp b/src/storage/single_file_block_manager.cpp index 111ba02a55c9..f00d93040686 100644 --- a/src/storage/single_file_block_manager.cpp +++ b/src/storage/single_file_block_manager.cpp @@ -313,9 +313,6 @@ void SingleFileBlockManager::LoadFreeList() { free_list.clear(); for (idx_t i = 0; i < free_list_count; i++) { auto block = reader.Read(); - if (block == 1) { - D_ASSERT(1); - } free_list.insert(block); newly_freed_list.insert(block); } @@ -366,9 +363,6 @@ void SingleFileBlockManager::MarkBlockAsFree(block_id_t block_id) { throw InternalException("MarkBlockAsFree called but block %llu was already freed!", block_id); } multi_use_blocks.erase(block_id); - if (block_id == 1) { - D_ASSERT(1); - } free_list.insert(block_id); newly_freed_list.insert(block_id); } @@ -383,9 +377,6 @@ void SingleFileBlockManager::MarkBlockAsUsed(block_id_t block_id) { // i.e. 
if max_block = 0, and block_id = 3, we need to add blocks 1 and 2 to the free list while (max_block < block_id) { free_list.insert(max_block); - if (max_block == 1) { - D_ASSERT(1); - } max_block++; } max_block++; @@ -419,9 +410,6 @@ void SingleFileBlockManager::MarkBlockAsModified(block_id_t block_id) { // Check for multi-free // TODO: Fix the bug that causes this assert to fire, then uncomment it. // D_ASSERT(modified_blocks.find(block_id) == modified_blocks.end()); - if (block_id == 1) { - D_ASSERT(1); - } D_ASSERT(free_list.find(block_id) == free_list.end()); modified_blocks.insert(block_id); } @@ -652,9 +640,6 @@ void SingleFileBlockManager::WriteHeader(DatabaseHeader header) { for (auto &block : modified_blocks) { free_list.insert(block); - if (block == 1) { - D_ASSERT(1); - } newly_freed_list.insert(block); } modified_blocks.clear(); diff --git a/src/storage/table/row_group.cpp b/src/storage/table/row_group.cpp index ee07f838c14c..b9add3d67dd7 100644 --- a/src/storage/table/row_group.cpp +++ b/src/storage/table/row_group.cpp @@ -385,8 +385,8 @@ void RowGroup::CommitDrop() { } } -void RowGroup::CommitDropColumn(const idx_t column_idx) { - auto &column = GetColumn(column_idx); +void RowGroup::CommitDropColumn(const idx_t column_index) { + auto &column = GetColumn(column_index); column.CommitDropColumn(); } diff --git a/test/sql/storage/optimistic_write/optimistic_write_alter_type.test_slow b/test/sql/storage/optimistic_write/optimistic_write_alter_type.test_slow index fa7847641795..a32e866bad35 100644 --- a/test/sql/storage/optimistic_write/optimistic_write_alter_type.test_slow +++ b/test/sql/storage/optimistic_write/optimistic_write_alter_type.test_slow @@ -14,7 +14,7 @@ statement ok INSERT INTO test SELECT i FROM range(1000000) tbl(i) statement ok -ALTER TABLE test ALTER a SET TYPE BIGINT USING a+1 +ALTER TABLE test ALTER a SET TYPE BIGINT USING a + 1 statement ok COMMIT @@ -48,12 +48,12 @@ SELECT SUM(a) FROM test ---- 500000500000 -require skip_reload +# 
Ensure that we reclaim space correctly. -# ensure the alter type does not result in leaking blocks +require skip_reload -# for smaller block sizes (16KB) the total blocks alternate between a few values in the loop, -# therefore, we need to compare to a range of total block counts +# For smaller block sizes (16KB) the total blocks alternate between a few values in the loop. +# Therefore, we compare to a range of total block counts. statement ok CREATE TABLE total_blocks_tbl AS SELECT total_blocks FROM pragma_database_size(); @@ -72,7 +72,7 @@ statement ok INSERT INTO test SELECT i FROM range(1000000) tbl(i) statement ok -ALTER TABLE test ALTER a SET TYPE BIGINT USING a+1 +ALTER TABLE test ALTER a SET TYPE BIGINT USING a + 1 statement ok COMMIT @@ -82,8 +82,8 @@ SELECT SUM(a) FROM test ---- 500000500000 -# ensure that the total blocks don't exceed the total blocks after the first iteration -# by more than 1.2 +# Ensure that the total blocks don't exceed the total blocks after the first iteration +# by more than 1.2. query I SELECT CASE WHEN ${i} = 0 THEN True @@ -93,7 +93,7 @@ FROM pragma_database_size() AS current, total_blocks_tbl; ---- 1 -# adjust total_blocks_tbl once to the count after the first iteration +# Adjust total_blocks_tbl once to the count after the first iteration. statement ok UPDATE total_blocks_tbl SET total_blocks = ( diff --git a/test/sql/storage/parallel/insert_order_preserving_odd_sized_batches.test_slow b/test/sql/storage/parallel/insert_order_preserving_odd_sized_batches.test_slow index 941885e4dcbf..4a0dc3c9bd09 100644 --- a/test/sql/storage/parallel/insert_order_preserving_odd_sized_batches.test_slow +++ b/test/sql/storage/parallel/insert_order_preserving_odd_sized_batches.test_slow @@ -2,6 +2,8 @@ # description: Test parallel order-preserving insert # group: [parallel] +mode skip + # There are different numbers of distinct blocks for smaller block sizes, # because the segment size is bound by the block size. 
require block_size 262144 diff --git a/test/sql/storage/parallel/reclaim_space_insert_unique_idx_optimistic.test_slow b/test/sql/storage/parallel/reclaim_space_insert_unique_idx_optimistic.test_slow index 7cfbd2fa6a56..243f4860b205 100644 --- a/test/sql/storage/parallel/reclaim_space_insert_unique_idx_optimistic.test_slow +++ b/test/sql/storage/parallel/reclaim_space_insert_unique_idx_optimistic.test_slow @@ -13,9 +13,29 @@ CREATE TABLE integers AS SELECT * FROM range(1_000_000) t(i); statement ok CREATE TABLE integers2 (i INTEGER); +statement ok +INSERT INTO integers2 VALUES (9999999); + +statement ok +CREATE UNIQUE INDEX idx ON integers2(i); + +# For smaller block sizes (16KB) the total blocks increase (to twice the original amount) in the first +# iteration, and then stay constant. + statement ok CREATE TABLE total_blocks_tbl AS SELECT total_blocks FROM pragma_database_size(); +statement ok +CREATE TYPE test_result AS UNION ( + ok BOOL, + err STRUCT( + old BIGINT, + allowed_max DECIMAL(21,1), + actual BIGINT) +); + +loop i 0 10 + statement ok BEGIN; @@ -25,12 +45,39 @@ CHECKPOINT; statement ok INSERT INTO integers2 VALUES (999_998); -statement ok +# Invalidate the transaction. + +statement error INSERT INTO integers2 SELECT * FROM integers WHERE i <= 999_998; +---- +:Constraint Error.*PRIMARY KEY or UNIQUE constraint violation.* statement ok ROLLBACK +# Ensure that the total blocks don't exceed the total blocks after the first iteration by more than 1.2. + +query I +SELECT + CASE WHEN ${i} = 0 THEN True::test_result + WHEN current.total_blocks <= total_blocks_tbl.total_blocks * 1.4 THEN True::test_result + ELSE { + 'old': total_blocks_tbl.total_blocks, + 'allowed_max': total_blocks_tbl.total_blocks * 1.4, + 'actual': current.total_blocks + }::test_result + END +FROM pragma_database_size() AS current, total_blocks_tbl; +---- +true + +# Adjust the total_blocks_tbl once to the count after the first iteration. 
+ statement ok UPDATE total_blocks_tbl SET total_blocks = ( -SELECT current.total_blocks FROM pragma_database_size() AS current); + SELECT + CASE WHEN ${i} = 0 THEN (SELECT current.total_blocks FROM pragma_database_size() AS current) + ELSE (total_blocks)END + ); + +endloop diff --git a/test/sql/storage/parallel/reclaim_space_primary_key_optimistic.test_slow b/test/sql/storage/parallel/reclaim_space_primary_key_optimistic.test_slow index 6f97e21dbbee..4d08b4a8f6ec 100644 --- a/test/sql/storage/parallel/reclaim_space_primary_key_optimistic.test_slow +++ b/test/sql/storage/parallel/reclaim_space_primary_key_optimistic.test_slow @@ -2,30 +2,47 @@ # description: Test space reclamation of optimistic writing with a PK constraint violation. # group: [parallel] +mode skip + load __TEST_DIR__/reclaim_space_primary_key.db statement ok SET preserve_insertion_order=false; statement ok -CREATE TABLE integers AS SELECT * FROM range(1000000) t(i); +CREATE TABLE integers AS SELECT * FROM range(10000000) t(i); statement ok -CREATE TABLE integers2 (i INTEGER); +CREATE TABLE integers2 (i INTEGER PRIMARY KEY); statement ok -INSERT INTO integers2 VALUES (999999); +INSERT INTO integers2 VALUES (9999999); + +statement error +INSERT INTO integers2 SELECT * FROM integers; +---- +:Constraint Error.*violates primary key constraint.* statement ok CREATE TABLE block_count (count INT); loop i 0 10 +statement error +INSERT INTO integers2 SELECT * FROM integers; +---- +:Constraint Error.*violates primary key constraint.* + statement ok BEGIN; statement ok -INSERT INTO integers2 VALUES (999998); +INSERT INTO integers2 VALUES (9999998); + +statement error +INSERT INTO integers2 SELECT * FROM integers WHERE i <= 9999998; +---- +:Constraint Error.*PRIMARY KEY or UNIQUE constraint violation.* statement ok ROLLBACK @@ -36,7 +53,7 @@ SELECT COUNT(*) - ${i} FROM integers2; 1 statement ok -INSERT INTO integers2 VALUES (1000000 + ${i}); +INSERT INTO integers2 VALUES (10000000 + ${i}); statement ok 
CHECKPOINT; diff --git a/test/temp.test b/test/temp.test new file mode 100644 index 000000000000..0314c52deb1c --- /dev/null +++ b/test/temp.test @@ -0,0 +1,24 @@ +# name: test/temp.test +# group: [test] + +load __TEST_DIR__/reclaim_space_primary_key.db + +statement ok +SET preserve_insertion_order=false; + +statement ok +CREATE TABLE integers AS SELECT * FROM range(10000000) t(i); + +statement ok +CREATE TABLE integers2 (i INTEGER PRIMARY KEY); + +statement ok +INSERT INTO integers2 VALUES (9999999); + +statement error +INSERT INTO integers2 SELECT * FROM integers; +---- +:Constraint Error.*violates primary key constraint.* + +statement ok +CREATE TABLE block_count (count INT); \ No newline at end of file From 62d6d87b3cf87fcb84249beb48b49887435fbf18 Mon Sep 17 00:00:00 2001 From: taniabogatsch <44262898+taniabogatsch@users.noreply.github.com> Date: Tue, 21 Jan 2025 13:38:59 +0100 Subject: [PATCH 003/142] more reclaim space fixes --- .../duckdb/storage/optimistic_data_writer.hpp | 2 +- .../duckdb/storage/partial_block_manager.hpp | 2 +- src/storage/local_storage.cpp | 7 ++-- src/storage/optimistic_data_writer.cpp | 5 +-- src/storage/partial_block_manager.cpp | 9 +++- ...der_preserving_odd_sized_batches.test_slow | 14 +++---- ...aim_space_primary_key_optimistic.test_slow | 41 +++++++++++++++---- test/temp.test | 24 ----------- 8 files changed, 55 insertions(+), 49 deletions(-) delete mode 100644 test/temp.test diff --git a/src/include/duckdb/storage/optimistic_data_writer.hpp b/src/include/duckdb/storage/optimistic_data_writer.hpp index 802d51bad707..c3d04e9c470f 100644 --- a/src/include/duckdb/storage/optimistic_data_writer.hpp +++ b/src/include/duckdb/storage/optimistic_data_writer.hpp @@ -30,7 +30,7 @@ class OptimisticDataWriter { //! Merge the partially written blocks from one optimistic writer into another void Merge(OptimisticDataWriter &other); //! Rollback - void Rollback(); + void Rollback(const bool mark_modified); private: //! 
Prepare a write to disk diff --git a/src/include/duckdb/storage/partial_block_manager.hpp b/src/include/duckdb/storage/partial_block_manager.hpp index b46ea65ad031..dbe6e7029664 100644 --- a/src/include/duckdb/storage/partial_block_manager.hpp +++ b/src/include/duckdb/storage/partial_block_manager.hpp @@ -114,7 +114,7 @@ class PartialBlockManager { void ClearBlocks(); //! Rollback all data written by this partial block manager - void Rollback(); + void Rollback(const bool mark_modified); //! Merge this block manager into another one void Merge(PartialBlockManager &other); diff --git a/src/storage/local_storage.cpp b/src/storage/local_storage.cpp index 20334291e229..9764f878136d 100644 --- a/src/storage/local_storage.cpp +++ b/src/storage/local_storage.cpp @@ -253,12 +253,13 @@ void LocalTableStorage::FinalizeOptimisticWriter(OptimisticDataWriter &writer) { void LocalTableStorage::Rollback() { for (auto &writer : optimistic_writers) { - writer->Rollback(); + writer->Rollback(true); } - optimistic_writers.clear(); - optimistic_writer.Rollback(); // Drop any optimistically written local changes. + // The top-level writer writes to the row groups. 
+ optimistic_writers.clear(); + optimistic_writer.Rollback(false); row_groups->CommitDropTable(); } diff --git a/src/storage/optimistic_data_writer.cpp b/src/storage/optimistic_data_writer.cpp index 0a1966c500b4..e8f5bfb70485 100644 --- a/src/storage/optimistic_data_writer.cpp +++ b/src/storage/optimistic_data_writer.cpp @@ -80,13 +80,12 @@ void OptimisticDataWriter::Merge(OptimisticDataWriter &other) { void OptimisticDataWriter::FinalFlush() { if (partial_manager) { partial_manager->FlushPartialBlocks(); - partial_manager.reset(); } } -void OptimisticDataWriter::Rollback() { +void OptimisticDataWriter::Rollback(const bool mark_modified) { if (partial_manager) { - partial_manager->Rollback(); + partial_manager->Rollback(mark_modified); partial_manager.reset(); } } diff --git a/src/storage/partial_block_manager.cpp b/src/storage/partial_block_manager.cpp index 991e5fcd4cf5..5b8d392ea8da 100644 --- a/src/storage/partial_block_manager.cpp +++ b/src/storage/partial_block_manager.cpp @@ -46,6 +46,7 @@ PartialBlockManager::PartialBlockManager(BlockManager &block_manager, PartialBlo // Use the default maximum partial block size with a ratio of 20% free and 80% utilization. 
max_partial_block_size = NumericCast(block_manager.GetBlockSize() / 5 * 4); } + PartialBlockManager::~PartialBlockManager() { } @@ -186,6 +187,7 @@ void PartialBlockManager::ClearBlocks() { void PartialBlockManager::FlushPartialBlocks() { for (auto &e : partially_filled_blocks) { e.second->Flush(e.first); + written_blocks.insert(e.second->state.block_id); } partially_filled_blocks.clear(); } @@ -194,8 +196,13 @@ BlockManager &PartialBlockManager::GetBlockManager() const { return block_manager; } -void PartialBlockManager::Rollback() { +void PartialBlockManager::Rollback(const bool mark_modified) { ClearBlocks(); + if (mark_modified) { + for (auto &block_id : written_blocks) { + block_manager.MarkBlockAsFree(block_id); + } + } } } // namespace duckdb diff --git a/test/sql/storage/parallel/insert_order_preserving_odd_sized_batches.test_slow b/test/sql/storage/parallel/insert_order_preserving_odd_sized_batches.test_slow index 4a0dc3c9bd09..b87a2f377424 100644 --- a/test/sql/storage/parallel/insert_order_preserving_odd_sized_batches.test_slow +++ b/test/sql/storage/parallel/insert_order_preserving_odd_sized_batches.test_slow @@ -2,8 +2,6 @@ # description: Test parallel order-preserving insert # group: [parallel] -mode skip - # There are different numbers of distinct blocks for smaller block sizes, # because the segment size is bound by the block size. require block_size 262144 @@ -19,7 +17,7 @@ CREATE TABLE integers AS SELECT * FROM range(10000000) tbl(i); ---- 10000000 -# check the block count and median number of rows per row group +# Check the block count and median number of rows per row group. 
query I SELECT COUNT(DISTINCT block_id) < 4 FROM pragma_storage_info('integers'); ---- @@ -33,7 +31,6 @@ SELECT MEDIAN(count) FROM pragma_storage_info('integers'); statement ok COPY integers TO '__TEST_DIR__/integers.parquet' (ROW_GROUP_SIZE 77777) -# verify that reading while preserving insertion order creates the same size table statement ok CREATE TABLE integers_parquet AS FROM '__TEST_DIR__/integers.parquet'; @@ -61,11 +58,12 @@ SELECT COUNT(DISTINCT block_id) < 4 FROM pragma_storage_info('integers_parquet') true query I -SELECT MEDIAN(count)>100000 FROM pragma_storage_info('integers_parquet'); +SELECT MEDIAN(count) > 100000 FROM pragma_storage_info('integers_parquet'); ---- true -# verify that reading without preserving insertion order creates the same size table +# FIXME: does this even make sense? +# Verify that reading without preserving insertion order creates a same size table. statement ok SET preserve_insertion_order=false @@ -73,11 +71,11 @@ statement ok CREATE TABLE integers_parquet_no_order AS FROM '__TEST_DIR__/integers.parquet' query I -SELECT COUNT(DISTINCT block_id) < 4 FROM pragma_storage_info('integers_parquet_no_order'); +SELECT COUNT(DISTINCT block_id) < 12 FROM pragma_storage_info('integers_parquet_no_order'); ---- true query I -SELECT MEDIAN(count)>100000 FROM pragma_storage_info('integers_parquet_no_order'); +SELECT MEDIAN(count) > 100000 FROM pragma_storage_info('integers_parquet_no_order'); ---- true diff --git a/test/sql/storage/parallel/reclaim_space_primary_key_optimistic.test_slow b/test/sql/storage/parallel/reclaim_space_primary_key_optimistic.test_slow index 4d08b4a8f6ec..6865f4d0a75d 100644 --- a/test/sql/storage/parallel/reclaim_space_primary_key_optimistic.test_slow +++ b/test/sql/storage/parallel/reclaim_space_primary_key_optimistic.test_slow @@ -2,8 +2,6 @@ # description: Test space reclamation of optimistic writing with a PK constraint violation. 
# group: [parallel] -mode skip - load __TEST_DIR__/reclaim_space_primary_key.db statement ok @@ -23,8 +21,20 @@ INSERT INTO integers2 SELECT * FROM integers; ---- :Constraint Error.*violates primary key constraint.* +# For smaller block sizes (16KB) the total blocks increase (to twice the original amount) in the first +# iteration, and then stay constant. + statement ok -CREATE TABLE block_count (count INT); +CREATE TABLE total_blocks_tbl AS SELECT total_blocks FROM pragma_database_size(); + +statement ok +CREATE TYPE test_result AS UNION ( + ok BOOL, + err STRUCT( + old BIGINT, + allowed_max DECIMAL(21,1), + actual BIGINT) +); loop i 0 10 @@ -58,19 +68,34 @@ INSERT INTO integers2 VALUES (10000000 + ${i}); statement ok CHECKPOINT; -statement ok -INSERT INTO block_count SELECT total_blocks FROM pragma_database_size(); - query I SELECT COUNT(*) - ${i} FROM integers2; ---- 2 -# Ensure there is only a small difference between the MIN and MAX block counts. +# Ensure that the total blocks don't exceed the total blocks after the first iteration by more than 1.2. query I -SELECT (MAX(count) - MIN(count)) < 20 FROM block_count; +SELECT + CASE WHEN ${i} = 0 THEN True::test_result + WHEN current.total_blocks <= total_blocks_tbl.total_blocks * 1.4 THEN True::test_result + ELSE { + 'old': total_blocks_tbl.total_blocks, + 'allowed_max': total_blocks_tbl.total_blocks * 1.4, + 'actual': current.total_blocks + }::test_result + END +FROM pragma_database_size() AS current, total_blocks_tbl; ---- true +# Adjust the total_blocks_tbl once to the count after the first iteration. 
+ +statement ok +UPDATE total_blocks_tbl SET total_blocks = ( + SELECT + CASE WHEN ${i} = 0 THEN (SELECT current.total_blocks FROM pragma_database_size() AS current) + ELSE (total_blocks)END + ); + endloop diff --git a/test/temp.test b/test/temp.test deleted file mode 100644 index 0314c52deb1c..000000000000 --- a/test/temp.test +++ /dev/null @@ -1,24 +0,0 @@ -# name: test/temp.test -# group: [test] - -load __TEST_DIR__/reclaim_space_primary_key.db - -statement ok -SET preserve_insertion_order=false; - -statement ok -CREATE TABLE integers AS SELECT * FROM range(10000000) t(i); - -statement ok -CREATE TABLE integers2 (i INTEGER PRIMARY KEY); - -statement ok -INSERT INTO integers2 VALUES (9999999); - -statement error -INSERT INTO integers2 SELECT * FROM integers; ----- -:Constraint Error.*violates primary key constraint.* - -statement ok -CREATE TABLE block_count (count INT); \ No newline at end of file From cbfdb4547c405d61b90ef0e9a7547bf0235836e3 Mon Sep 17 00:00:00 2001 From: taniabogatsch <44262898+taniabogatsch@users.noreply.github.com> Date: Wed, 22 Jan 2025 17:30:10 +0100 Subject: [PATCH 004/142] removing written blocks and moving the optimistic row group collections into the local table storage --- .../persistent/physical_batch_insert.cpp | 13 ++++++++++ .../operator/persistent/physical_insert.cpp | 18 ++++++++----- .../operator/persistent/physical_insert.hpp | 2 +- src/include/duckdb/storage/data_table.hpp | 4 ++- .../duckdb/storage/optimistic_data_writer.hpp | 2 +- .../duckdb/storage/partial_block_manager.hpp | 7 +----- .../duckdb/transaction/local_storage.hpp | 18 +++++++++---- .../write_overflow_strings_to_disk.cpp | 3 --- src/storage/compression/zstd.cpp | 4 --- src/storage/data_table.cpp | 6 +++++ src/storage/local_storage.cpp | 25 +++++++++++++------ src/storage/optimistic_data_writer.cpp | 5 ++-- src/storage/partial_block_manager.cpp | 21 +--------------- src/storage/table/column_checkpoint_state.cpp | 1 - 14 files changed, 72 insertions(+), 57 
deletions(-) diff --git a/src/execution/operator/persistent/physical_batch_insert.cpp b/src/execution/operator/persistent/physical_batch_insert.cpp index 2e546c477282..8e41e7244914 100644 --- a/src/execution/operator/persistent/physical_batch_insert.cpp +++ b/src/execution/operator/persistent/physical_batch_insert.cpp @@ -188,6 +188,19 @@ class BatchInsertLocalState : public LocalSinkState { optional_ptr writer; unique_ptr constraint_state; + // void CreateNewCollection(ClientContext &context, BatchInsertGlobalState &g_state, const vector + //&insert_types) { auto &data_table = g_state.table; auto table_info = data_table.GetStorage().GetDataTableInfo(); + // auto &io_manager = TableIOManager::Get(data_table.GetStorage()); + // + // // Create the local row group collection. + // auto max_row_id = NumericCast(MAX_ROW_ID); + // auto collection = make_uniq(std::move(table_info), io_manager, insert_types, + //max_row_id); collection->InitializeEmpty(); collection->InitializeAppend(current_append_state); + // + // lock_guard l(g_state.lock); + // auto &local_table_storage = data_table.GetStorage(); + // current_collection = data_table.CreateOptimisticRowGroups(context, std::move(collection)); + // } void CreateNewCollection(DuckTableEntry &table, const vector &insert_types) { auto table_info = table.GetStorage().GetDataTableInfo(); auto &io_manager = TableIOManager::Get(table.GetStorage()); diff --git a/src/execution/operator/persistent/physical_insert.cpp b/src/execution/operator/persistent/physical_insert.cpp index cb21cc26b91c..c7abdf25f9ca 100644 --- a/src/execution/operator/persistent/physical_insert.cpp +++ b/src/execution/operator/persistent/physical_insert.cpp @@ -685,14 +685,20 @@ SinkResultType PhysicalInsert::Sink(ExecutionContext &context, DataChunk &chunk, D_ASSERT(!return_chunk); // parallel append if (!lstate.local_collection) { - lock_guard l(gstate.lock); auto table_info = storage.GetDataTableInfo(); auto &io_manager = 
TableIOManager::Get(table.GetStorage()); - lstate.local_collection = make_uniq(std::move(table_info), io_manager, insert_types, - NumericCast(MAX_ROW_ID)); - lstate.local_collection->InitializeEmpty(); - lstate.local_collection->InitializeAppend(lstate.local_append_state); - lstate.writer = &gstate.table.GetStorage().CreateOptimisticWriter(context.client); + + // Create the local row group collection. + auto max_row_id = NumericCast(MAX_ROW_ID); + auto collection = + make_uniq(std::move(table_info), io_manager, insert_types, max_row_id); + collection->InitializeEmpty(); + collection->InitializeAppend(lstate.local_append_state); + + lock_guard l(gstate.lock); + auto &data_table = gstate.table.GetStorage(); + lstate.writer = data_table.CreateOptimisticWriter(context.client); + lstate.local_collection = data_table.CreateOptimisticRowGroups(context.client, std::move(collection)); } OnConflictHandling(table, context, lstate); D_ASSERT(action_type != OnConflictAction::UPDATE); diff --git a/src/include/duckdb/execution/operator/persistent/physical_insert.hpp b/src/include/duckdb/execution/operator/persistent/physical_insert.hpp index 04d202af24d9..acc36a7780c4 100644 --- a/src/include/duckdb/execution/operator/persistent/physical_insert.hpp +++ b/src/include/duckdb/execution/operator/persistent/physical_insert.hpp @@ -57,7 +57,7 @@ class InsertLocalState : public LocalSinkState { DataChunk update_chunk; ExpressionExecutor default_executor; TableAppendState local_append_state; - unique_ptr local_collection; + optional_ptr local_collection; optional_ptr writer; // Rows that have been updated by a DO UPDATE conflict unordered_set updated_rows; diff --git a/src/include/duckdb/storage/data_table.hpp b/src/include/duckdb/storage/data_table.hpp index 40f8145ddb8b..5496c29115e2 100644 --- a/src/include/duckdb/storage/data_table.hpp +++ b/src/include/duckdb/storage/data_table.hpp @@ -115,7 +115,9 @@ class DataTable { optional_ptr> column_ids); //! 
Merge a row group collection into the transaction-local storage void LocalMerge(ClientContext &context, RowGroupCollection &collection); - //! Creates an optimistic writer for this table - used for optimistically writing parallel appends + //! Create an optimistic row group collection for this table. Used for optimistically writing parallel appends. + RowGroupCollection &CreateOptimisticRowGroups(ClientContext &context, unique_ptr collection); + //! Create an optimistic writer for this table. Used for optimistically writing parallel appends. OptimisticDataWriter &CreateOptimisticWriter(ClientContext &context); void FinalizeOptimisticWriter(ClientContext &context, OptimisticDataWriter &writer); diff --git a/src/include/duckdb/storage/optimistic_data_writer.hpp b/src/include/duckdb/storage/optimistic_data_writer.hpp index c3d04e9c470f..802d51bad707 100644 --- a/src/include/duckdb/storage/optimistic_data_writer.hpp +++ b/src/include/duckdb/storage/optimistic_data_writer.hpp @@ -30,7 +30,7 @@ class OptimisticDataWriter { //! Merge the partially written blocks from one optimistic writer into another void Merge(OptimisticDataWriter &other); //! Rollback - void Rollback(const bool mark_modified); + void Rollback(); private: //! Prepare a write to disk diff --git a/src/include/duckdb/storage/partial_block_manager.hpp b/src/include/duckdb/storage/partial_block_manager.hpp index dbe6e7029664..c59869976a91 100644 --- a/src/include/duckdb/storage/partial_block_manager.hpp +++ b/src/include/duckdb/storage/partial_block_manager.hpp @@ -114,7 +114,7 @@ class PartialBlockManager { void ClearBlocks(); //! Rollback all data written by this partial block manager - void Rollback(const bool mark_modified); + void Rollback(); //! Merge this block manager into another one void Merge(PartialBlockManager &other); @@ -129,9 +129,6 @@ class PartialBlockManager { //! Returns a reference to the underlying block manager. BlockManager &GetBlockManager() const; - //! 
Registers a block as "written" by this partial block manager - void AddWrittenBlock(block_id_t block); - protected: BlockManager &block_manager; PartialBlockType partial_block_type; @@ -140,8 +137,6 @@ class PartialBlockManager { //! This is a multimap because there might be outstanding partial blocks with //! the same amount of left-over space multimap> partially_filled_blocks; - //! The set of written blocks - unordered_set written_blocks; //! The maximum size (in bytes) at which a partial block will be considered a partial block uint32_t max_partial_block_size; diff --git a/src/include/duckdb/transaction/local_storage.hpp b/src/include/duckdb/transaction/local_storage.hpp index 7516adeced72..3c202a56fde6 100644 --- a/src/include/duckdb/transaction/local_storage.hpp +++ b/src/include/duckdb/transaction/local_storage.hpp @@ -43,7 +43,7 @@ class LocalTableStorage : public enable_shared_from_this { reference table_ref; Allocator &allocator; - //! The main chunk collection holding the data + //! The main row group collection. shared_ptr row_groups; //! The set of unique append indexes. TableIndexList append_indexes; @@ -51,10 +51,14 @@ class LocalTableStorage : public enable_shared_from_this { TableIndexList delete_indexes; //! The number of deleted rows idx_t deleted_rows; - //! The main optimistic data writer + + //! The optimistic row group collections associated with this table. + vector> optimistic_row_groups; + //! The main optimistic data writer associated with this table. OptimisticDataWriter optimistic_writer; - //! The set of all optimistic data writers associated with this table + //! The optimistic data writers associated with this table. vector> optimistic_writers; + //! Whether or not storage was merged bool merged_storage = false; //! 
Whether or not the storage was dropped @@ -73,7 +77,9 @@ class LocalTableStorage : public enable_shared_from_this { const vector &table_types, row_t &start_row); void AppendToDeleteIndexes(Vector &row_ids, DataChunk &delete_chunk); - //! Creates an optimistic writer for this table + //! Create an optimistic row group collection for this table. + RowGroupCollection &CreateOptimisticRowGroups(unique_ptr collection); + //! Create an optimistic writer for this table. OptimisticDataWriter &CreateOptimisticWriter(); void FinalizeOptimisticWriter(OptimisticDataWriter &writer); }; @@ -129,7 +135,9 @@ class LocalStorage { static void FinalizeAppend(LocalAppendState &state); //! Merge a row group collection into the transaction-local storage void LocalMerge(DataTable &table, RowGroupCollection &collection); - //! Create an optimistic writer for the specified table + //! Create an optimistic row group collection for this table. + RowGroupCollection &CreateOptimisticRowGroups(DataTable &table, unique_ptr collection); + //! Create an optimistic writer for this table. 
OptimisticDataWriter &CreateOptimisticWriter(DataTable &table); void FinalizeOptimisticWriter(DataTable &table, OptimisticDataWriter &writer); diff --git a/src/storage/checkpoint/write_overflow_strings_to_disk.cpp b/src/storage/checkpoint/write_overflow_strings_to_disk.cpp index c58be310271c..37492f787cbe 100644 --- a/src/storage/checkpoint/write_overflow_strings_to_disk.cpp +++ b/src/storage/checkpoint/write_overflow_strings_to_disk.cpp @@ -87,9 +87,6 @@ void WriteOverflowStringsToDisk::Flush() { // write to disk auto &block_manager = partial_block_manager.GetBlockManager(); block_manager.Write(handle.GetFileBuffer(), block_id); - - auto lock = partial_block_manager.GetLock(); - partial_block_manager.AddWrittenBlock(block_id); } block_id = INVALID_BLOCK; offset = 0; diff --git a/src/storage/compression/zstd.cpp b/src/storage/compression/zstd.cpp index fca90c18099c..b3cac8107343 100644 --- a/src/storage/compression/zstd.cpp +++ b/src/storage/compression/zstd.cpp @@ -474,10 +474,6 @@ class ZSTDCompressionState : public CompressionState { // Write the current page to disk auto &block_manager = partial_block_manager.GetBlockManager(); block_manager.Write(buffer.GetFileBuffer(), block_id); - { - auto lock = partial_block_manager.GetLock(); - partial_block_manager.AddWrittenBlock(block_id); - } } void FlushVector() { diff --git a/src/storage/data_table.cpp b/src/storage/data_table.cpp index 384b34330af3..9a2bc0fa988d 100644 --- a/src/storage/data_table.cpp +++ b/src/storage/data_table.cpp @@ -848,6 +848,12 @@ void DataTable::FinalizeLocalAppend(LocalAppendState &state) { LocalStorage::FinalizeAppend(state); } +RowGroupCollection &DataTable::CreateOptimisticRowGroups(ClientContext &context, + unique_ptr collection) { + auto &local_storage = LocalStorage::Get(context, db); + return local_storage.CreateOptimisticRowGroups(*this, std::move(collection)); +} + OptimisticDataWriter &DataTable::CreateOptimisticWriter(ClientContext &context) { auto &local_storage = 
LocalStorage::Get(context, db); return local_storage.CreateOptimisticWriter(*this); diff --git a/src/storage/local_storage.cpp b/src/storage/local_storage.cpp index 9764f878136d..ea88d506d41e 100644 --- a/src/storage/local_storage.cpp +++ b/src/storage/local_storage.cpp @@ -55,6 +55,7 @@ LocalTableStorage::LocalTableStorage(ClientContext &context, DataTable &new_data const idx_t alter_column_index, const LogicalType &target_type, const vector &bound_columns, Expression &cast_expr) : table_ref(new_data_table), allocator(Allocator::Get(new_data_table.db)), deleted_rows(parent.deleted_rows), + optimistic_row_groups(std::move(parent.optimistic_row_groups)), optimistic_writer(new_data_table, parent.optimistic_writer), optimistic_writers(std::move(parent.optimistic_writers)), merged_storage(parent.merged_storage) { @@ -69,6 +70,7 @@ LocalTableStorage::LocalTableStorage(ClientContext &context, DataTable &new_data LocalTableStorage::LocalTableStorage(DataTable &new_data_table, LocalTableStorage &parent, const idx_t drop_column_index) : table_ref(new_data_table), allocator(Allocator::Get(new_data_table.db)), deleted_rows(parent.deleted_rows), + optimistic_row_groups(std::move(parent.optimistic_row_groups)), optimistic_writer(new_data_table, parent.optimistic_writer), optimistic_writers(std::move(parent.optimistic_writers)), merged_storage(parent.merged_storage) { @@ -83,8 +85,10 @@ LocalTableStorage::LocalTableStorage(DataTable &new_data_table, LocalTableStorag LocalTableStorage::LocalTableStorage(ClientContext &context, DataTable &new_dt, LocalTableStorage &parent, ColumnDefinition &new_column, ExpressionExecutor &default_executor) : table_ref(new_dt), allocator(Allocator::Get(new_dt.db)), deleted_rows(parent.deleted_rows), + optimistic_row_groups(std::move(parent.optimistic_row_groups)), optimistic_writer(new_dt, parent.optimistic_writer), optimistic_writers(std::move(parent.optimistic_writers)), merged_storage(parent.merged_storage) { + row_groups = 
parent.row_groups->AddColumn(context, new_column, default_executor); parent.row_groups.reset(); append_indexes.Move(parent.append_indexes); @@ -229,6 +233,11 @@ void LocalTableStorage::AppendToIndexes(DuckTransaction &transaction, TableAppen } } +RowGroupCollection &LocalTableStorage::CreateOptimisticRowGroups(unique_ptr collection) { + optimistic_row_groups.push_back(std::move(collection)); + return *optimistic_row_groups.back(); +} + OptimisticDataWriter &LocalTableStorage::CreateOptimisticWriter() { auto writer = make_uniq(table_ref.get()); optimistic_writers.push_back(std::move(writer)); @@ -252,14 +261,10 @@ void LocalTableStorage::FinalizeOptimisticWriter(OptimisticDataWriter &writer) { } void LocalTableStorage::Rollback() { - for (auto &writer : optimistic_writers) { - writer->Rollback(true); + for (auto &collection : optimistic_row_groups) { + collection->CommitDropTable(); } - - // Drop any optimistically written local changes. - // The top-level writer writes to the row groups. 
- optimistic_writers.clear(); - optimistic_writer.Rollback(false); + optimistic_row_groups.clear(); row_groups->CommitDropTable(); } @@ -447,6 +452,12 @@ void LocalStorage::LocalMerge(DataTable &table, RowGroupCollection &collection) storage.merged_storage = true; } +RowGroupCollection &LocalStorage::CreateOptimisticRowGroups(DataTable &table, + unique_ptr collection) { + auto &storage = table_manager.GetOrCreateStorage(context, table); + return storage.CreateOptimisticRowGroups(std::move(collection)); +} + OptimisticDataWriter &LocalStorage::CreateOptimisticWriter(DataTable &table) { auto &storage = table_manager.GetOrCreateStorage(context, table); return storage.CreateOptimisticWriter(); diff --git a/src/storage/optimistic_data_writer.cpp b/src/storage/optimistic_data_writer.cpp index e8f5bfb70485..0a1966c500b4 100644 --- a/src/storage/optimistic_data_writer.cpp +++ b/src/storage/optimistic_data_writer.cpp @@ -80,12 +80,13 @@ void OptimisticDataWriter::Merge(OptimisticDataWriter &other) { void OptimisticDataWriter::FinalFlush() { if (partial_manager) { partial_manager->FlushPartialBlocks(); + partial_manager.reset(); } } -void OptimisticDataWriter::Rollback(const bool mark_modified) { +void OptimisticDataWriter::Rollback() { if (partial_manager) { - partial_manager->Rollback(mark_modified); + partial_manager->Rollback(); partial_manager.reset(); } } diff --git a/src/storage/partial_block_manager.cpp b/src/storage/partial_block_manager.cpp index 5b8d392ea8da..7c23df3da75b 100644 --- a/src/storage/partial_block_manager.cpp +++ b/src/storage/partial_block_manager.cpp @@ -133,7 +133,6 @@ void PartialBlockManager::RegisterPartialBlock(PartialBlockAllocation allocation // Flush any block that we're not going to reuse. 
if (block_to_free) { block_to_free->Flush(free_space); - AddWrittenBlock(block_to_free->state.block_id); } } @@ -162,21 +161,9 @@ void PartialBlockManager::Merge(PartialBlockManager &other) { partially_filled_blocks.insert(make_pair(e.first, std::move(e.second))); } } - // copy over the written blocks - for (auto &block_id : other.written_blocks) { - AddWrittenBlock(block_id); - } - other.written_blocks.clear(); other.partially_filled_blocks.clear(); } -void PartialBlockManager::AddWrittenBlock(block_id_t block) { - auto entry = written_blocks.insert(block); - if (!entry.second) { - throw InternalException("Written block already exists"); - } -} - void PartialBlockManager::ClearBlocks() { for (auto &e : partially_filled_blocks) { e.second->Clear(); @@ -187,7 +174,6 @@ void PartialBlockManager::ClearBlocks() { void PartialBlockManager::FlushPartialBlocks() { for (auto &e : partially_filled_blocks) { e.second->Flush(e.first); - written_blocks.insert(e.second->state.block_id); } partially_filled_blocks.clear(); } @@ -196,13 +182,8 @@ BlockManager &PartialBlockManager::GetBlockManager() const { return block_manager; } -void PartialBlockManager::Rollback(const bool mark_modified) { +void PartialBlockManager::Rollback() { ClearBlocks(); - if (mark_modified) { - for (auto &block_id : written_blocks) { - block_manager.MarkBlockAsFree(block_id); - } - } } } // namespace duckdb diff --git a/src/storage/table/column_checkpoint_state.cpp b/src/storage/table/column_checkpoint_state.cpp index a67daa060b76..d2fce922af48 100644 --- a/src/storage/table/column_checkpoint_state.cpp +++ b/src/storage/table/column_checkpoint_state.cpp @@ -71,7 +71,6 @@ void PartialBlockForCheckpoint::Flush(const idx_t free_space_left) { } } } - Clear(); } From 6a807b88c710ba03f405752169f22e05727cc8d0 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 23 Jan 2025 16:14:17 +0100 Subject: [PATCH 005/142] implemented the filter --- src/common/enum_util.cpp | 14 ++-- src/common/enums/metric_type.cpp | 9 ++- 
src/common/enums/optimizer_type.cpp | 1 + .../duckdb/common/enums/metric_type.hpp | 1 + .../duckdb/common/enums/optimizer_type.hpp | 3 +- .../optimizer/remove_useless_projections.hpp | 29 +++++++++ src/optimizer/CMakeLists.txt | 1 + src/optimizer/optimizer.cpp | 8 +++ src/optimizer/remove_useless_projections.cpp | 64 +++++++++++++++++++ test/optimizer/pullup_filters.test | 10 +-- 10 files changed, 126 insertions(+), 14 deletions(-) create mode 100644 src/include/duckdb/optimizer/remove_useless_projections.hpp create mode 100644 src/optimizer/remove_useless_projections.cpp diff --git a/src/common/enum_util.cpp b/src/common/enum_util.cpp index 0ac5bb0c4e7f..8da8057edeb1 100644 --- a/src/common/enum_util.cpp +++ b/src/common/enum_util.cpp @@ -2401,19 +2401,20 @@ const StringUtil::EnumStringLiteral *GetMetricsTypeValues() { { static_cast(MetricsType::OPTIMIZER_EXTENSION), "OPTIMIZER_EXTENSION" }, { static_cast(MetricsType::OPTIMIZER_MATERIALIZED_CTE), "OPTIMIZER_MATERIALIZED_CTE" }, { static_cast(MetricsType::OPTIMIZER_SUM_REWRITER), "OPTIMIZER_SUM_REWRITER" }, - { static_cast(MetricsType::OPTIMIZER_LATE_MATERIALIZATION), "OPTIMIZER_LATE_MATERIALIZATION" } + { static_cast(MetricsType::OPTIMIZER_LATE_MATERIALIZATION), "OPTIMIZER_LATE_MATERIALIZATION" }, + { static_cast(MetricsType::OPTIMIZER_REMOVE_USELESS_PROJECTIONS), "OPTIMIZER_REMOVE_USELESS_PROJECTIONS" } }; return values; } template<> const char* EnumUtil::ToChars(MetricsType value) { - return StringUtil::EnumToString(GetMetricsTypeValues(), 49, "MetricsType", static_cast(value)); + return StringUtil::EnumToString(GetMetricsTypeValues(), 50, "MetricsType", static_cast(value)); } template<> MetricsType EnumUtil::FromString(const char *value) { - return static_cast(StringUtil::StringToEnum(GetMetricsTypeValues(), 49, "MetricsType", value)); + return static_cast(StringUtil::StringToEnum(GetMetricsTypeValues(), 50, "MetricsType", value)); } const StringUtil::EnumStringLiteral 
*GetMultiFileReaderColumnMappingModeValues() { @@ -2605,19 +2606,20 @@ const StringUtil::EnumStringLiteral *GetOptimizerTypeValues() { { static_cast(OptimizerType::EXTENSION), "EXTENSION" }, { static_cast(OptimizerType::MATERIALIZED_CTE), "MATERIALIZED_CTE" }, { static_cast(OptimizerType::SUM_REWRITER), "SUM_REWRITER" }, - { static_cast(OptimizerType::LATE_MATERIALIZATION), "LATE_MATERIALIZATION" } + { static_cast(OptimizerType::LATE_MATERIALIZATION), "LATE_MATERIALIZATION" }, + { static_cast(OptimizerType::REMOVE_USELESS_PROJECTIONS), "REMOVE_USELESS_PROJECTIONS" } }; return values; } template<> const char* EnumUtil::ToChars(OptimizerType value) { - return StringUtil::EnumToString(GetOptimizerTypeValues(), 28, "OptimizerType", static_cast(value)); + return StringUtil::EnumToString(GetOptimizerTypeValues(), 29, "OptimizerType", static_cast(value)); } template<> OptimizerType EnumUtil::FromString(const char *value) { - return static_cast(StringUtil::StringToEnum(GetOptimizerTypeValues(), 28, "OptimizerType", value)); + return static_cast(StringUtil::StringToEnum(GetOptimizerTypeValues(), 29, "OptimizerType", value)); } const StringUtil::EnumStringLiteral *GetOrderByNullTypeValues() { diff --git a/src/common/enums/metric_type.cpp b/src/common/enums/metric_type.cpp index d97579c23f2a..0477f2a4f672 100644 --- a/src/common/enums/metric_type.cpp +++ b/src/common/enums/metric_type.cpp @@ -66,6 +66,8 @@ MetricsType MetricsUtils::GetOptimizerMetricByType(OptimizerType type) { return MetricsType::OPTIMIZER_FILTER_PUSHDOWN; case OptimizerType::EMPTY_RESULT_PULLUP: return MetricsType::OPTIMIZER_EMPTY_RESULT_PULLUP; + case OptimizerType::REMOVE_USELESS_PROJECTIONS: + return MetricsType::OPTIMIZER_REMOVE_USELESS_PROJECTIONS; case OptimizerType::CTE_FILTER_PUSHER: return MetricsType::OPTIMIZER_CTE_FILTER_PUSHER; case OptimizerType::REGEX_RANGE: @@ -153,6 +155,8 @@ OptimizerType MetricsUtils::GetOptimizerTypeByMetric(MetricsType type) { return OptimizerType::BUILD_SIDE_PROBE_SIDE; 
case MetricsType::OPTIMIZER_LIMIT_PUSHDOWN: return OptimizerType::LIMIT_PUSHDOWN; + case MetricsType::OPTIMIZER_REMOVE_USELESS_PROJECTIONS: + return OptimizerType::REMOVE_USELESS_PROJECTIONS; case MetricsType::OPTIMIZER_TOP_N: return OptimizerType::TOP_N; case MetricsType::OPTIMIZER_COMPRESSED_MATERIALIZATION: @@ -170,9 +174,9 @@ OptimizerType MetricsUtils::GetOptimizerTypeByMetric(MetricsType type) { case MetricsType::OPTIMIZER_MATERIALIZED_CTE: return OptimizerType::MATERIALIZED_CTE; case MetricsType::OPTIMIZER_SUM_REWRITER: - return OptimizerType::SUM_REWRITER; + return OptimizerType::SUM_REWRITER; case MetricsType::OPTIMIZER_LATE_MATERIALIZATION: - return OptimizerType::LATE_MATERIALIZATION; + return OptimizerType::LATE_MATERIALIZATION; default: return OptimizerType::INVALID; }; @@ -206,6 +210,7 @@ bool MetricsUtils::IsOptimizerMetric(MetricsType type) { case MetricsType::OPTIMIZER_EXTENSION: case MetricsType::OPTIMIZER_MATERIALIZED_CTE: case MetricsType::OPTIMIZER_SUM_REWRITER: + case MetricsType::OPTIMIZER_REMOVE_USELESS_PROJECTIONS: case MetricsType::OPTIMIZER_LATE_MATERIALIZATION: return true; default: diff --git a/src/common/enums/optimizer_type.cpp b/src/common/enums/optimizer_type.cpp index f4d02d68a3b5..fd307260d06d 100644 --- a/src/common/enums/optimizer_type.cpp +++ b/src/common/enums/optimizer_type.cpp @@ -29,6 +29,7 @@ static const DefaultOptimizerType internal_optimizer_types[] = { {"column_lifetime", OptimizerType::COLUMN_LIFETIME}, {"limit_pushdown", OptimizerType::LIMIT_PUSHDOWN}, {"top_n", OptimizerType::TOP_N}, + {"remove_useless_projections", OptimizerType::REMOVE_USELESS_PROJECTIONS}, {"build_side_probe_side", OptimizerType::BUILD_SIDE_PROBE_SIDE}, {"compressed_materialization", OptimizerType::COMPRESSED_MATERIALIZATION}, {"duplicate_groups", OptimizerType::DUPLICATE_GROUPS}, diff --git a/src/include/duckdb/common/enums/metric_type.hpp b/src/include/duckdb/common/enums/metric_type.hpp index 14389bf4a5f9..bd938779900d 100644 --- 
a/src/include/duckdb/common/enums/metric_type.hpp +++ b/src/include/duckdb/common/enums/metric_type.hpp @@ -69,6 +69,7 @@ enum class MetricsType : uint8_t { OPTIMIZER_MATERIALIZED_CTE, OPTIMIZER_SUM_REWRITER, OPTIMIZER_LATE_MATERIALIZATION, + OPTIMIZER_REMOVE_USELESS_PROJECTIONS, }; struct MetricsTypeHashFunction { diff --git a/src/include/duckdb/common/enums/optimizer_type.hpp b/src/include/duckdb/common/enums/optimizer_type.hpp index adabacec225d..e9209d56ba1b 100644 --- a/src/include/duckdb/common/enums/optimizer_type.hpp +++ b/src/include/duckdb/common/enums/optimizer_type.hpp @@ -41,7 +41,8 @@ enum class OptimizerType : uint32_t { EXTENSION, MATERIALIZED_CTE, SUM_REWRITER, - LATE_MATERIALIZATION + LATE_MATERIALIZATION, + REMOVE_USELESS_PROJECTIONS }; string OptimizerTypeToString(OptimizerType type); diff --git a/src/include/duckdb/optimizer/remove_useless_projections.hpp b/src/include/duckdb/optimizer/remove_useless_projections.hpp new file mode 100644 index 000000000000..46665b74ee73 --- /dev/null +++ b/src/include/duckdb/optimizer/remove_useless_projections.hpp @@ -0,0 +1,29 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// duckdb/optimizer/remove_useless_projections.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "duckdb/optimizer/column_binding_replacer.hpp" + +namespace duckdb { + +//! 
The RemoveUselessProjections Optimizer traverses the logical operator tree and removes all projections that just +class RemoveUselessProjections : LogicalOperatorVisitor { +public: + RemoveUselessProjections() { + } + unique_ptr RemoveProjections(unique_ptr plan); + unique_ptr RemoveProjectionsChildren(unique_ptr plan); + void ReplaceBindings(LogicalOperator &plan); + +private: + bool first_projection; + ColumnBindingReplacer replacer; +}; + +} // namespace duckdb diff --git a/src/optimizer/CMakeLists.txt b/src/optimizer/CMakeLists.txt index a7b881b09925..bed29e1866d8 100644 --- a/src/optimizer/CMakeLists.txt +++ b/src/optimizer/CMakeLists.txt @@ -30,6 +30,7 @@ add_library_unity( regex_range_filter.cpp remove_duplicate_groups.cpp remove_unused_columns.cpp + remove_useless_projections.cpp statistics_propagator.cpp limit_pushdown.cpp topn_optimizer.cpp diff --git a/src/optimizer/optimizer.cpp b/src/optimizer/optimizer.cpp index 8ac4cdd87da8..d07d4271bd87 100644 --- a/src/optimizer/optimizer.cpp +++ b/src/optimizer/optimizer.cpp @@ -12,6 +12,7 @@ #include "duckdb/optimizer/cte_filter_pusher.hpp" #include "duckdb/optimizer/deliminator.hpp" #include "duckdb/optimizer/empty_result_pullup.hpp" +#include "duckdb/optimizer/remove_useless_projections.hpp" #include "duckdb/optimizer/expression_heuristics.hpp" #include "duckdb/optimizer/filter_pullup.hpp" #include "duckdb/optimizer/filter_pushdown.hpp" @@ -166,6 +167,13 @@ void Optimizer::RunBuiltInOptimizers() { plan = empty_result_pullup.Optimize(std::move(plan)); }); + // Removes Unnecessary Projections + RunOptimizer(OptimizerType::REMOVE_USELESS_PROJECTIONS, [&]() { + RemoveUselessProjections remover; + plan = remover.RemoveProjections(std::move(plan)); + remover.ReplaceBindings(*plan); + }); + // then we perform the join ordering optimization // this also rewrites cross products + filters into joins and performs filter pushdowns RunOptimizer(OptimizerType::JOIN_ORDER, [&]() { diff --git 
a/src/optimizer/remove_useless_projections.cpp b/src/optimizer/remove_useless_projections.cpp new file mode 100644 index 000000000000..74ae42b30c87 --- /dev/null +++ b/src/optimizer/remove_useless_projections.cpp @@ -0,0 +1,64 @@ +#include "duckdb/optimizer/remove_useless_projections.hpp" +#include "duckdb/common/enums/logical_operator_type.hpp" + +namespace duckdb { + +unique_ptr RemoveUselessProjections::RemoveProjectionsChildren(unique_ptr op) { + for (idx_t i = 0; i < op->children.size(); i++) { + op->children[i] = RemoveProjections(std::move(op->children[i])); + } + return op; +} + +unique_ptr RemoveUselessProjections::RemoveProjections(unique_ptr op) { + if (op->type == LogicalOperatorType::LOGICAL_UNION || op->type == LogicalOperatorType::LOGICAL_EXCEPT || + op->type == LogicalOperatorType::LOGICAL_INTERSECT || op->type == LogicalOperatorType::LOGICAL_RECURSIVE_CTE || + op->type == LogicalOperatorType::LOGICAL_MATERIALIZED_CTE) { + // guaranteed to find a projection under this that is meant to keep the column order in the presence of + // an optimization done by build side probe side. + for (idx_t i = 0; i < op->children.size(); i++) { + first_projection = true; + op->children[i] = RemoveProjections(std::move(op->children[i])); + } + return op; + } + if (op->type != LogicalOperatorType::LOGICAL_PROJECTION) { + return RemoveProjectionsChildren(std::move(op)); + } + // operator is a projection. 
Remove if possible + if (first_projection) { + first_projection = false; + return RemoveProjectionsChildren(std::move(op)); + } + auto &proj = op->Cast(); + auto child_bindings = op->children[0]->GetColumnBindings(); + if (proj.GetColumnBindings().size() != child_bindings.size()) { + return op; + } + idx_t binding_index = 0; + for (auto &expr : proj.expressions) { + if (expr->type != ExpressionType::BOUND_COLUMN_REF) { + return op; + } + auto &bound_ref = expr->Cast(); + if (bound_ref.binding != child_bindings[binding_index]) { + return op; + } + binding_index++; + } + D_ASSERT(binding_index == op->GetColumnBindings().size()); + // we have a projection where every expression is a bound column ref, and they are in the same order as the + // bindings of the child. We can remove this projection + binding_index = 0; + for (auto &binding : op->GetColumnBindings()) { + replacer.replacement_bindings.push_back(ReplacementBinding(binding, child_bindings[binding_index])); + binding_index++; + } + return RemoveProjectionsChildren(std::move(op->children[0])); +} + +void RemoveUselessProjections::ReplaceBindings(LogicalOperator &op) { + replacer.VisitOperator(op); +} + +} // namespace duckdb diff --git a/test/optimizer/pullup_filters.test b/test/optimizer/pullup_filters.test index 62a76de2335e..87bd01e99daf 100644 --- a/test/optimizer/pullup_filters.test +++ b/test/optimizer/pullup_filters.test @@ -6,13 +6,13 @@ statement ok PRAGMA explain_output = 'PHYSICAL_ONLY' statement ok -CREATE TABLE vals1 AS SELECT i AS i, i AS j FROM range(0, 11, 1) t1(i) +CREATE TABLE vals1 AS SELECT i AS i, i AS j FROM range(0, 11, 1) t1(i); statement ok -CREATE TABLE vals2(k BIGINT, l BIGINT) +CREATE TABLE vals2(k BIGINT, l BIGINT); statement ok -INSERT INTO vals2 SELECT * FROM vals1 +INSERT INTO vals2 SELECT * FROM vals1; ## INNER JOIN: pull up a single filter in cross product from LHS query II @@ -30,13 +30,13 @@ physical_plan :.*=5.*=5.* query II EXPLAIN SELECT * FROM (SELECT * FROM vals1, vals2 
WHERE i=5 AND k=3) tbl1, (SELECT * FROM vals1, vals2) tbl2 WHERE tbl1.i=tbl2.i AND tbl1.k=tbl2.k ---- -physical_plan :(.*=5.*=3.*=5.*=3.*|.*=3.*=5.*=3.*=5.*) +physical_plan :(.*=5.*=5.*=3.*=3.*|.*=3.*=3.*=5.*=5.*) ## INNER JOIN: pull up two filters in cross product from RHS query II EXPLAIN SELECT * FROM (SELECT * FROM vals1, vals2) tbl1, (SELECT * FROM vals1, vals2 WHERE i=5 AND k=3) tbl2 WHERE tbl1.i=tbl2.i AND tbl1.k=tbl2.k ---- -physical_plan :(.*=5.*=3.*=5.*=3.*|.*=3.*=5.*=3.*=5.*) +physical_plan :(.*=5.*=5.*=3.*=3.*|.*=3.*=3.*=5.*=5.*) #### LEFT JOIN: pull up a single filter from LHS #### query II From b33572693e23ca00adf34847271572e6b06ca50c Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 23 Jan 2025 16:51:02 +0100 Subject: [PATCH 006/142] added one test --- .../optimizer/remove_useless_projections.hpp | 2 +- .../remove_unnecessary_projections.test | 23 +++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 test/optimizer/remove_unnecessary_projections.test diff --git a/src/include/duckdb/optimizer/remove_useless_projections.hpp b/src/include/duckdb/optimizer/remove_useless_projections.hpp index 46665b74ee73..595d090630fb 100644 --- a/src/include/duckdb/optimizer/remove_useless_projections.hpp +++ b/src/include/duckdb/optimizer/remove_useless_projections.hpp @@ -15,7 +15,7 @@ namespace duckdb { //! 
The RemoveUselessProjections Optimizer traverses the logical operator tree and removes all projections that just class RemoveUselessProjections : LogicalOperatorVisitor { public: - RemoveUselessProjections() { + RemoveUselessProjections() : first_projection(true) { } unique_ptr RemoveProjections(unique_ptr plan); unique_ptr RemoveProjectionsChildren(unique_ptr plan); diff --git a/test/optimizer/remove_unnecessary_projections.test b/test/optimizer/remove_unnecessary_projections.test new file mode 100644 index 000000000000..2bcab73648ba --- /dev/null +++ b/test/optimizer/remove_unnecessary_projections.test @@ -0,0 +1,23 @@ +# name: test/optimizer/remove_unnecessary_projections.test +# description: Test regex to like Optimization Rules +# group: [optimizer] + +statement ok +pragma disabled_optimizers='statistics_propagation,column_lifetime'; + +statement ok +create table t1 as select range%50 a from range(10000); + +statement ok +create table t2 as select range b from range(100); + +statement ok +create table t3 as select range c from range(10000); + +statement ok +create table t4 as select range d from range(400); + +query II +explain select * from (select * from t1, t2 where a = b) t_left, (select * from t3, t4 where c = d) t_right where a = d; +---- +physical_plan :.*PROJECTION.*PROJECTION.* From 63b53f0b920bfc4c6f40d8bfda5211d85c26a4f2 Mon Sep 17 00:00:00 2001 From: taniabogatsch <44262898+taniabogatsch@users.noreply.github.com> Date: Fri, 24 Jan 2025 17:06:24 +0100 Subject: [PATCH 007/142] some refactoring to prepare for batch insert --- .../persistent/physical_batch_insert.cpp | 13 ---- .../operator/persistent/physical_insert.cpp | 68 ++++++++++--------- .../operator/persistent/physical_insert.hpp | 2 +- src/include/duckdb/storage/data_table.hpp | 5 +- .../duckdb/storage/optimistic_data_writer.hpp | 2 +- .../duckdb/transaction/local_storage.hpp | 16 ++++- src/storage/data_table.cpp | 10 ++- src/storage/local_storage.cpp | 39 ++++++++--- 
.../optimistic_write_update.test | 1 - 9 files changed, 90 insertions(+), 66 deletions(-) diff --git a/src/execution/operator/persistent/physical_batch_insert.cpp b/src/execution/operator/persistent/physical_batch_insert.cpp index 8e41e7244914..2e546c477282 100644 --- a/src/execution/operator/persistent/physical_batch_insert.cpp +++ b/src/execution/operator/persistent/physical_batch_insert.cpp @@ -188,19 +188,6 @@ class BatchInsertLocalState : public LocalSinkState { optional_ptr writer; unique_ptr constraint_state; - // void CreateNewCollection(ClientContext &context, BatchInsertGlobalState &g_state, const vector - //&insert_types) { auto &data_table = g_state.table; auto table_info = data_table.GetStorage().GetDataTableInfo(); - // auto &io_manager = TableIOManager::Get(data_table.GetStorage()); - // - // // Create the local row group collection. - // auto max_row_id = NumericCast(MAX_ROW_ID); - // auto collection = make_uniq(std::move(table_info), io_manager, insert_types, - //max_row_id); collection->InitializeEmpty(); collection->InitializeAppend(current_append_state); - // - // lock_guard l(g_state.lock); - // auto &local_table_storage = data_table.GetStorage(); - // current_collection = data_table.CreateOptimisticRowGroups(context, std::move(collection)); - // } void CreateNewCollection(DuckTableEntry &table, const vector &insert_types) { auto table_info = table.GetStorage().GetDataTableInfo(); auto &io_manager = TableIOManager::Get(table.GetStorage()); diff --git a/src/execution/operator/persistent/physical_insert.cpp b/src/execution/operator/persistent/physical_insert.cpp index c7abdf25f9ca..3594ae15e7ff 100644 --- a/src/execution/operator/persistent/physical_insert.cpp +++ b/src/execution/operator/persistent/physical_insert.cpp @@ -83,7 +83,8 @@ InsertGlobalState::InsertGlobalState(ClientContext &context, const vector &types_p, const vector> &bound_defaults, const vector> &bound_constraints) - : default_executor(context, bound_defaults), 
bound_constraints(bound_constraints) { + : default_executor(context, bound_defaults), collection_index(DConstants::INVALID_INDEX), + bound_constraints(bound_constraints) { auto &allocator = Allocator::Get(context); @@ -681,34 +682,35 @@ SinkResultType PhysicalInsert::Sink(ExecutionContext &context, DataChunk &chunk, // All of the tuples should have been turned into an update, leaving the chunk empty afterwards D_ASSERT(lstate.update_chunk.size() == 0); } - } else { - D_ASSERT(!return_chunk); - // parallel append - if (!lstate.local_collection) { - auto table_info = storage.GetDataTableInfo(); - auto &io_manager = TableIOManager::Get(table.GetStorage()); - - // Create the local row group collection. - auto max_row_id = NumericCast(MAX_ROW_ID); - auto collection = - make_uniq(std::move(table_info), io_manager, insert_types, max_row_id); - collection->InitializeEmpty(); - collection->InitializeAppend(lstate.local_append_state); - - lock_guard l(gstate.lock); - auto &data_table = gstate.table.GetStorage(); - lstate.writer = data_table.CreateOptimisticWriter(context.client); - lstate.local_collection = data_table.CreateOptimisticRowGroups(context.client, std::move(collection)); - } - OnConflictHandling(table, context, lstate); - D_ASSERT(action_type != OnConflictAction::UPDATE); + return SinkResultType::NEED_MORE_INPUT; + } - auto new_row_group = lstate.local_collection->Append(lstate.insert_chunk, lstate.local_append_state); - if (new_row_group) { - lstate.writer->WriteNewRowGroup(*lstate.local_collection); - } + // parallel append + D_ASSERT(!return_chunk); + auto &data_table = gstate.table.GetStorage(); + if (!lstate.collection_index.IsValid()) { + auto table_info = storage.GetDataTableInfo(); + auto &io_manager = TableIOManager::Get(table.GetStorage()); + + // Create the local row group collection. 
+ auto max_row_id = NumericCast(MAX_ROW_ID); + auto collection = make_uniq(std::move(table_info), io_manager, insert_types, max_row_id); + collection->InitializeEmpty(); + collection->InitializeAppend(lstate.local_append_state); + + lock_guard l(gstate.lock); + lstate.writer = data_table.CreateOptimisticWriter(context.client); + lstate.collection_index = data_table.CreateOptimisticCollection(context.client, std::move(collection)); } + OnConflictHandling(table, context, lstate); + D_ASSERT(action_type != OnConflictAction::UPDATE); + + auto &collection = data_table.GetOptimisticCollection(context.client, lstate.collection_index); + auto new_row_group = collection.Append(lstate.insert_chunk, lstate.local_append_state); + if (new_row_group) { + lstate.writer->WriteNewRowGroup(collection); + } return SinkResultType::NEED_MORE_INPUT; } @@ -719,7 +721,7 @@ SinkCombineResultType PhysicalInsert::Combine(ExecutionContext &context, Operato context.thread.profiler.Flush(*this); client_profiler.Flush(context.thread.profiler); - if (!parallel || !lstate.local_collection) { + if (!parallel || !lstate.collection_index.IsValid()) { return SinkCombineResultType::FINISHED; } @@ -729,9 +731,11 @@ SinkCombineResultType PhysicalInsert::Combine(ExecutionContext &context, Operato // parallel append: finalize the append TransactionData tdata(0, 0); - lstate.local_collection->FinalizeAppend(tdata, lstate.local_append_state); + auto &data_table = gstate.table.GetStorage(); + auto &collection = data_table.GetOptimisticCollection(context.client, lstate.collection_index); + collection.FinalizeAppend(tdata, lstate.local_append_state); - auto append_count = lstate.local_collection->GetTotalRows(); + auto append_count = collection.GetTotalRows(); lock_guard lock(gstate.lock); gstate.insert_count += append_count; @@ -739,16 +743,16 @@ SinkCombineResultType PhysicalInsert::Combine(ExecutionContext &context, Operato // we have few rows - append to the local storage directly 
storage.InitializeLocalAppend(gstate.append_state, table, context.client, bound_constraints); auto &transaction = DuckTransaction::Get(context.client, table.catalog); - lstate.local_collection->Scan(transaction, [&](DataChunk &insert_chunk) { + collection.Scan(transaction, [&](DataChunk &insert_chunk) { storage.LocalAppend(gstate.append_state, context.client, insert_chunk, false); return true; }); storage.FinalizeLocalAppend(gstate.append_state); } else { // we have written rows to disk optimistically - merge directly into the transaction-local storage - lstate.writer->WriteLastRowGroup(*lstate.local_collection); + lstate.writer->WriteLastRowGroup(collection); lstate.writer->FinalFlush(); - gstate.table.GetStorage().LocalMerge(context.client, *lstate.local_collection); + gstate.table.GetStorage().LocalMerge(context.client, collection); gstate.table.GetStorage().FinalizeOptimisticWriter(context.client, *lstate.writer); } diff --git a/src/include/duckdb/execution/operator/persistent/physical_insert.hpp b/src/include/duckdb/execution/operator/persistent/physical_insert.hpp index acc36a7780c4..f12084e39af9 100644 --- a/src/include/duckdb/execution/operator/persistent/physical_insert.hpp +++ b/src/include/duckdb/execution/operator/persistent/physical_insert.hpp @@ -57,7 +57,7 @@ class InsertLocalState : public LocalSinkState { DataChunk update_chunk; ExpressionExecutor default_executor; TableAppendState local_append_state; - optional_ptr local_collection; + PhysicalIndex collection_index; optional_ptr writer; // Rows that have been updated by a DO UPDATE conflict unordered_set updated_rows; diff --git a/src/include/duckdb/storage/data_table.hpp b/src/include/duckdb/storage/data_table.hpp index 5496c29115e2..16354f45024b 100644 --- a/src/include/duckdb/storage/data_table.hpp +++ b/src/include/duckdb/storage/data_table.hpp @@ -116,7 +116,10 @@ class DataTable { //! 
Merge a row group collection into the transaction-local storage void LocalMerge(ClientContext &context, RowGroupCollection &collection); //! Create an optimistic row group collection for this table. Used for optimistically writing parallel appends. - RowGroupCollection &CreateOptimisticRowGroups(ClientContext &context, unique_ptr collection); + //! Returns the index into the optimistic_collections vector for newly created collection. + PhysicalIndex CreateOptimisticCollection(ClientContext &context, unique_ptr collection); + //! Returns the optimistic row group collection corresponding to the index. + RowGroupCollection &GetOptimisticCollection(ClientContext &context, const PhysicalIndex collection_index); //! Create an optimistic writer for this table. Used for optimistically writing parallel appends. OptimisticDataWriter &CreateOptimisticWriter(ClientContext &context); void FinalizeOptimisticWriter(ClientContext &context, OptimisticDataWriter &writer); diff --git a/src/include/duckdb/storage/optimistic_data_writer.hpp b/src/include/duckdb/storage/optimistic_data_writer.hpp index 802d51bad707..cdf96a038264 100644 --- a/src/include/duckdb/storage/optimistic_data_writer.hpp +++ b/src/include/duckdb/storage/optimistic_data_writer.hpp @@ -39,7 +39,7 @@ class OptimisticDataWriter { private: //! The table DataTable &table; - //! The partial block manager (if we created one yet) + //! The partial block manager, if any was created. unique_ptr partial_manager; }; diff --git a/src/include/duckdb/transaction/local_storage.hpp b/src/include/duckdb/transaction/local_storage.hpp index 3c202a56fde6..b5a7398a8446 100644 --- a/src/include/duckdb/transaction/local_storage.hpp +++ b/src/include/duckdb/transaction/local_storage.hpp @@ -53,7 +53,7 @@ class LocalTableStorage : public enable_shared_from_this { idx_t deleted_rows; //! The optimistic row group collections associated with this table. - vector> optimistic_row_groups; + vector> optimistic_collections; //! 
The main optimistic data writer associated with this table. OptimisticDataWriter optimistic_writer; //! The optimistic data writers associated with this table. @@ -78,10 +78,16 @@ class LocalTableStorage : public enable_shared_from_this { void AppendToDeleteIndexes(Vector &row_ids, DataChunk &delete_chunk); //! Create an optimistic row group collection for this table. - RowGroupCollection &CreateOptimisticRowGroups(unique_ptr collection); + //! Returns the index into the optimistic_collections vector for newly created collection. + PhysicalIndex CreateOptimisticCollection(unique_ptr collection); + //! Returns the optimistic row group collection corresponding to the index. + RowGroupCollection &GetOptimisticCollection(const PhysicalIndex collection_index); //! Create an optimistic writer for this table. OptimisticDataWriter &CreateOptimisticWriter(); void FinalizeOptimisticWriter(OptimisticDataWriter &writer); + +private: + mutex collections_lock; }; class LocalTableManager { @@ -136,7 +142,10 @@ class LocalStorage { //! Merge a row group collection into the transaction-local storage void LocalMerge(DataTable &table, RowGroupCollection &collection); //! Create an optimistic row group collection for this table. - RowGroupCollection &CreateOptimisticRowGroups(DataTable &table, unique_ptr collection); + //! Returns the index into the optimistic_collections vector for newly created collection. + PhysicalIndex CreateOptimisticCollection(DataTable &table, unique_ptr collection); + //! Returns the optimistic row group collection corresponding to the index. + RowGroupCollection &GetOptimisticCollection(DataTable &table, const PhysicalIndex collection_index); //! Create an optimistic writer for this table. 
OptimisticDataWriter &CreateOptimisticWriter(DataTable &table); void FinalizeOptimisticWriter(DataTable &table, OptimisticDataWriter &writer); @@ -179,6 +188,7 @@ class LocalStorage { DuckTransaction &transaction; LocalTableManager table_manager; +private: void Flush(DataTable &table, LocalTableStorage &storage, optional_ptr commit_state); }; diff --git a/src/storage/data_table.cpp b/src/storage/data_table.cpp index 9a2bc0fa988d..9e194f0d6549 100644 --- a/src/storage/data_table.cpp +++ b/src/storage/data_table.cpp @@ -848,10 +848,14 @@ void DataTable::FinalizeLocalAppend(LocalAppendState &state) { LocalStorage::FinalizeAppend(state); } -RowGroupCollection &DataTable::CreateOptimisticRowGroups(ClientContext &context, - unique_ptr collection) { +PhysicalIndex DataTable::CreateOptimisticCollection(ClientContext &context, unique_ptr collection) { auto &local_storage = LocalStorage::Get(context, db); - return local_storage.CreateOptimisticRowGroups(*this, std::move(collection)); + return local_storage.CreateOptimisticCollection(*this, std::move(collection)); +} + +RowGroupCollection &DataTable::GetOptimisticCollection(ClientContext &context, const PhysicalIndex collection_index) { + auto &local_storage = LocalStorage::Get(context, db); + return local_storage.GetOptimisticCollection(*this, collection_index); } OptimisticDataWriter &DataTable::CreateOptimisticWriter(ClientContext &context) { diff --git a/src/storage/local_storage.cpp b/src/storage/local_storage.cpp index ea88d506d41e..6304147f715b 100644 --- a/src/storage/local_storage.cpp +++ b/src/storage/local_storage.cpp @@ -55,7 +55,7 @@ LocalTableStorage::LocalTableStorage(ClientContext &context, DataTable &new_data const idx_t alter_column_index, const LogicalType &target_type, const vector &bound_columns, Expression &cast_expr) : table_ref(new_data_table), allocator(Allocator::Get(new_data_table.db)), deleted_rows(parent.deleted_rows), - optimistic_row_groups(std::move(parent.optimistic_row_groups)), + 
optimistic_collections(std::move(parent.optimistic_collections)), optimistic_writer(new_data_table, parent.optimistic_writer), optimistic_writers(std::move(parent.optimistic_writers)), merged_storage(parent.merged_storage) { @@ -70,7 +70,7 @@ LocalTableStorage::LocalTableStorage(ClientContext &context, DataTable &new_data LocalTableStorage::LocalTableStorage(DataTable &new_data_table, LocalTableStorage &parent, const idx_t drop_column_index) : table_ref(new_data_table), allocator(Allocator::Get(new_data_table.db)), deleted_rows(parent.deleted_rows), - optimistic_row_groups(std::move(parent.optimistic_row_groups)), + optimistic_collections(std::move(parent.optimistic_collections)), optimistic_writer(new_data_table, parent.optimistic_writer), optimistic_writers(std::move(parent.optimistic_writers)), merged_storage(parent.merged_storage) { @@ -85,7 +85,7 @@ LocalTableStorage::LocalTableStorage(DataTable &new_data_table, LocalTableStorag LocalTableStorage::LocalTableStorage(ClientContext &context, DataTable &new_dt, LocalTableStorage &parent, ColumnDefinition &new_column, ExpressionExecutor &default_executor) : table_ref(new_dt), allocator(Allocator::Get(new_dt.db)), deleted_rows(parent.deleted_rows), - optimistic_row_groups(std::move(parent.optimistic_row_groups)), + optimistic_collections(std::move(parent.optimistic_collections)), optimistic_writer(new_dt, parent.optimistic_writer), optimistic_writers(std::move(parent.optimistic_writers)), merged_storage(parent.merged_storage) { @@ -233,9 +233,15 @@ void LocalTableStorage::AppendToIndexes(DuckTransaction &transaction, TableAppen } } -RowGroupCollection &LocalTableStorage::CreateOptimisticRowGroups(unique_ptr collection) { - optimistic_row_groups.push_back(std::move(collection)); - return *optimistic_row_groups.back(); +PhysicalIndex LocalTableStorage::CreateOptimisticCollection(unique_ptr collection) { + lock_guard l(collections_lock); + optimistic_collections.push_back(std::move(collection)); + return 
PhysicalIndex(optimistic_collections.size() - 1); +} + +RowGroupCollection &LocalTableStorage::GetOptimisticCollection(const PhysicalIndex collection_index) { + lock_guard l(collections_lock); + return *optimistic_collections[collection_index.index]; } OptimisticDataWriter &LocalTableStorage::CreateOptimisticWriter() { @@ -261,10 +267,17 @@ void LocalTableStorage::FinalizeOptimisticWriter(OptimisticDataWriter &writer) { } void LocalTableStorage::Rollback() { - for (auto &collection : optimistic_row_groups) { + for (auto &writer : optimistic_writers) { + writer->Rollback(); + } + optimistic_writer.Rollback(); + for (auto &collection : optimistic_collections) { + if (!collection) { + continue; + } collection->CommitDropTable(); } - optimistic_row_groups.clear(); + optimistic_collections.clear(); row_groups->CommitDropTable(); } @@ -452,10 +465,14 @@ void LocalStorage::LocalMerge(DataTable &table, RowGroupCollection &collection) storage.merged_storage = true; } -RowGroupCollection &LocalStorage::CreateOptimisticRowGroups(DataTable &table, - unique_ptr collection) { +PhysicalIndex LocalStorage::CreateOptimisticCollection(DataTable &table, unique_ptr collection) { + auto &storage = table_manager.GetOrCreateStorage(context, table); + return storage.CreateOptimisticCollection(std::move(collection)); +} + +RowGroupCollection &LocalStorage::GetOptimisticCollection(DataTable &table, const PhysicalIndex collection_index) { auto &storage = table_manager.GetOrCreateStorage(context, table); - return storage.CreateOptimisticRowGroups(std::move(collection)); + return storage.GetOptimisticCollection(collection_index); } OptimisticDataWriter &LocalStorage::CreateOptimisticWriter(DataTable &table) { diff --git a/test/sql/storage/optimistic_write/optimistic_write_update.test b/test/sql/storage/optimistic_write/optimistic_write_update.test index 41bc6d1f7c6d..12c42a763824 100644 --- a/test/sql/storage/optimistic_write/optimistic_write_update.test +++ 
b/test/sql/storage/optimistic_write/optimistic_write_update.test @@ -2,7 +2,6 @@ # description: Test optimistic write with updates in transaction-local storage # group: [optimistic_write] -# load the DB from disk load __TEST_DIR__/optimistic_write_update.db statement ok From 9203af533d1740fde50b194ca395534fa9cad891 Mon Sep 17 00:00:00 2001 From: taniabogatsch <44262898+taniabogatsch@users.noreply.github.com> Date: Sat, 25 Jan 2025 16:13:16 +0100 Subject: [PATCH 008/142] use local table storage for batch insert --- .../persistent/physical_batch_insert.cpp | 222 ++++++++++-------- .../operator/persistent/physical_insert.cpp | 18 +- .../operator/persistent/physical_insert.hpp | 1 + src/include/duckdb/storage/data_table.hpp | 5 +- .../duckdb/transaction/local_storage.hpp | 8 +- src/storage/data_table.cpp | 8 +- src/storage/local_storage.cpp | 21 +- 7 files changed, 171 insertions(+), 112 deletions(-) diff --git a/src/execution/operator/persistent/physical_batch_insert.cpp b/src/execution/operator/persistent/physical_batch_insert.cpp index 2e546c477282..a9d8696201f4 100644 --- a/src/execution/operator/persistent/physical_batch_insert.cpp +++ b/src/execution/operator/persistent/physical_batch_insert.cpp @@ -38,38 +38,47 @@ enum class RowGroupBatchType : uint8_t { FLUSHED, NOT_FLUSHED }; class CollectionMerger { public: - explicit CollectionMerger(ClientContext &context) : context(context) { + explicit CollectionMerger(ClientContext &context, DataTable &data_table) + : context(context), data_table(data_table), batch_type(RowGroupBatchType::NOT_FLUSHED) { } + //! The transaction context. ClientContext &context; - vector> current_collections; - RowGroupBatchType batch_type = RowGroupBatchType::NOT_FLUSHED; + //! The data table. + DataTable &data_table; + //! Indexes to the optimistic row group collection vector of the local table storage for this transaction. + vector collection_indexes; + //! The batch type for merging collections. 
+ RowGroupBatchType batch_type; public: - void AddCollection(unique_ptr collection, RowGroupBatchType type) { - current_collections.push_back(std::move(collection)); + void AddCollection(const PhysicalIndex collection_index, RowGroupBatchType type) { + collection_indexes.push_back(collection_index); if (type == RowGroupBatchType::FLUSHED) { batch_type = RowGroupBatchType::FLUSHED; - if (current_collections.size() > 1) { + if (collection_indexes.size() > 1) { throw InternalException("Cannot merge flushed collections"); } } } bool Empty() { - return current_collections.empty(); + return collection_indexes.empty(); } - unique_ptr Flush(OptimisticDataWriter &writer) { + PhysicalIndex Flush(OptimisticDataWriter &writer) { if (Empty()) { - return nullptr; + return PhysicalIndex(DConstants::INVALID_INDEX); } - unique_ptr new_collection = std::move(current_collections[0]); - if (current_collections.size() > 1) { - // we have gathered multiple collections: create one big collection and merge that - auto &types = new_collection->GetTypes(); + + auto result_collection_index = collection_indexes[0]; + auto result_collection = data_table.GetOptimisticCollection(context, result_collection_index); + D_ASSERT(result_collection); + if (collection_indexes.size() > 1) { + // Merge all collections into one result collection. 
+ auto &types = result_collection->GetTypes(); TableAppendState append_state; - new_collection->InitializeAppend(append_state); + result_collection->InitializeAppend(append_state); DataChunk scan_chunk; scan_chunk.Initialize(context, types); @@ -78,7 +87,8 @@ class CollectionMerger { for (idx_t i = 0; i < types.size(); i++) { column_ids.emplace_back(i); } - for (auto &collection : current_collections) { + for (idx_t i = 1; i < collection_indexes.size(); i++) { + auto collection = data_table.GetOptimisticCollection(context, collection_indexes[i]); if (!collection) { continue; } @@ -92,35 +102,38 @@ class CollectionMerger { if (scan_chunk.size() == 0) { break; } - auto new_row_group = new_collection->Append(scan_chunk, append_state); + auto new_row_group = result_collection->Append(scan_chunk, append_state); if (new_row_group) { - writer.WriteNewRowGroup(*new_collection); + writer.WriteNewRowGroup(*result_collection); } } + data_table.ResetOptimisticCollection(context, collection_indexes[i]); } - new_collection->FinalizeAppend(TransactionData(0, 0), append_state); - writer.WriteLastRowGroup(*new_collection); + result_collection->FinalizeAppend(TransactionData(0, 0), append_state); + writer.WriteLastRowGroup(*result_collection); } else if (batch_type == RowGroupBatchType::NOT_FLUSHED) { - writer.WriteLastRowGroup(*new_collection); + writer.WriteLastRowGroup(*result_collection); } - current_collections.clear(); - return new_collection; + + collection_indexes.clear(); + return result_collection_index; } }; struct RowGroupBatchEntry { - RowGroupBatchEntry(idx_t batch_idx, unique_ptr collection_p, RowGroupBatchType type) - : batch_idx(batch_idx), total_rows(collection_p->GetTotalRows()), unflushed_memory(0), - collection(std::move(collection_p)), type(type) { + RowGroupBatchEntry(RowGroupCollection &collection, const idx_t batch_idx, const PhysicalIndex collection_index, + const RowGroupBatchType type) + : batch_idx(batch_idx), total_rows(collection.GetTotalRows()), 
unflushed_memory(0), + collection_index(collection_index), type(type) { if (type == RowGroupBatchType::NOT_FLUSHED) { - unflushed_memory = collection->GetAllocationSize(); + unflushed_memory = collection.GetAllocationSize(); } } idx_t batch_idx; idx_t total_rows; idx_t unflushed_memory; - unique_ptr collection; + PhysicalIndex collection_index; RowGroupBatchType type; }; @@ -138,7 +151,7 @@ class BatchInsertTask { class BatchInsertGlobalState : public GlobalSinkState { public: - explicit BatchInsertGlobalState(ClientContext &context, DuckTableEntry &table, idx_t minimum_memory_per_thread) + BatchInsertGlobalState(ClientContext &context, DuckTableEntry &table, idx_t minimum_memory_per_thread) : memory_manager(context, minimum_memory_per_thread), table(table), insert_count(0), optimistically_written(false), minimum_memory_per_thread(minimum_memory_per_thread) { row_group_size = table.GetStorage().GetRowGroupSize(); @@ -155,16 +168,14 @@ class BatchInsertGlobalState : public GlobalSinkState { atomic optimistically_written; idx_t minimum_memory_per_thread; - bool ReadyToMerge(idx_t count) const; - void ScheduleMergeTasks(idx_t min_batch_index); - unique_ptr MergeCollections(ClientContext &context, - vector merge_collections, - OptimisticDataWriter &writer); - void AddCollection(ClientContext &context, idx_t batch_index, idx_t min_batch_index, - unique_ptr current_collection, - optional_ptr writer = nullptr); + bool ReadyToMerge(const idx_t count) const; + void ScheduleMergeTasks(ClientContext &context, const idx_t min_batch_index); + PhysicalIndex MergeCollections(ClientContext &context, const vector &merge_collections, + OptimisticDataWriter &writer); + void AddCollection(ClientContext &context, const idx_t batch_index, const idx_t min_batch_index, + const PhysicalIndex collection_index, optional_ptr writer = nullptr); - idx_t MaxThreads(idx_t source_max_threads) override { + idx_t MaxThreads(const idx_t source_max_threads) override { // try to request 4MB per column 
per thread memory_manager.SetMemorySize(source_max_threads * minimum_memory_per_thread); // cap the concurrent threads working on this task based on the amount of available memory @@ -176,7 +187,7 @@ class BatchInsertLocalState : public LocalSinkState { public: BatchInsertLocalState(ClientContext &context, const vector &types, const vector> &bound_defaults) - : default_executor(context, bound_defaults) { + : default_executor(context, bound_defaults), collection_index(DConstants::INVALID_INDEX) { insert_chunk.Initialize(Allocator::Get(context), types); } @@ -184,17 +195,23 @@ class BatchInsertLocalState : public LocalSinkState { ExpressionExecutor default_executor; idx_t current_index; TableAppendState current_append_state; - unique_ptr current_collection; + PhysicalIndex collection_index; optional_ptr writer; unique_ptr constraint_state; - void CreateNewCollection(DuckTableEntry &table, const vector &insert_types) { - auto table_info = table.GetStorage().GetDataTableInfo(); - auto &io_manager = TableIOManager::Get(table.GetStorage()); - current_collection = make_uniq(std::move(table_info), io_manager, insert_types, - NumericCast(MAX_ROW_ID)); - current_collection->InitializeEmpty(); - current_collection->InitializeAppend(current_append_state); + void CreateNewCollection(ClientContext &context, DuckTableEntry &table_entry, + const vector &insert_types) { + auto table_info = table_entry.GetStorage().GetDataTableInfo(); + auto &io_manager = TableIOManager::Get(table_entry.GetStorage()); + + // Create the local row group collection. 
+ auto max_row_id = NumericCast(MAX_ROW_ID); + auto collection = make_uniq(std::move(table_info), io_manager, insert_types, max_row_id); + collection->InitializeEmpty(); + collection->InitializeAppend(current_append_state); + + auto &data_table = table_entry.GetStorage(); + collection_index = data_table.CreateOptimisticCollection(context, std::move(collection)); } }; @@ -210,23 +227,27 @@ class MergeCollectionTask : public BatchInsertTask { vector merge_collections; idx_t merged_batch_index; - void Execute(const PhysicalBatchInsert &op, ClientContext &context, GlobalSinkState &gstate_p, - LocalSinkState &lstate_p) override { - auto &gstate = gstate_p.Cast(); - auto &lstate = lstate_p.Cast(); - // merge together the collections - D_ASSERT(lstate.writer); - auto final_collection = gstate.MergeCollections(context, std::move(merge_collections), *lstate.writer); - // add the merged-together collection to the set of batch indexes - lock_guard l(gstate.lock); - RowGroupBatchEntry new_entry(merged_batch_index, std::move(final_collection), RowGroupBatchType::FLUSHED); + void Execute(const PhysicalBatchInsert &op, ClientContext &context, GlobalSinkState &g_state_p, + LocalSinkState &l_state_p) override { + auto &g_state = g_state_p.Cast(); + auto &l_state = l_state_p.Cast(); + + // Merge the collections. + D_ASSERT(l_state.writer); + auto collection_index = g_state.MergeCollections(context, std::move(merge_collections), *l_state.writer); + + // Add the result collection to the set of batch indexes. 
+ lock_guard l(g_state.lock); + auto result_collection = g_state.table.GetStorage().GetOptimisticCollection(context, collection_index); + RowGroupBatchEntry new_entry(*result_collection, merged_batch_index, collection_index, + RowGroupBatchType::FLUSHED); auto it = std::lower_bound( - gstate.collections.begin(), gstate.collections.end(), new_entry, + g_state.collections.begin(), g_state.collections.end(), new_entry, [&](const RowGroupBatchEntry &a, const RowGroupBatchEntry &b) { return a.batch_idx < b.batch_idx; }); if (it->batch_idx != merged_batch_index) { throw InternalException("Merged batch index was no longer present in collection"); } - it->collection = std::move(new_entry.collection); + it->collection_index = new_entry.collection_index; } }; @@ -239,7 +260,7 @@ struct BatchMergeTask { idx_t total_count; }; -bool BatchInsertGlobalState::ReadyToMerge(idx_t count) const { +bool BatchInsertGlobalState::ReadyToMerge(const idx_t count) const { // we try to merge so the count fits nicely into row groups if (count >= row_group_size / 10 * 9 && count <= row_group_size) { // 90%-100% of row group size @@ -260,9 +281,8 @@ bool BatchInsertGlobalState::ReadyToMerge(idx_t count) const { return false; } -void BatchInsertGlobalState::ScheduleMergeTasks(idx_t min_batch_index) { +void BatchInsertGlobalState::ScheduleMergeTasks(ClientContext &context, const idx_t min_batch_index) { idx_t current_idx; - vector to_be_scheduled_tasks; BatchMergeTask current_task(next_start); @@ -312,13 +332,14 @@ void BatchInsertGlobalState::ScheduleMergeTasks(idx_t min_batch_index) { vector merge_collections; for (idx_t idx = scheduled_task.start_index; idx < scheduled_task.end_index; idx++) { auto &entry = collections[idx]; - if (!entry.collection || entry.type == RowGroupBatchType::FLUSHED) { + if (!entry.collection_index.IsValid() || entry.type == RowGroupBatchType::FLUSHED) { throw InternalException("Adding a row group collection that should not be flushed"); } - RowGroupBatchEntry 
added_entry(collections[scheduled_task.start_index].batch_idx, - std::move(entry.collection), RowGroupBatchType::FLUSHED); + auto collection = table.GetStorage().GetOptimisticCollection(context, entry.collection_index); + RowGroupBatchEntry added_entry(*collection, collections[scheduled_task.start_index].batch_idx, + entry.collection_index, RowGroupBatchType::FLUSHED); added_entry.unflushed_memory = entry.unflushed_memory; - merge_collections.push_back(std::move(added_entry)); + merge_collections.push_back(added_entry); entry.total_rows = scheduled_task.total_count; entry.type = RowGroupBatchType::FLUSHED; } @@ -335,14 +356,14 @@ void BatchInsertGlobalState::ScheduleMergeTasks(idx_t min_batch_index) { } } -unique_ptr BatchInsertGlobalState::MergeCollections(ClientContext &context, - vector merge_collections, - OptimisticDataWriter &writer) { +PhysicalIndex BatchInsertGlobalState::MergeCollections(ClientContext &context, + const vector &merge_collections, + OptimisticDataWriter &writer) { D_ASSERT(!merge_collections.empty()); - CollectionMerger merger(context); + CollectionMerger merger(context, table.GetStorage()); idx_t written_data = 0; for (auto &entry : merge_collections) { - merger.AddCollection(std::move(entry.collection), RowGroupBatchType::NOT_FLUSHED); + merger.AddCollection(entry.collection_index, RowGroupBatchType::NOT_FLUSHED); written_data += entry.unflushed_memory; } optimistically_written = true; @@ -350,22 +371,23 @@ unique_ptr BatchInsertGlobalState::MergeCollections(ClientCo return merger.Flush(writer); } -void BatchInsertGlobalState::AddCollection(ClientContext &context, idx_t batch_index, idx_t min_batch_index, - unique_ptr current_collection, +void BatchInsertGlobalState::AddCollection(ClientContext &context, const idx_t batch_index, const idx_t min_batch_index, + const PhysicalIndex collection_index, optional_ptr writer) { if (batch_index < min_batch_index) { throw InternalException("Batch index of the added collection (%llu) is smaller than 
the min batch index (%llu)", batch_index, min_batch_index); } - auto new_count = current_collection->GetTotalRows(); + auto collection = table.GetStorage().GetOptimisticCollection(context, collection_index); + auto new_count = collection->GetTotalRows(); auto batch_type = new_count < row_group_size ? RowGroupBatchType::NOT_FLUSHED : RowGroupBatchType::FLUSHED; if (batch_type == RowGroupBatchType::FLUSHED && writer) { - writer->WriteLastRowGroup(*current_collection); + writer->WriteLastRowGroup(*collection); } lock_guard l(lock); insert_count += new_count; // add the collection to the batch index - RowGroupBatchEntry new_entry(batch_index, std::move(current_collection), batch_type); + RowGroupBatchEntry new_entry(*collection, batch_index, collection_index, batch_type); if (batch_type == RowGroupBatchType::NOT_FLUSHED) { memory_manager.IncreaseUnflushedMemory(new_entry.unflushed_memory); } @@ -379,9 +401,9 @@ void BatchInsertGlobalState::AddCollection(ClientContext &context, idx_t batch_i "batch indexes are not uniquely distributed over threads", batch_index); } - collections.insert(it, std::move(new_entry)); + collections.insert(it, new_entry); if (writer) { - ScheduleMergeTasks(min_batch_index); + ScheduleMergeTasks(context, min_batch_index); } } @@ -441,15 +463,16 @@ SinkNextBatchType PhysicalBatchInsert::NextBatch(ExecutionContext &context, Oper auto &memory_manager = gstate.memory_manager; auto batch_index = lstate.partition_info.batch_index.GetIndex(); - if (lstate.current_collection) { + if (lstate.collection_index.IsValid()) { if (lstate.current_index == batch_index) { throw InternalException("NextBatch called with the same batch index?"); } // batch index has changed: move the old collection to the global state and create a new collection TransactionData tdata(0, 0); - lstate.current_collection->FinalizeAppend(tdata, lstate.current_append_state); + auto collection = gstate.table.GetStorage().GetOptimisticCollection(context.client, lstate.collection_index); + 
collection->FinalizeAppend(tdata, lstate.current_append_state); gstate.AddCollection(context.client, lstate.current_index, lstate.partition_info.min_batch_index.GetIndex(), - std::move(lstate.current_collection), lstate.writer); + lstate.collection_index, lstate.writer); bool any_unblocked; { @@ -459,7 +482,7 @@ SinkNextBatchType PhysicalBatchInsert::NextBatch(ExecutionContext &context, Oper if (!any_unblocked) { ExecuteTasks(context.client, gstate, lstate); } - lstate.current_collection.reset(); + lstate.collection_index.index = DConstants::INVALID_INDEX; } lstate.current_index = batch_index; @@ -501,10 +524,10 @@ SinkResultType PhysicalBatchInsert::Sink(ExecutionContext &context, DataChunk &c } } } - if (!lstate.current_collection) { + if (!lstate.collection_index.IsValid()) { lock_guard l(gstate.lock); // no collection yet: create a new one - lstate.CreateNewCollection(table, insert_types); + lstate.CreateNewCollection(context.client, table, insert_types); if (!lstate.writer) { lstate.writer = &table.GetStorage().CreateOptimisticWriter(context.client); } @@ -520,10 +543,11 @@ SinkResultType PhysicalBatchInsert::Sink(ExecutionContext &context, DataChunk &c auto &storage = table.GetStorage(); storage.VerifyAppendConstraints(*lstate.constraint_state, context.client, lstate.insert_chunk, nullptr, nullptr); - auto new_row_group = lstate.current_collection->Append(lstate.insert_chunk, lstate.current_append_state); + auto collection = table.GetStorage().GetOptimisticCollection(context.client, lstate.collection_index); + auto new_row_group = collection->Append(lstate.insert_chunk, lstate.current_append_state); if (new_row_group) { // we have already written to disk - flush the next row group as well - lstate.writer->WriteNewRowGroup(*lstate.current_collection); + lstate.writer->WriteNewRowGroup(*collection); } return SinkResultType::NEED_MORE_INPUT; } @@ -541,12 +565,13 @@ SinkCombineResultType PhysicalBatchInsert::Combine(ExecutionContext &context, Op 
memory_manager.UpdateMinBatchIndex(lstate.partition_info.min_batch_index.GetIndex()); - if (lstate.current_collection) { + if (lstate.collection_index.IsValid()) { TransactionData tdata(0, 0); - lstate.current_collection->FinalizeAppend(tdata, lstate.current_append_state); - if (lstate.current_collection->GetTotalRows() > 0) { + auto collection = gstate.table.GetStorage().GetOptimisticCollection(context.client, lstate.collection_index); + collection->FinalizeAppend(tdata, lstate.current_append_state); + if (collection->GetTotalRows() > 0) { gstate.AddCollection(context.client, lstate.current_index, lstate.partition_info.min_batch_index.GetIndex(), - std::move(lstate.current_collection)); + lstate.collection_index); } } if (lstate.writer) { @@ -568,6 +593,7 @@ SinkFinalizeType PhysicalBatchInsert::Finalize(Pipeline &pipeline, Event &event, OperatorSinkFinalizeInput &input) const { auto &gstate = input.global_state.Cast(); auto &memory_manager = gstate.memory_manager; + auto &data_table = gstate.table.GetStorage(); if (gstate.optimistically_written || gstate.insert_count >= gstate.row_group_size) { // we have written data to disk optimistically or are inserting a large amount of data @@ -580,9 +606,9 @@ SinkFinalizeType PhysicalBatchInsert::Finalize(Pipeline &pipeline, Event &event, if (entry.type == RowGroupBatchType::NOT_FLUSHED) { // this collection has not been flushed: add it to the merge set if (!current_merger) { - current_merger = make_uniq(context); + current_merger = make_uniq(context, data_table); } - current_merger->AddCollection(std::move(entry.collection), entry.type); + current_merger->AddCollection(entry.collection_index, entry.type); memory_manager.ReduceUnflushedMemory(entry.unflushed_memory); } else { // this collection has been flushed: it does not need to be merged @@ -592,8 +618,8 @@ SinkFinalizeType PhysicalBatchInsert::Finalize(Pipeline &pipeline, Event &event, mergers.push_back(std::move(current_merger)); current_merger.reset(); } - auto 
larger_merger = make_uniq(context); - larger_merger->AddCollection(std::move(entry.collection), entry.type); + auto larger_merger = make_uniq(context, data_table); + larger_merger->AddCollection(entry.collection_index, entry.type); mergers.push_back(std::move(larger_merger)); } } @@ -602,7 +628,7 @@ SinkFinalizeType PhysicalBatchInsert::Finalize(Pipeline &pipeline, Event &event, } // now that we have created all of the mergers, perform the actual merging - vector> final_collections; + vector final_collections; final_collections.reserve(mergers.size()); auto &writer = storage.CreateOptimisticWriter(context); for (auto &merger : mergers) { @@ -610,8 +636,10 @@ SinkFinalizeType PhysicalBatchInsert::Finalize(Pipeline &pipeline, Event &event, } // finally, merge the row groups into the local storage - for (auto &collection : final_collections) { + for (const auto collection_index : final_collections) { + auto collection = data_table.GetOptimisticCollection(context, collection_index); storage.LocalMerge(context, *collection); + data_table.ResetOptimisticCollection(context, collection_index); } storage.FinalizeOptimisticWriter(context, writer); } else { @@ -628,10 +656,12 @@ SinkFinalizeType PhysicalBatchInsert::Finalize(Pipeline &pipeline, Event &event, } memory_manager.ReduceUnflushedMemory(entry.unflushed_memory); - entry.collection->Scan(transaction, [&](DataChunk &insert_chunk) { + auto collection = data_table.GetOptimisticCollection(context, entry.collection_index); + collection->Scan(transaction, [&](DataChunk &insert_chunk) { storage.LocalAppend(append_state, context, insert_chunk, false); return true; }); + data_table.ResetOptimisticCollection(context, entry.collection_index); } storage.FinalizeLocalAppend(append_state); } diff --git a/src/execution/operator/persistent/physical_insert.cpp b/src/execution/operator/persistent/physical_insert.cpp index 3594ae15e7ff..1604d3fcbc1d 100644 --- a/src/execution/operator/persistent/physical_insert.cpp +++ 
b/src/execution/operator/persistent/physical_insert.cpp @@ -706,10 +706,10 @@ SinkResultType PhysicalInsert::Sink(ExecutionContext &context, DataChunk &chunk, OnConflictHandling(table, context, lstate); D_ASSERT(action_type != OnConflictAction::UPDATE); - auto &collection = data_table.GetOptimisticCollection(context.client, lstate.collection_index); - auto new_row_group = collection.Append(lstate.insert_chunk, lstate.local_append_state); + auto collection = data_table.GetOptimisticCollection(context.client, lstate.collection_index); + auto new_row_group = collection->Append(lstate.insert_chunk, lstate.local_append_state); if (new_row_group) { - lstate.writer->WriteNewRowGroup(collection); + lstate.writer->WriteNewRowGroup(*collection); } return SinkResultType::NEED_MORE_INPUT; } @@ -732,10 +732,10 @@ SinkCombineResultType PhysicalInsert::Combine(ExecutionContext &context, Operato // parallel append: finalize the append TransactionData tdata(0, 0); auto &data_table = gstate.table.GetStorage(); - auto &collection = data_table.GetOptimisticCollection(context.client, lstate.collection_index); - collection.FinalizeAppend(tdata, lstate.local_append_state); + auto collection = data_table.GetOptimisticCollection(context.client, lstate.collection_index); + collection->FinalizeAppend(tdata, lstate.local_append_state); - auto append_count = collection.GetTotalRows(); + auto append_count = collection->GetTotalRows(); lock_guard lock(gstate.lock); gstate.insert_count += append_count; @@ -743,16 +743,16 @@ SinkCombineResultType PhysicalInsert::Combine(ExecutionContext &context, Operato // we have few rows - append to the local storage directly storage.InitializeLocalAppend(gstate.append_state, table, context.client, bound_constraints); auto &transaction = DuckTransaction::Get(context.client, table.catalog); - collection.Scan(transaction, [&](DataChunk &insert_chunk) { + collection->Scan(transaction, [&](DataChunk &insert_chunk) { storage.LocalAppend(gstate.append_state, 
context.client, insert_chunk, false); return true; }); storage.FinalizeLocalAppend(gstate.append_state); } else { // we have written rows to disk optimistically - merge directly into the transaction-local storage - lstate.writer->WriteLastRowGroup(collection); + lstate.writer->WriteLastRowGroup(*collection); lstate.writer->FinalFlush(); - gstate.table.GetStorage().LocalMerge(context.client, collection); + gstate.table.GetStorage().LocalMerge(context.client, *collection); gstate.table.GetStorage().FinalizeOptimisticWriter(context.client, *lstate.writer); } diff --git a/src/include/duckdb/execution/operator/persistent/physical_insert.hpp b/src/include/duckdb/execution/operator/persistent/physical_insert.hpp index f12084e39af9..c5f4f3ac2807 100644 --- a/src/include/duckdb/execution/operator/persistent/physical_insert.hpp +++ b/src/include/duckdb/execution/operator/persistent/physical_insert.hpp @@ -57,6 +57,7 @@ class InsertLocalState : public LocalSinkState { DataChunk update_chunk; ExpressionExecutor default_executor; TableAppendState local_append_state; + //! An index to the optimistic row group collection vector of the local table storage for this transaction. PhysicalIndex collection_index; optional_ptr writer; // Rows that have been updated by a DO UPDATE conflict diff --git a/src/include/duckdb/storage/data_table.hpp b/src/include/duckdb/storage/data_table.hpp index 16354f45024b..ee366441cca1 100644 --- a/src/include/duckdb/storage/data_table.hpp +++ b/src/include/duckdb/storage/data_table.hpp @@ -119,7 +119,10 @@ class DataTable { //! Returns the index into the optimistic_collections vector for newly created collection. PhysicalIndex CreateOptimisticCollection(ClientContext &context, unique_ptr collection); //! Returns the optimistic row group collection corresponding to the index. 
- RowGroupCollection &GetOptimisticCollection(ClientContext &context, const PhysicalIndex collection_index); + optional_ptr GetOptimisticCollection(ClientContext &context, + const PhysicalIndex collection_index); + //! Resets the optimistic row group collection corresponding to the index. + void ResetOptimisticCollection(ClientContext &context, const PhysicalIndex collection_index); //! Create an optimistic writer for this table. Used for optimistically writing parallel appends. OptimisticDataWriter &CreateOptimisticWriter(ClientContext &context); void FinalizeOptimisticWriter(ClientContext &context, OptimisticDataWriter &writer); diff --git a/src/include/duckdb/transaction/local_storage.hpp b/src/include/duckdb/transaction/local_storage.hpp index b5a7398a8446..58fda0931bdf 100644 --- a/src/include/duckdb/transaction/local_storage.hpp +++ b/src/include/duckdb/transaction/local_storage.hpp @@ -81,7 +81,9 @@ class LocalTableStorage : public enable_shared_from_this { //! Returns the index into the optimistic_collections vector for newly created collection. PhysicalIndex CreateOptimisticCollection(unique_ptr collection); //! Returns the optimistic row group collection corresponding to the index. - RowGroupCollection &GetOptimisticCollection(const PhysicalIndex collection_index); + optional_ptr GetOptimisticCollection(const PhysicalIndex collection_index); + //! Resets the optimistic row group collection corresponding to the index. + void ResetOptimisticCollection(const PhysicalIndex collection_index); //! Create an optimistic writer for this table. OptimisticDataWriter &CreateOptimisticWriter(); void FinalizeOptimisticWriter(OptimisticDataWriter &writer); @@ -145,7 +147,9 @@ class LocalStorage { //! Returns the index into the optimistic_collections vector for newly created collection. PhysicalIndex CreateOptimisticCollection(DataTable &table, unique_ptr collection); //! Returns the optimistic row group collection corresponding to the index. 
- RowGroupCollection &GetOptimisticCollection(DataTable &table, const PhysicalIndex collection_index); + optional_ptr GetOptimisticCollection(DataTable &table, const PhysicalIndex collection_index); + //! Resets the optimistic row group collection corresponding to the index. + void ResetOptimisticCollection(DataTable &table, const PhysicalIndex collection_index); //! Create an optimistic writer for this table. OptimisticDataWriter &CreateOptimisticWriter(DataTable &table); void FinalizeOptimisticWriter(DataTable &table, OptimisticDataWriter &writer); diff --git a/src/storage/data_table.cpp b/src/storage/data_table.cpp index 9e194f0d6549..c717c6697da3 100644 --- a/src/storage/data_table.cpp +++ b/src/storage/data_table.cpp @@ -853,11 +853,17 @@ PhysicalIndex DataTable::CreateOptimisticCollection(ClientContext &context, uniq return local_storage.CreateOptimisticCollection(*this, std::move(collection)); } -RowGroupCollection &DataTable::GetOptimisticCollection(ClientContext &context, const PhysicalIndex collection_index) { +optional_ptr DataTable::GetOptimisticCollection(ClientContext &context, + const PhysicalIndex collection_index) { auto &local_storage = LocalStorage::Get(context, db); return local_storage.GetOptimisticCollection(*this, collection_index); } +void DataTable::ResetOptimisticCollection(ClientContext &context, const PhysicalIndex collection_index) { + auto &local_storage = LocalStorage::Get(context, db); + local_storage.ResetOptimisticCollection(*this, collection_index); +} + OptimisticDataWriter &DataTable::CreateOptimisticWriter(ClientContext &context) { auto &local_storage = LocalStorage::Get(context, db); return local_storage.CreateOptimisticWriter(*this); diff --git a/src/storage/local_storage.cpp b/src/storage/local_storage.cpp index 6304147f715b..3e36e0ba55d8 100644 --- a/src/storage/local_storage.cpp +++ b/src/storage/local_storage.cpp @@ -239,9 +239,18 @@ PhysicalIndex LocalTableStorage::CreateOptimisticCollection(unique_ptr 
LocalTableStorage::GetOptimisticCollection(const PhysicalIndex collection_index) { lock_guard l(collections_lock); - return *optimistic_collections[collection_index.index]; + auto &collection = optimistic_collections[collection_index.index]; + if (collection == nullptr) { + return nullptr; + } + return *collection; +} + +void LocalTableStorage::ResetOptimisticCollection(const PhysicalIndex collection_index) { + lock_guard l(collections_lock); + optimistic_collections[collection_index.index].reset(); } OptimisticDataWriter &LocalTableStorage::CreateOptimisticWriter() { @@ -470,11 +479,17 @@ PhysicalIndex LocalStorage::CreateOptimisticCollection(DataTable &table, unique_ return storage.CreateOptimisticCollection(std::move(collection)); } -RowGroupCollection &LocalStorage::GetOptimisticCollection(DataTable &table, const PhysicalIndex collection_index) { +optional_ptr LocalStorage::GetOptimisticCollection(DataTable &table, + const PhysicalIndex collection_index) { auto &storage = table_manager.GetOrCreateStorage(context, table); return storage.GetOptimisticCollection(collection_index); } +void LocalStorage::ResetOptimisticCollection(DataTable &table, const PhysicalIndex collection_index) { + auto &storage = table_manager.GetOrCreateStorage(context, table); + storage.ResetOptimisticCollection(collection_index); +} + OptimisticDataWriter &LocalStorage::CreateOptimisticWriter(DataTable &table) { auto &storage = table_manager.GetOrCreateStorage(context, table); return storage.CreateOptimisticWriter(); From ae5e49486d02af4f29079467198242ed89c9bd95 Mon Sep 17 00:00:00 2001 From: taniabogatsch <44262898+taniabogatsch@users.noreply.github.com> Date: Sat, 25 Jan 2025 16:32:27 +0100 Subject: [PATCH 009/142] turn into reference --- .../persistent/physical_batch_insert.cpp | 69 +++++++++---------- .../operator/persistent/physical_insert.cpp | 20 +++--- src/include/duckdb/storage/data_table.hpp | 3 +- .../duckdb/transaction/local_storage.hpp | 4 +- src/storage/data_table.cpp | 3 +- 
src/storage/local_storage.cpp | 8 +-- 6 files changed, 49 insertions(+), 58 deletions(-) diff --git a/src/execution/operator/persistent/physical_batch_insert.cpp b/src/execution/operator/persistent/physical_batch_insert.cpp index a9d8696201f4..17ed86ea6767 100644 --- a/src/execution/operator/persistent/physical_batch_insert.cpp +++ b/src/execution/operator/persistent/physical_batch_insert.cpp @@ -72,13 +72,13 @@ class CollectionMerger { } auto result_collection_index = collection_indexes[0]; - auto result_collection = data_table.GetOptimisticCollection(context, result_collection_index); - D_ASSERT(result_collection); + auto &result_collection = data_table.GetOptimisticCollection(context, result_collection_index); + if (collection_indexes.size() > 1) { // Merge all collections into one result collection. - auto &types = result_collection->GetTypes(); + auto &types = result_collection.GetTypes(); TableAppendState append_state; - result_collection->InitializeAppend(append_state); + result_collection.InitializeAppend(append_state); DataChunk scan_chunk; scan_chunk.Initialize(context, types); @@ -88,13 +88,10 @@ class CollectionMerger { column_ids.emplace_back(i); } for (idx_t i = 1; i < collection_indexes.size(); i++) { - auto collection = data_table.GetOptimisticCollection(context, collection_indexes[i]); - if (!collection) { - continue; - } + auto &collection = data_table.GetOptimisticCollection(context, collection_indexes[i]); TableScanState scan_state; scan_state.Initialize(column_ids); - collection->InitializeScan(scan_state.local_state, column_ids, nullptr); + collection.InitializeScan(scan_state.local_state, column_ids, nullptr); while (true) { scan_chunk.Reset(); @@ -102,17 +99,17 @@ class CollectionMerger { if (scan_chunk.size() == 0) { break; } - auto new_row_group = result_collection->Append(scan_chunk, append_state); + auto new_row_group = result_collection.Append(scan_chunk, append_state); if (new_row_group) { - writer.WriteNewRowGroup(*result_collection); 
+ writer.WriteNewRowGroup(result_collection); } } data_table.ResetOptimisticCollection(context, collection_indexes[i]); } - result_collection->FinalizeAppend(TransactionData(0, 0), append_state); - writer.WriteLastRowGroup(*result_collection); + result_collection.FinalizeAppend(TransactionData(0, 0), append_state); + writer.WriteLastRowGroup(result_collection); } else if (batch_type == RowGroupBatchType::NOT_FLUSHED) { - writer.WriteLastRowGroup(*result_collection); + writer.WriteLastRowGroup(result_collection); } collection_indexes.clear(); @@ -234,12 +231,12 @@ class MergeCollectionTask : public BatchInsertTask { // Merge the collections. D_ASSERT(l_state.writer); - auto collection_index = g_state.MergeCollections(context, std::move(merge_collections), *l_state.writer); + auto collection_index = g_state.MergeCollections(context, merge_collections, *l_state.writer); // Add the result collection to the set of batch indexes. lock_guard l(g_state.lock); - auto result_collection = g_state.table.GetStorage().GetOptimisticCollection(context, collection_index); - RowGroupBatchEntry new_entry(*result_collection, merged_batch_index, collection_index, + auto &result_collection = g_state.table.GetStorage().GetOptimisticCollection(context, collection_index); + RowGroupBatchEntry new_entry(result_collection, merged_batch_index, collection_index, RowGroupBatchType::FLUSHED); auto it = std::lower_bound( g_state.collections.begin(), g_state.collections.end(), new_entry, @@ -328,15 +325,15 @@ void BatchInsertGlobalState::ScheduleMergeTasks(ClientContext &context, const id for (auto &scheduled_task : to_be_scheduled_tasks) { D_ASSERT(scheduled_task.total_count > 0); D_ASSERT(current_idx > scheduled_task.start_index); - idx_t merged_batch_index = collections[scheduled_task.start_index].batch_idx; + auto merged_batch_index = collections[scheduled_task.start_index].batch_idx; vector merge_collections; for (idx_t idx = scheduled_task.start_index; idx < scheduled_task.end_index; idx++) 
{ auto &entry = collections[idx]; if (!entry.collection_index.IsValid() || entry.type == RowGroupBatchType::FLUSHED) { throw InternalException("Adding a row group collection that should not be flushed"); } - auto collection = table.GetStorage().GetOptimisticCollection(context, entry.collection_index); - RowGroupBatchEntry added_entry(*collection, collections[scheduled_task.start_index].batch_idx, + auto &collection = table.GetStorage().GetOptimisticCollection(context, entry.collection_index); + RowGroupBatchEntry added_entry(collection, collections[scheduled_task.start_index].batch_idx, entry.collection_index, RowGroupBatchType::FLUSHED); added_entry.unflushed_memory = entry.unflushed_memory; merge_collections.push_back(added_entry); @@ -378,16 +375,16 @@ void BatchInsertGlobalState::AddCollection(ClientContext &context, const idx_t b throw InternalException("Batch index of the added collection (%llu) is smaller than the min batch index (%llu)", batch_index, min_batch_index); } - auto collection = table.GetStorage().GetOptimisticCollection(context, collection_index); - auto new_count = collection->GetTotalRows(); + auto &collection = table.GetStorage().GetOptimisticCollection(context, collection_index); + auto new_count = collection.GetTotalRows(); auto batch_type = new_count < row_group_size ? 
RowGroupBatchType::NOT_FLUSHED : RowGroupBatchType::FLUSHED; if (batch_type == RowGroupBatchType::FLUSHED && writer) { - writer->WriteLastRowGroup(*collection); + writer->WriteLastRowGroup(collection); } lock_guard l(lock); insert_count += new_count; // add the collection to the batch index - RowGroupBatchEntry new_entry(*collection, batch_index, collection_index, batch_type); + RowGroupBatchEntry new_entry(collection, batch_index, collection_index, batch_type); if (batch_type == RowGroupBatchType::NOT_FLUSHED) { memory_manager.IncreaseUnflushedMemory(new_entry.unflushed_memory); } @@ -469,8 +466,8 @@ SinkNextBatchType PhysicalBatchInsert::NextBatch(ExecutionContext &context, Oper } // batch index has changed: move the old collection to the global state and create a new collection TransactionData tdata(0, 0); - auto collection = gstate.table.GetStorage().GetOptimisticCollection(context.client, lstate.collection_index); - collection->FinalizeAppend(tdata, lstate.current_append_state); + auto &collection = gstate.table.GetStorage().GetOptimisticCollection(context.client, lstate.collection_index); + collection.FinalizeAppend(tdata, lstate.current_append_state); gstate.AddCollection(context.client, lstate.current_index, lstate.partition_info.min_batch_index.GetIndex(), lstate.collection_index, lstate.writer); @@ -543,11 +540,11 @@ SinkResultType PhysicalBatchInsert::Sink(ExecutionContext &context, DataChunk &c auto &storage = table.GetStorage(); storage.VerifyAppendConstraints(*lstate.constraint_state, context.client, lstate.insert_chunk, nullptr, nullptr); - auto collection = table.GetStorage().GetOptimisticCollection(context.client, lstate.collection_index); - auto new_row_group = collection->Append(lstate.insert_chunk, lstate.current_append_state); + auto &collection = table.GetStorage().GetOptimisticCollection(context.client, lstate.collection_index); + auto new_row_group = collection.Append(lstate.insert_chunk, lstate.current_append_state); if (new_row_group) { // 
we have already written to disk - flush the next row group as well - lstate.writer->WriteNewRowGroup(*collection); + lstate.writer->WriteNewRowGroup(collection); } return SinkResultType::NEED_MORE_INPUT; } @@ -567,9 +564,9 @@ SinkCombineResultType PhysicalBatchInsert::Combine(ExecutionContext &context, Op if (lstate.collection_index.IsValid()) { TransactionData tdata(0, 0); - auto collection = gstate.table.GetStorage().GetOptimisticCollection(context.client, lstate.collection_index); - collection->FinalizeAppend(tdata, lstate.current_append_state); - if (collection->GetTotalRows() > 0) { + auto &collection = gstate.table.GetStorage().GetOptimisticCollection(context.client, lstate.collection_index); + collection.FinalizeAppend(tdata, lstate.current_append_state); + if (collection.GetTotalRows() > 0) { gstate.AddCollection(context.client, lstate.current_index, lstate.partition_info.min_batch_index.GetIndex(), lstate.collection_index); } @@ -637,8 +634,8 @@ SinkFinalizeType PhysicalBatchInsert::Finalize(Pipeline &pipeline, Event &event, // finally, merge the row groups into the local storage for (const auto collection_index : final_collections) { - auto collection = data_table.GetOptimisticCollection(context, collection_index); - storage.LocalMerge(context, *collection); + auto &collection = data_table.GetOptimisticCollection(context, collection_index); + storage.LocalMerge(context, collection); data_table.ResetOptimisticCollection(context, collection_index); } storage.FinalizeOptimisticWriter(context, writer); @@ -656,8 +653,8 @@ SinkFinalizeType PhysicalBatchInsert::Finalize(Pipeline &pipeline, Event &event, } memory_manager.ReduceUnflushedMemory(entry.unflushed_memory); - auto collection = data_table.GetOptimisticCollection(context, entry.collection_index); - collection->Scan(transaction, [&](DataChunk &insert_chunk) { + auto &collection = data_table.GetOptimisticCollection(context, entry.collection_index); + collection.Scan(transaction, [&](DataChunk 
&insert_chunk) { storage.LocalAppend(append_state, context, insert_chunk, false); return true; }); diff --git a/src/execution/operator/persistent/physical_insert.cpp b/src/execution/operator/persistent/physical_insert.cpp index 1604d3fcbc1d..415e94ba6e42 100644 --- a/src/execution/operator/persistent/physical_insert.cpp +++ b/src/execution/operator/persistent/physical_insert.cpp @@ -685,7 +685,7 @@ SinkResultType PhysicalInsert::Sink(ExecutionContext &context, DataChunk &chunk, return SinkResultType::NEED_MORE_INPUT; } - // parallel append + // Parallel append. D_ASSERT(!return_chunk); auto &data_table = gstate.table.GetStorage(); if (!lstate.collection_index.IsValid()) { @@ -706,10 +706,10 @@ SinkResultType PhysicalInsert::Sink(ExecutionContext &context, DataChunk &chunk, OnConflictHandling(table, context, lstate); D_ASSERT(action_type != OnConflictAction::UPDATE); - auto collection = data_table.GetOptimisticCollection(context.client, lstate.collection_index); - auto new_row_group = collection->Append(lstate.insert_chunk, lstate.local_append_state); + auto &collection = data_table.GetOptimisticCollection(context.client, lstate.collection_index); + auto new_row_group = collection.Append(lstate.insert_chunk, lstate.local_append_state); if (new_row_group) { - lstate.writer->WriteNewRowGroup(*collection); + lstate.writer->WriteNewRowGroup(collection); } return SinkResultType::NEED_MORE_INPUT; } @@ -732,10 +732,10 @@ SinkCombineResultType PhysicalInsert::Combine(ExecutionContext &context, Operato // parallel append: finalize the append TransactionData tdata(0, 0); auto &data_table = gstate.table.GetStorage(); - auto collection = data_table.GetOptimisticCollection(context.client, lstate.collection_index); - collection->FinalizeAppend(tdata, lstate.local_append_state); + auto &collection = data_table.GetOptimisticCollection(context.client, lstate.collection_index); + collection.FinalizeAppend(tdata, lstate.local_append_state); - auto append_count = 
collection->GetTotalRows(); + auto append_count = collection.GetTotalRows(); lock_guard lock(gstate.lock); gstate.insert_count += append_count; @@ -743,16 +743,16 @@ SinkCombineResultType PhysicalInsert::Combine(ExecutionContext &context, Operato // we have few rows - append to the local storage directly storage.InitializeLocalAppend(gstate.append_state, table, context.client, bound_constraints); auto &transaction = DuckTransaction::Get(context.client, table.catalog); - collection->Scan(transaction, [&](DataChunk &insert_chunk) { + collection.Scan(transaction, [&](DataChunk &insert_chunk) { storage.LocalAppend(gstate.append_state, context.client, insert_chunk, false); return true; }); storage.FinalizeLocalAppend(gstate.append_state); } else { // we have written rows to disk optimistically - merge directly into the transaction-local storage - lstate.writer->WriteLastRowGroup(*collection); + lstate.writer->WriteLastRowGroup(collection); lstate.writer->FinalFlush(); - gstate.table.GetStorage().LocalMerge(context.client, *collection); + gstate.table.GetStorage().LocalMerge(context.client, collection); gstate.table.GetStorage().FinalizeOptimisticWriter(context.client, *lstate.writer); } diff --git a/src/include/duckdb/storage/data_table.hpp b/src/include/duckdb/storage/data_table.hpp index ee366441cca1..a46ec0637206 100644 --- a/src/include/duckdb/storage/data_table.hpp +++ b/src/include/duckdb/storage/data_table.hpp @@ -119,8 +119,7 @@ class DataTable { //! Returns the index into the optimistic_collections vector for newly created collection. PhysicalIndex CreateOptimisticCollection(ClientContext &context, unique_ptr collection); //! Returns the optimistic row group collection corresponding to the index. - optional_ptr GetOptimisticCollection(ClientContext &context, - const PhysicalIndex collection_index); + RowGroupCollection &GetOptimisticCollection(ClientContext &context, const PhysicalIndex collection_index); //! 
Resets the optimistic row group collection corresponding to the index. void ResetOptimisticCollection(ClientContext &context, const PhysicalIndex collection_index); //! Create an optimistic writer for this table. Used for optimistically writing parallel appends. diff --git a/src/include/duckdb/transaction/local_storage.hpp b/src/include/duckdb/transaction/local_storage.hpp index 58fda0931bdf..a71ea9eacbaa 100644 --- a/src/include/duckdb/transaction/local_storage.hpp +++ b/src/include/duckdb/transaction/local_storage.hpp @@ -81,7 +81,7 @@ class LocalTableStorage : public enable_shared_from_this { //! Returns the index into the optimistic_collections vector for newly created collection. PhysicalIndex CreateOptimisticCollection(unique_ptr collection); //! Returns the optimistic row group collection corresponding to the index. - optional_ptr GetOptimisticCollection(const PhysicalIndex collection_index); + RowGroupCollection &GetOptimisticCollection(const PhysicalIndex collection_index); //! Resets the optimistic row group collection corresponding to the index. void ResetOptimisticCollection(const PhysicalIndex collection_index); //! Create an optimistic writer for this table. @@ -147,7 +147,7 @@ class LocalStorage { //! Returns the index into the optimistic_collections vector for newly created collection. PhysicalIndex CreateOptimisticCollection(DataTable &table, unique_ptr collection); //! Returns the optimistic row group collection corresponding to the index. - optional_ptr GetOptimisticCollection(DataTable &table, const PhysicalIndex collection_index); + RowGroupCollection &GetOptimisticCollection(DataTable &table, const PhysicalIndex collection_index); //! Resets the optimistic row group collection corresponding to the index. void ResetOptimisticCollection(DataTable &table, const PhysicalIndex collection_index); //! Create an optimistic writer for this table. 
diff --git a/src/storage/data_table.cpp b/src/storage/data_table.cpp index c717c6697da3..0620c939ba04 100644 --- a/src/storage/data_table.cpp +++ b/src/storage/data_table.cpp @@ -853,8 +853,7 @@ PhysicalIndex DataTable::CreateOptimisticCollection(ClientContext &context, uniq return local_storage.CreateOptimisticCollection(*this, std::move(collection)); } -optional_ptr DataTable::GetOptimisticCollection(ClientContext &context, - const PhysicalIndex collection_index) { +RowGroupCollection &DataTable::GetOptimisticCollection(ClientContext &context, const PhysicalIndex collection_index) { auto &local_storage = LocalStorage::Get(context, db); return local_storage.GetOptimisticCollection(*this, collection_index); } diff --git a/src/storage/local_storage.cpp b/src/storage/local_storage.cpp index 3e36e0ba55d8..929aef5ebb3d 100644 --- a/src/storage/local_storage.cpp +++ b/src/storage/local_storage.cpp @@ -239,12 +239,9 @@ PhysicalIndex LocalTableStorage::CreateOptimisticCollection(unique_ptr LocalTableStorage::GetOptimisticCollection(const PhysicalIndex collection_index) { +RowGroupCollection &LocalTableStorage::GetOptimisticCollection(const PhysicalIndex collection_index) { lock_guard l(collections_lock); auto &collection = optimistic_collections[collection_index.index]; - if (collection == nullptr) { - return nullptr; - } return *collection; } @@ -479,8 +476,7 @@ PhysicalIndex LocalStorage::CreateOptimisticCollection(DataTable &table, unique_ return storage.CreateOptimisticCollection(std::move(collection)); } -optional_ptr LocalStorage::GetOptimisticCollection(DataTable &table, - const PhysicalIndex collection_index) { +RowGroupCollection &LocalStorage::GetOptimisticCollection(DataTable &table, const PhysicalIndex collection_index) { auto &storage = table_manager.GetOrCreateStorage(context, table); return storage.GetOptimisticCollection(collection_index); } From 9ebf87d7e20752058047b9555e5137e16afb134a Mon Sep 17 00:00:00 2001 From: Tmonster Date: Mon, 27 Jan 2025 
09:14:14 +0100 Subject: [PATCH 010/142] fix broken test --- src/optimizer/join_order/relation_manager.cpp | 15 +++++---------- .../optimizer/remove_unnecessary_projections.test | 8 ++++++++ 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/optimizer/join_order/relation_manager.cpp b/src/optimizer/join_order/relation_manager.cpp index d4f7032d676d..ccd68101e5e1 100644 --- a/src/optimizer/join_order/relation_manager.cpp +++ b/src/optimizer/join_order/relation_manager.cpp @@ -65,19 +65,14 @@ void RelationManager::AddRelation(LogicalOperator &op, optional_ptr:.*PROJECTION.*PROJECTION.* + +statement ok +pragma explain_output='optimized_only'; + +query II +explain select a b from (select b a from (select a b from values (1), (2), (3) t(a))); +---- +logical_opt :.*PROJECTION.*PROJECTION.*PROJECTION.* \ No newline at end of file From 2a2d359d1ed4777002231e875a33ace9ae447b67 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Mon, 27 Jan 2025 12:40:46 +0100 Subject: [PATCH 011/142] generate metric enum --- src/common/enums/metric_type.cpp | 15 ++++++++------- src/include/duckdb/common/enums/metric_type.hpp | 2 +- src/optimizer/remove_useless_projections.cpp | 15 ++++++++++----- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/common/enums/metric_type.cpp b/src/common/enums/metric_type.cpp index 0477f2a4f672..24793fdcdecc 100644 --- a/src/common/enums/metric_type.cpp +++ b/src/common/enums/metric_type.cpp @@ -40,6 +40,7 @@ profiler_settings_t MetricsUtils::GetOptimizerMetrics() { MetricsType::OPTIMIZER_MATERIALIZED_CTE, MetricsType::OPTIMIZER_SUM_REWRITER, MetricsType::OPTIMIZER_LATE_MATERIALIZATION, + MetricsType::OPTIMIZER_REMOVE_USELESS_PROJECTIONS, }; } @@ -66,8 +67,6 @@ MetricsType MetricsUtils::GetOptimizerMetricByType(OptimizerType type) { return MetricsType::OPTIMIZER_FILTER_PUSHDOWN; case OptimizerType::EMPTY_RESULT_PULLUP: return MetricsType::OPTIMIZER_EMPTY_RESULT_PULLUP; - case OptimizerType::REMOVE_USELESS_PROJECTIONS: - return 
MetricsType::OPTIMIZER_REMOVE_USELESS_PROJECTIONS; case OptimizerType::CTE_FILTER_PUSHER: return MetricsType::OPTIMIZER_CTE_FILTER_PUSHER; case OptimizerType::REGEX_RANGE: @@ -114,6 +113,8 @@ MetricsType MetricsUtils::GetOptimizerMetricByType(OptimizerType type) { return MetricsType::OPTIMIZER_SUM_REWRITER; case OptimizerType::LATE_MATERIALIZATION: return MetricsType::OPTIMIZER_LATE_MATERIALIZATION; + case OptimizerType::REMOVE_USELESS_PROJECTIONS: + return MetricsType::OPTIMIZER_REMOVE_USELESS_PROJECTIONS; default: throw InternalException("OptimizerType %s cannot be converted to a MetricsType", EnumUtil::ToString(type)); }; @@ -155,8 +156,6 @@ OptimizerType MetricsUtils::GetOptimizerTypeByMetric(MetricsType type) { return OptimizerType::BUILD_SIDE_PROBE_SIDE; case MetricsType::OPTIMIZER_LIMIT_PUSHDOWN: return OptimizerType::LIMIT_PUSHDOWN; - case MetricsType::OPTIMIZER_REMOVE_USELESS_PROJECTIONS: - return OptimizerType::REMOVE_USELESS_PROJECTIONS; case MetricsType::OPTIMIZER_TOP_N: return OptimizerType::TOP_N; case MetricsType::OPTIMIZER_COMPRESSED_MATERIALIZATION: @@ -174,9 +173,11 @@ OptimizerType MetricsUtils::GetOptimizerTypeByMetric(MetricsType type) { case MetricsType::OPTIMIZER_MATERIALIZED_CTE: return OptimizerType::MATERIALIZED_CTE; case MetricsType::OPTIMIZER_SUM_REWRITER: - return OptimizerType::SUM_REWRITER; + return OptimizerType::SUM_REWRITER; case MetricsType::OPTIMIZER_LATE_MATERIALIZATION: - return OptimizerType::LATE_MATERIALIZATION; + return OptimizerType::LATE_MATERIALIZATION; + case MetricsType::OPTIMIZER_REMOVE_USELESS_PROJECTIONS: + return OptimizerType::REMOVE_USELESS_PROJECTIONS; default: return OptimizerType::INVALID; }; @@ -210,8 +211,8 @@ bool MetricsUtils::IsOptimizerMetric(MetricsType type) { case MetricsType::OPTIMIZER_EXTENSION: case MetricsType::OPTIMIZER_MATERIALIZED_CTE: case MetricsType::OPTIMIZER_SUM_REWRITER: - case MetricsType::OPTIMIZER_REMOVE_USELESS_PROJECTIONS: case MetricsType::OPTIMIZER_LATE_MATERIALIZATION: + case 
MetricsType::OPTIMIZER_REMOVE_USELESS_PROJECTIONS: return true; default: return false; diff --git a/src/include/duckdb/common/enums/metric_type.hpp b/src/include/duckdb/common/enums/metric_type.hpp index bd938779900d..825e5c5369e8 100644 --- a/src/include/duckdb/common/enums/metric_type.hpp +++ b/src/include/duckdb/common/enums/metric_type.hpp @@ -69,7 +69,7 @@ enum class MetricsType : uint8_t { OPTIMIZER_MATERIALIZED_CTE, OPTIMIZER_SUM_REWRITER, OPTIMIZER_LATE_MATERIALIZATION, - OPTIMIZER_REMOVE_USELESS_PROJECTIONS, + OPTIMIZER_REMOVE_USELESS_PROJECTIONS, }; struct MetricsTypeHashFunction { diff --git a/src/optimizer/remove_useless_projections.cpp b/src/optimizer/remove_useless_projections.cpp index 74ae42b30c87..736d1bcd4e88 100644 --- a/src/optimizer/remove_useless_projections.cpp +++ b/src/optimizer/remove_useless_projections.cpp @@ -1,4 +1,5 @@ #include "duckdb/optimizer/remove_useless_projections.hpp" +#include "duckdb/planner/operator/logical_projection.hpp" #include "duckdb/common/enums/logical_operator_type.hpp" namespace duckdb { @@ -11,17 +12,21 @@ unique_ptr RemoveUselessProjections::RemoveProjectionsChildren( } unique_ptr RemoveUselessProjections::RemoveProjections(unique_ptr op) { - if (op->type == LogicalOperatorType::LOGICAL_UNION || op->type == LogicalOperatorType::LOGICAL_EXCEPT || - op->type == LogicalOperatorType::LOGICAL_INTERSECT || op->type == LogicalOperatorType::LOGICAL_RECURSIVE_CTE || - op->type == LogicalOperatorType::LOGICAL_MATERIALIZED_CTE) { - // guaranteed to find a projection under this that is meant to keep the column order in the presence of - // an optimization done by build side probe side. 
+ switch (op->type) { + case LogicalOperatorType::LOGICAL_UNION: + case LogicalOperatorType::LOGICAL_EXCEPT: + case LogicalOperatorType::LOGICAL_RECURSIVE_CTE: + case LogicalOperatorType::LOGICAL_INTERSECT: + case LogicalOperatorType::LOGICAL_MATERIALIZED_CTE: { for (idx_t i = 0; i < op->children.size(); i++) { first_projection = true; op->children[i] = RemoveProjections(std::move(op->children[i])); } return op; } + default: + break; + } if (op->type != LogicalOperatorType::LOGICAL_PROJECTION) { return RemoveProjectionsChildren(std::move(op)); } From 4f17c52a6f83467e1231f8915c1ef401361fd009 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Mon, 27 Jan 2025 12:58:20 +0100 Subject: [PATCH 012/142] add missing includes --- src/optimizer/remove_useless_projections.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/optimizer/remove_useless_projections.cpp b/src/optimizer/remove_useless_projections.cpp index 736d1bcd4e88..9ed9d7a93cf2 100644 --- a/src/optimizer/remove_useless_projections.cpp +++ b/src/optimizer/remove_useless_projections.cpp @@ -1,4 +1,5 @@ #include "duckdb/optimizer/remove_useless_projections.hpp" +#include "duckdb/planner/expression/bound_columnref_expression.hpp" #include "duckdb/planner/operator/logical_projection.hpp" #include "duckdb/common/enums/logical_operator_type.hpp" From f024e1d663f7a641a18dad788d3824e88419c597 Mon Sep 17 00:00:00 2001 From: taniabogatsch <44262898+taniabogatsch@users.noreply.github.com> Date: Mon, 27 Jan 2025 13:11:46 +0100 Subject: [PATCH 013/142] tidying some stuff up --- .../persistent/physical_batch_insert.cpp | 104 ++++++++++-------- src/storage/local_storage.cpp | 2 + 2 files changed, 59 insertions(+), 47 deletions(-) diff --git a/src/execution/operator/persistent/physical_batch_insert.cpp b/src/execution/operator/persistent/physical_batch_insert.cpp index 17ed86ea6767..097341d97c02 100644 --- a/src/execution/operator/persistent/physical_batch_insert.cpp +++ 
b/src/execution/operator/persistent/physical_batch_insert.cpp @@ -148,7 +148,7 @@ class BatchInsertTask { class BatchInsertGlobalState : public GlobalSinkState { public: - BatchInsertGlobalState(ClientContext &context, DuckTableEntry &table, idx_t minimum_memory_per_thread) + BatchInsertGlobalState(ClientContext &context, DuckTableEntry &table, const idx_t minimum_memory_per_thread) : memory_manager(context, minimum_memory_per_thread), table(table), insert_count(0), optimistically_written(false), minimum_memory_per_thread(minimum_memory_per_thread) { row_group_size = table.GetStorage().GetRowGroupSize(); @@ -231,13 +231,15 @@ class MergeCollectionTask : public BatchInsertTask { // Merge the collections. D_ASSERT(l_state.writer); - auto collection_index = g_state.MergeCollections(context, merge_collections, *l_state.writer); + auto result_collection_index = g_state.MergeCollections(context, merge_collections, *l_state.writer); + merge_collections.clear(); - // Add the result collection to the set of batch indexes. lock_guard l(g_state.lock); - auto &result_collection = g_state.table.GetStorage().GetOptimisticCollection(context, collection_index); - RowGroupBatchEntry new_entry(result_collection, merged_batch_index, collection_index, + auto &result_collection = g_state.table.GetStorage().GetOptimisticCollection(context, result_collection_index); + RowGroupBatchEntry new_entry(result_collection, merged_batch_index, result_collection_index, RowGroupBatchType::FLUSHED); + + // Add the result collection to the set of batch indexes. 
auto it = std::lower_bound( g_state.collections.begin(), g_state.collections.end(), new_entry, [&](const RowGroupBatchEntry &a, const RowGroupBatchEntry &b) { return a.batch_idx < b.batch_idx; }); @@ -339,6 +341,7 @@ void BatchInsertGlobalState::ScheduleMergeTasks(ClientContext &context, const id merge_collections.push_back(added_entry); entry.total_rows = scheduled_task.total_count; entry.type = RowGroupBatchType::FLUSHED; + entry.collection_index = PhysicalIndex(DConstants::INVALID_INDEX); } task_manager.AddTask(make_uniq(std::move(merge_collections), merged_batch_index)); } @@ -567,8 +570,9 @@ SinkCombineResultType PhysicalBatchInsert::Combine(ExecutionContext &context, Op auto &collection = gstate.table.GetStorage().GetOptimisticCollection(context.client, lstate.collection_index); collection.FinalizeAppend(tdata, lstate.current_append_state); if (collection.GetTotalRows() > 0) { - gstate.AddCollection(context.client, lstate.current_index, lstate.partition_info.min_batch_index.GetIndex(), - lstate.collection_index); + auto batch_index = lstate.partition_info.min_batch_index.GetIndex(); + gstate.AddCollection(context.client, lstate.current_index, batch_index, lstate.collection_index); + lstate.collection_index = PhysicalIndex(DConstants::INVALID_INDEX); } } if (lstate.writer) { @@ -588,18 +592,18 @@ SinkCombineResultType PhysicalBatchInsert::Combine(ExecutionContext &context, Op //===--------------------------------------------------------------------===// SinkFinalizeType PhysicalBatchInsert::Finalize(Pipeline &pipeline, Event &event, ClientContext &context, OperatorSinkFinalizeInput &input) const { - auto &gstate = input.global_state.Cast(); - auto &memory_manager = gstate.memory_manager; - auto &data_table = gstate.table.GetStorage(); + auto &g_state = input.global_state.Cast(); + auto &table = g_state.table; + auto &data_table = g_state.table.GetStorage(); + auto &memory_manager = g_state.memory_manager; - if (gstate.optimistically_written || 
gstate.insert_count >= gstate.row_group_size) { + if (g_state.optimistically_written || g_state.insert_count >= g_state.row_group_size) { // we have written data to disk optimistically or are inserting a large amount of data // perform a final pass over all of the row groups and merge them together vector> mergers; unique_ptr current_merger; - auto &storage = gstate.table.GetStorage(); - for (auto &entry : gstate.collections) { + for (auto &entry : g_state.collections) { if (entry.type == RowGroupBatchType::NOT_FLUSHED) { // this collection has not been flushed: add it to the merge set if (!current_merger) { @@ -607,19 +611,22 @@ SinkFinalizeType PhysicalBatchInsert::Finalize(Pipeline &pipeline, Event &event, } current_merger->AddCollection(entry.collection_index, entry.type); memory_manager.ReduceUnflushedMemory(entry.unflushed_memory); - } else { - // this collection has been flushed: it does not need to be merged - // create a separate collection merger only for this entry - if (current_merger) { - // we have small collections remaining: flush them - mergers.push_back(std::move(current_merger)); - current_merger.reset(); - } - auto larger_merger = make_uniq(context, data_table); - larger_merger->AddCollection(entry.collection_index, entry.type); - mergers.push_back(std::move(larger_merger)); + continue; + } + + // This collection has been flushed, so it does not need to be merged. + // Create a separate collection merger for it. + if (current_merger) { + // Flush any remaining small allocations. 
+ mergers.push_back(std::move(current_merger)); + current_merger.reset(); } + auto larger_merger = make_uniq(context, data_table); + larger_merger->AddCollection(entry.collection_index, entry.type); + mergers.push_back(std::move(larger_merger)); } + + g_state.collections.clear(); if (current_merger) { mergers.push_back(std::move(current_merger)); } @@ -627,7 +634,7 @@ SinkFinalizeType PhysicalBatchInsert::Finalize(Pipeline &pipeline, Event &event, // now that we have created all of the mergers, perform the actual merging vector final_collections; final_collections.reserve(mergers.size()); - auto &writer = storage.CreateOptimisticWriter(context); + auto &writer = data_table.CreateOptimisticWriter(context); for (auto &merger : mergers) { final_collections.push_back(merger->Flush(writer)); } @@ -635,33 +642,36 @@ SinkFinalizeType PhysicalBatchInsert::Finalize(Pipeline &pipeline, Event &event, // finally, merge the row groups into the local storage for (const auto collection_index : final_collections) { auto &collection = data_table.GetOptimisticCollection(context, collection_index); - storage.LocalMerge(context, collection); + data_table.LocalMerge(context, collection); data_table.ResetOptimisticCollection(context, collection_index); } - storage.FinalizeOptimisticWriter(context, writer); - } else { - // we are writing a small amount of data to disk - // append directly to transaction local storage - auto &table = gstate.table; - auto &storage = table.GetStorage(); - LocalAppendState append_state; - storage.InitializeLocalAppend(append_state, table, context, bound_constraints); - auto &transaction = DuckTransaction::Get(context, table.catalog); - for (auto &entry : gstate.collections) { - if (entry.type != RowGroupBatchType::NOT_FLUSHED) { - throw InternalException("Encountered a flushed batch"); - } - memory_manager.ReduceUnflushedMemory(entry.unflushed_memory); - auto &collection = data_table.GetOptimisticCollection(context, entry.collection_index); - 
collection.Scan(transaction, [&](DataChunk &insert_chunk) { - storage.LocalAppend(append_state, context, insert_chunk, false); - return true; - }); - data_table.ResetOptimisticCollection(context, entry.collection_index); + data_table.FinalizeOptimisticWriter(context, writer); + memory_manager.FinalCheck(); + return SinkFinalizeType::READY; + } + + // We are writing a small amount of data to disk. + // Thus, we append directly to the transaction local storage. + LocalAppendState append_state; + data_table.InitializeLocalAppend(append_state, table, context, bound_constraints); + auto &transaction = DuckTransaction::Get(context, table.catalog); + for (auto &entry : g_state.collections) { + if (entry.type != RowGroupBatchType::NOT_FLUSHED) { + throw InternalException("Encountered a flushed batch"); } - storage.FinalizeLocalAppend(append_state); + + memory_manager.ReduceUnflushedMemory(entry.unflushed_memory); + auto &collection = data_table.GetOptimisticCollection(context, entry.collection_index); + collection.Scan(transaction, [&](DataChunk &insert_chunk) { + data_table.LocalAppend(append_state, context, insert_chunk, false); + return true; + }); + data_table.ResetOptimisticCollection(context, entry.collection_index); } + + g_state.collections.clear(); + data_table.FinalizeLocalAppend(append_state); memory_manager.FinalCheck(); return SinkFinalizeType::READY; } diff --git a/src/storage/local_storage.cpp b/src/storage/local_storage.cpp index 929aef5ebb3d..9a1a880e7413 100644 --- a/src/storage/local_storage.cpp +++ b/src/storage/local_storage.cpp @@ -276,7 +276,9 @@ void LocalTableStorage::Rollback() { for (auto &writer : optimistic_writers) { writer->Rollback(); } + optimistic_writers.clear(); optimistic_writer.Rollback(); + for (auto &collection : optimistic_collections) { if (!collection) { continue; From 3ff8737a4396748b99ed90ddc65c56121bf956d8 Mon Sep 17 00:00:00 2001 From: Tishj Date: Mon, 27 Jan 2025 16:06:57 +0100 Subject: [PATCH 014/142] this aligns the 
behavior of varchar->list with that of varchar->struct --- src/function/cast/vector_cast_helpers.cpp | 7 +--- test/sql/cast/string_to_list_cast.test | 48 +++++++++++------------ 2 files changed, 26 insertions(+), 29 deletions(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index c7aa523eaa0d..9dfcfa3d3b0c 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -141,12 +141,9 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { idx_t struct_lvl = 0; SkipToClose(pos, buf, len, struct_lvl, '}'); } else if (buf[pos] == ',' || buf[pos] == ']') { - idx_t trailing_whitespace = 0; - while (StringUtil::CharacterIsSpace(buf[pos - trailing_whitespace - 1])) { - trailing_whitespace++; - } + auto trimmed_pos = StringTrim(buf, start_pos, pos); if (buf[pos] != ']' || start_pos != pos || seen_value) { - state.HandleValue(buf, start_pos, pos - trailing_whitespace); + state.HandleValue(buf, start_pos, trimmed_pos); seen_value = true; } if (buf[pos] == ']') { diff --git a/test/sql/cast/string_to_list_cast.test b/test/sql/cast/string_to_list_cast.test index 45a84603810a..ff1c56ac87e5 100644 --- a/test/sql/cast/string_to_list_cast.test +++ b/test/sql/cast/string_to_list_cast.test @@ -16,7 +16,7 @@ SELECT '[12,13,14]'::INT[]; query I SELECT '["hello", "world", "!"]'::VARCHAR[]; ---- -["hello", "world", "!"] +[hello, world, !] query I SELECT CAST('[Hello World!]' AS VARCHAR[]); @@ -110,8 +110,8 @@ INSERT INTO stringList VALUES ('["hello","world","!"]'), ('["Amazing","text"]'), query I SELECT col1::VARCHAR[] FROM stringList; ---- -["hello", "world", "!"] -["Amazing", "text"] +[hello, world, !] +[Amazing, text] [Hello World!] 
# --------------------------------------------------- @@ -124,8 +124,8 @@ INSERT INTO nestedStrings VALUES ('[["hello"], ["world"],["!"]]'), ('[["Amazing" query I SELECT col1::VARCHAR[][] FROM nestedStrings; ---- -[["hello"], ["world"], ["!"]] -[["Amazing"], ["text"]] +[[hello], [world], [!]] +[[Amazing], [text]] [[Hello World!]] # --------------------------------------------------- @@ -138,8 +138,8 @@ INSERT INTO superNestedStrings VALUES ('[[[[["hello"]]], [[["world"],["!"]]]]]') query I SELECT col1::VARCHAR[][][][][] FROM superNestedStrings; ---- -[[[[["hello"]]], [[["world"], ["!"]]]]] -[[[[["Amazing"]], [["text"]]]]] +[[[[[hello]]], [[[world], [!]]]]] +[[[[[Amazing]], [[text]]]]] [[[[[Hello World!]]]]] # --------------------------------------------------- @@ -201,39 +201,39 @@ SELECT col1::INT[][][][][][] FROM crazyNested; # Quote handling # --------------------------------------------------- query I -SELECT CAST('[''hello'',''world'', ''!'']' AS VARCHAR[]); +SELECT CAST($$['hello','world', '!']$$ AS VARCHAR[]); ---- -['hello', 'world', '!'] +[hello, world, !] 
query I -SELECT CAST('[''''hello'''',''''world'''', ''''!'''']' AS VARCHAR[]); +SELECT CAST($$[''hello'',''world'', ''!'']$$ AS VARCHAR[]); ---- -[''hello'', ''world'', ''!''] +['hello', 'world', '!'] query I -SELECT CAST('[[ [''🦆, 🦆, 🦆'']], [[duck, db, ''🦆''] ]]' AS VARCHAR[][][]); +SELECT CAST($$[[ ['🦆, 🦆, 🦆']], [[duck, db, '🦆'] ]]$$ AS VARCHAR[][][]); ---- -[[['🦆, 🦆, 🦆']], [[duck, db, '🦆']]] +[[[🦆, 🦆, 🦆]], [[duck, db, 🦆]]] query I -SELECT CAST('["can''t", "you''re", "i''m"]' AS VARCHAR[]); +SELECT CAST($$["can't", "you're", "i'm"]$$ AS VARCHAR[]); ---- -["can't", "you're", "i'm"] +[can't, you're, i'm] query I -SELECT CAST('[can''t, you''re, i''m]' AS VARCHAR[]); +SELECT CAST($$[can't, you're, i'm]$$ AS VARCHAR[]); ---- [can't, you're, i'm] query I -SELECT CAST('["]", "hello", "world"]' AS VARCHAR[]); +SELECT CAST($$["]", "hello", "world"]$$ AS VARCHAR[]); ---- -["]", "hello", "world"] +[], hello, world] query I -SELECT CAST('['']'', "hello", "world"]' AS VARCHAR[]); +SELECT CAST($$[']', "hello", "world"]$$ AS VARCHAR[]); ---- -[']', "hello", "world"] +[], hello, world] # Test for whitespaces @@ -249,9 +249,9 @@ SELECT CAST('[ [ [12, 13,14], [8, 9 ] ],[[ 4 ] ], [[[12, 13, 14], [8, 9]], [[4]], [[2, 1, 0]]] query I -SELECT CAST('[" hello"," '' world", "! "]' AS VARCHAR[]); +SELECT CAST($$[" hello"," ' world", "! "]$$ AS VARCHAR[]); ---- -[" hello", " ' world", "! "] +[ hello, ' world, ! ] query I SELECT CAST('[ hello , world , ! ]' AS VARCHAR[]); @@ -259,9 +259,9 @@ SELECT CAST('[ hello , world , ! ]' AS VARCHAR[]); [hello, world, !] query I -SELECT CAST('[ [ " hello"] ,[" world" ],[ "! " ] ]' AS VARCHAR[][]); +SELECT CAST($$[ [ " hello"] ,[" world" ],[ "! " ] ]$$ AS VARCHAR[][]); ---- -[[" hello"], [" world"], ["! "]] +[[ hello], [ world], [! 
]] # Empty list From edd27473333030b1b7ace05124a40681b4337bcb Mon Sep 17 00:00:00 2001 From: Tishj Date: Wed, 5 Feb 2025 15:37:28 +0100 Subject: [PATCH 015/142] WIP: support for escaping in string -> list/struct cast, struct isn't finished yet --- src/function/cast/vector_cast_helpers.cpp | 326 +++++++++++++++------- test/sql/cast/string_to_list_cast.test | 2 +- 2 files changed, 226 insertions(+), 102 deletions(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index 9dfcfa3d3b0c..64f26187e4dc 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -1,32 +1,56 @@ #include "duckdb/function/cast/vector_cast_helpers.hpp" +#include "duckdb/common/typedefs.hpp" + +namespace { + +struct StringCastInputState { +public: + StringCastInputState(const char *buf, idx_t &pos, idx_t &len) : buf(buf), pos(pos), len(len) { + } + +public: + const char *buf; + idx_t &pos; + idx_t &len; + bool escaped = false; +}; + +} // namespace namespace duckdb { // ------- Helper functions for splitting string nested types ------- -static bool IsNull(const char *buf, idx_t start_pos, Vector &child, idx_t row_idx) { - if ((buf[start_pos] == 'N' || buf[start_pos] == 'n') && (buf[start_pos + 1] == 'U' || buf[start_pos + 1] == 'u') && - (buf[start_pos + 2] == 'L' || buf[start_pos + 2] == 'l') && - (buf[start_pos + 3] == 'L' || buf[start_pos + 3] == 'l')) { - FlatVector::SetNull(child, row_idx, true); - return true; +static bool IsNull(StringCastInputState &input_state) { + auto &buf = input_state.buf; + auto &pos = input_state.pos; + if (input_state.pos + 4 != input_state.len) { + return false; } - return false; + return StringUtil::CIEquals(string(buf + pos, buf + pos + 4), "null"); } -inline static void SkipWhitespace(const char *buf, idx_t &pos, idx_t len) { +inline static void SkipWhitespace(StringCastInputState &input_state) { + auto &buf = input_state.buf; + auto &pos = input_state.pos; + auto 
&len = input_state.len; while (pos < len && StringUtil::CharacterIsSpace(buf[pos])) { pos++; + input_state.escaped = false; } } -static bool SkipToCloseQuotes(idx_t &pos, const char *buf, idx_t &len) { +static bool SkipToCloseQuotes(StringCastInputState &input_state) { + auto &buf = input_state.buf; + auto &pos = input_state.pos; + auto &len = input_state.len; + auto &escaped = input_state.escaped; + char quote = buf[pos]; pos++; - bool escaped = false; while (pos < len) { if (buf[pos] == '\\') { - escaped = !escaped; + escaped = true; } else { if (buf[pos] == quote && !escaped) { return true; @@ -38,48 +62,45 @@ static bool SkipToCloseQuotes(idx_t &pos, const char *buf, idx_t &len) { return false; } -static bool SkipToClose(idx_t &idx, const char *buf, idx_t &len, idx_t &lvl, char close_bracket) { +static bool SkipToClose(StringCastInputState &input_state, idx_t &lvl, char close_bracket) { + auto &idx = input_state.pos; + auto &buf = input_state.buf; + auto &len = input_state.len; + auto &escaped = input_state.escaped; idx++; vector brackets; brackets.push_back(close_bracket); while (idx < len) { - if (buf[idx] == '"' || buf[idx] == '\'') { - if (!SkipToCloseQuotes(idx, buf, len)) { - return false; - } - } else if (buf[idx] == '{') { - brackets.push_back('}'); - } else if (buf[idx] == '[') { - brackets.push_back(']'); - lvl++; - } else if (buf[idx] == brackets.back()) { - if (buf[idx] == ']') { - lvl--; - } - brackets.pop_back(); - if (brackets.empty()) { - return true; + if (!escaped) { + if (buf[idx] == '"' || buf[idx] == '\'') { + if (!SkipToCloseQuotes(input_state)) { + return false; + } + } else if (buf[idx] == '{') { + brackets.push_back('}'); + } else if (buf[idx] == '[') { + brackets.push_back(']'); + lvl++; + } else if (buf[idx] == brackets.back()) { + if (buf[idx] == ']') { + lvl--; + } + brackets.pop_back(); + if (brackets.empty()) { + return true; + } + } else if (buf[idx] == '\\') { + escaped = true; } + } else { + escaped = false; } idx++; } return 
false; } -static idx_t StringTrim(const char *buf, idx_t &start_pos, idx_t pos) { - idx_t trailing_whitespace = 0; - while (pos > start_pos && StringUtil::CharacterIsSpace(buf[pos - trailing_whitespace - 1])) { - trailing_whitespace++; - } - if ((buf[start_pos] == '"' && buf[pos - trailing_whitespace - 1] == '"') || - (buf[start_pos] == '\'' && buf[pos - trailing_whitespace - 1] == '\'')) { - start_pos++; - trailing_whitespace++; - } - return (pos - trailing_whitespace); -} - struct CountPartOperation { idx_t count = 0; @@ -94,25 +115,45 @@ struct CountPartOperation { // ------- LIST SPLIT ------- struct SplitStringListOperation { - SplitStringListOperation(string_t *child_data, idx_t &child_start, Vector &child) - : child_data(child_data), child_start(child_start), child(child) { +public: + SplitStringListOperation(string_t *child_data, idx_t &entry_count, Vector &child) + : child_data(child_data), entry_count(entry_count), child(child) { } - string_t *child_data; - idx_t &child_start; - Vector &child; - - void HandleValue(const char *buf, idx_t start_pos, idx_t pos) { - if ((pos - start_pos) == 4 && IsNull(buf, start_pos, child, child_start)) { - child_start++; +public: + void HandleValue(const char *buf, idx_t start, idx_t end) { + StringCastInputState temp_state(buf, start, end); + if (IsNull(temp_state)) { + FlatVector::SetNull(child, entry_count, true); + entry_count++; return; } - if (start_pos > pos) { - pos = start_pos; + D_ASSERT(start <= end); + auto length = end - start; + auto allocated_string = StringVector::EmptyString(child, length); + auto string_data = allocated_string.GetDataWriteable(); + uint32_t copied_count = 0; + bool escaped = false; + for (idx_t i = 0; i < length; i++) { + if (!escaped) { + if (buf[start + i] == '\\') { + escaped = true; + } else { + string_data[copied_count++] = buf[start + i]; + } + } else { + string_data[copied_count++] = buf[start + i]; + escaped = false; + } } - child_data[child_start] = StringVector::AddString(child, 
buf + start_pos, pos - start_pos); - child_start++; + child_data[entry_count] = string_t((const char *)string_data, copied_count); // NOLINT + entry_count++; } + +private: + string_t *child_data; + idx_t &entry_count; + Vector &child; }; template @@ -121,42 +162,94 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { idx_t len = input.GetSize(); idx_t lvl = 1; idx_t pos = 0; - bool seen_value = false; - SkipWhitespace(buf, pos, len); + StringCastInputState input_state(buf, pos, len); + + SkipWhitespace(input_state); if (pos == len || buf[pos] != '[') { + //! Does not have a valid list start return false; } - SkipWhitespace(buf, ++pos, len); - idx_t start_pos = pos; + //! Skip the '[' + pos++; + SkipWhitespace(input_state); + optional_idx start_pos; + idx_t end_pos; + bool seen_value = false; while (pos < len) { if (buf[pos] == '[') { - if (!SkipToClose(pos, buf, len, ++lvl, ']')) { - return false; + if (!start_pos.IsValid()) { + start_pos = pos; + } + //! Start of a LIST + if (!input_state.escaped) { + lvl++; + if (!SkipToClose(input_state, lvl, ']')) { + return false; + } + } + end_pos = pos; + } else if ((buf[pos] == '"' || buf[pos] == '\'')) { + if (!input_state.escaped) { + if (!start_pos.IsValid()) { + //! Trim the start quote + start_pos = pos + 1; + } + if (!SkipToCloseQuotes(input_state)) { + return false; + } + end_pos = pos - 1; + } else { + if (!start_pos.IsValid()) { + start_pos = pos; + } + end_pos = pos; } - } else if ((buf[pos] == '"' || buf[pos] == '\'') && pos == start_pos) { - SkipToCloseQuotes(pos, buf, len); } else if (buf[pos] == '{') { - idx_t struct_lvl = 0; - SkipToClose(pos, buf, len, struct_lvl, '}'); + if (!start_pos.IsValid()) { + start_pos = pos; + } + //! 
Start of a STRUCT + if (!input_state.escaped) { + idx_t struct_lvl = 0; + if (!SkipToClose(input_state, struct_lvl, '}')) { + return false; + } + } + end_pos = pos; } else if (buf[pos] == ',' || buf[pos] == ']') { - auto trimmed_pos = StringTrim(buf, start_pos, pos); - if (buf[pos] != ']' || start_pos != pos || seen_value) { - state.HandleValue(buf, start_pos, trimmed_pos); + if (buf[pos] != ']' || start_pos.IsValid() || seen_value) { + if (!start_pos.IsValid()) { + state.HandleValue(buf, 0, 0); + } else { + auto start = start_pos.GetIndex(); + auto end = (end_pos + 1) - start; + auto substr = std::string(buf + start, end); + state.HandleValue(buf, start, end_pos + 1); + } seen_value = true; } if (buf[pos] == ']') { lvl--; break; } - SkipWhitespace(buf, ++pos, len); - start_pos = pos; + pos++; + SkipWhitespace(input_state); + start_pos = optional_idx(); continue; + } else if (buf[pos] == '\\') { + input_state.escaped = true; + } else if (!StringUtil::CharacterIsSpace(buf[pos])) { + if (!start_pos.IsValid()) { + start_pos = pos; + } + end_pos = pos; } pos++; } - SkipWhitespace(buf, ++pos, len); + pos++; + SkipWhitespace(input_state); return (pos == len && lvl == 0); } @@ -187,7 +280,8 @@ struct SplitStringMapOperation { Vector &varchar_val; bool HandleKey(const char *buf, idx_t start_pos, idx_t pos) { - if ((pos - start_pos) == 4 && IsNull(buf, start_pos, varchar_key, child_start)) { + StringCastInputState temp_state(buf, start_pos, pos); + if (IsNull(temp_state)) { FlatVector::SetNull(varchar_val, child_start, true); child_start++; return false; @@ -197,7 +291,9 @@ struct SplitStringMapOperation { } void HandleValue(const char *buf, idx_t start_pos, idx_t pos) { - if ((pos - start_pos) == 4 && IsNull(buf, start_pos, varchar_val, child_start)) { + StringCastInputState temp_state(buf, start_pos, pos); + if (IsNull(temp_state)) { + FlatVector::SetNull(varchar_val, child_start, true); child_start++; return; } @@ -207,21 +303,30 @@ struct SplitStringMapOperation { }; 
template -static bool FindKeyOrValueMap(const char *buf, idx_t len, idx_t &pos, OP &state, bool key) { - auto start_pos = pos; +static bool FindKeyOrValueMap(StringCastInputState &input_state, OP &state, bool key) { + auto start_pos = input_state.pos; idx_t lvl = 0; + + auto &buf = input_state.buf; + auto &len = input_state.len; + auto &pos = input_state.pos; + while (pos < len) { if (buf[pos] == '"' || buf[pos] == '\'') { - SkipToCloseQuotes(pos, buf, len); + SkipToCloseQuotes(input_state); } else if (buf[pos] == '{') { - SkipToClose(pos, buf, len, lvl, '}'); + SkipToClose(input_state, lvl, '}'); } else if (buf[pos] == '[') { - SkipToClose(pos, buf, len, lvl, ']'); + SkipToClose(input_state, lvl, ']'); } else if (key && buf[pos] == '=') { - idx_t end_pos = StringTrim(buf, start_pos, pos); + // TODO: process the string + // idx_t end_pos = StringTrim(buf, start_pos, pos); + idx_t end_pos = pos; return state.HandleKey(buf, start_pos, end_pos); // put string in KEY_child_vector } else if (!key && (buf[pos] == ',' || buf[pos] == '}')) { - idx_t end_pos = StringTrim(buf, start_pos, pos); + // TODO: process the string + // idx_t end_pos = StringTrim(buf, start_pos, pos); + idx_t end_pos = pos; state.HandleValue(buf, start_pos, end_pos); // put string in VALUE_child_vector return true; } @@ -235,28 +340,33 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { const char *buf = input.GetData(); idx_t len = input.GetSize(); idx_t pos = 0; + StringCastInputState input_state(buf, pos, len); - SkipWhitespace(buf, pos, len); + SkipWhitespace(input_state); if (pos == len || buf[pos] != '{') { return false; } - SkipWhitespace(buf, ++pos, len); + pos++; + SkipWhitespace(input_state); if (pos == len) { return false; } if (buf[pos] == '}') { - SkipWhitespace(buf, ++pos, len); + pos++; + SkipWhitespace(input_state); return (pos == len); } while (pos < len) { - if (!FindKeyOrValueMap(buf, len, pos, state, true)) { + if (!FindKeyOrValueMap(input_state, state, 
true)) { return false; } - SkipWhitespace(buf, ++pos, len); - if (!FindKeyOrValueMap(buf, len, pos, state, false)) { + pos++; + SkipWhitespace(input_state); + if (!FindKeyOrValueMap(input_state, state, false)) { return false; } - SkipWhitespace(buf, ++pos, len); + pos++; + SkipWhitespace(input_state); } return true; } @@ -284,24 +394,31 @@ static bool FindKeyStruct(const char *buf, idx_t len, idx_t &pos) { return false; } -static bool FindValueStruct(const char *buf, idx_t len, idx_t &pos, Vector &varchar_child, idx_t &row_idx, +static bool FindValueStruct(StringCastInputState &input_state, Vector &varchar_child, idx_t &row_idx, ValidityMask &child_mask) { - auto start_pos = pos; + auto start_pos = input_state.pos; idx_t lvl = 0; + + auto &len = input_state.len; + auto &pos = input_state.pos; + auto &buf = input_state.buf; while (pos < len) { if (buf[pos] == '"' || buf[pos] == '\'') { - SkipToCloseQuotes(pos, buf, len); + SkipToCloseQuotes(input_state); } else if (buf[pos] == '{') { - SkipToClose(pos, buf, len, lvl, '}'); + SkipToClose(input_state, lvl, '}'); } else if (buf[pos] == '[') { - SkipToClose(pos, buf, len, lvl, ']'); + SkipToClose(input_state, lvl, ']'); } else if (buf[pos] == ',' || buf[pos] == '}') { - idx_t end_pos = StringTrim(buf, start_pos, pos); - if ((end_pos - start_pos) == 4 && IsNull(buf, start_pos, varchar_child, row_idx)) { + // TODO: start_pos at first non-whitespace character + StringCastInputState temp_state(buf, start_pos, pos); + if (IsNull(temp_state)) { + FlatVector::SetNull(varchar_child, row_idx, true); return true; } + // TODO: copy the unescaped portion of the string FlatVector::GetData(varchar_child)[row_idx] = - StringVector::AddString(varchar_child, buf + start_pos, end_pos - start_pos); + StringVector::AddString(varchar_child, buf + start_pos, pos - start_pos); child_mask.SetValid(row_idx); // any child not set to valid will remain invalid return true; } @@ -318,11 +435,14 @@ bool VectorStringToStruct::SplitStruct(const 
string_t &input, vector= key_end) { // empty key name unsupported return false; @@ -343,14 +465,16 @@ bool VectorStringToStruct::SplitStruct(const string_t &input, vectorsecond; - SkipWhitespace(buf, ++pos, len); - if (!FindValueStruct(buf, len, pos, *varchar_vectors[child_idx], row_idx, child_masks[child_idx].get())) { + pos++; + SkipWhitespace(input_state); + if (!FindValueStruct(input_state, *varchar_vectors[child_idx], row_idx, child_masks[child_idx].get())) { return false; } - SkipWhitespace(buf, ++pos, len); + pos++; + SkipWhitespace(input_state); } } - SkipWhitespace(buf, pos, len); + SkipWhitespace(input_state); return (pos == len); } diff --git a/test/sql/cast/string_to_list_cast.test b/test/sql/cast/string_to_list_cast.test index ff1c56ac87e5..8d9d8fee1879 100644 --- a/test/sql/cast/string_to_list_cast.test +++ b/test/sql/cast/string_to_list_cast.test @@ -221,7 +221,7 @@ SELECT CAST($$["can't", "you're", "i'm"]$$ AS VARCHAR[]); [can't, you're, i'm] query I -SELECT CAST($$[can't, you're, i'm]$$ AS VARCHAR[]); +SELECT CAST($$[can\'t, you\'re, i\'m]$$ AS VARCHAR[]); ---- [can't, you're, i'm] From 49558e9ef2ce67ddd76ba15733b5f04c8de742a4 Mon Sep 17 00:00:00 2001 From: Tishj Date: Wed, 5 Feb 2025 17:31:50 +0100 Subject: [PATCH 016/142] removing unescaped quotes, perhaps a little too aggressively, still WIP --- src/function/cast/vector_cast_helpers.cpp | 23 +++++++++++------------ test/sql/cast/string_to_list_cast.test | 8 ++++---- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index 64f26187e4dc..48719a2ec77d 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -138,7 +138,7 @@ struct SplitStringListOperation { if (!escaped) { if (buf[start + i] == '\\') { escaped = true; - } else { + } else if (buf[start + i] != '\'' && buf[start + i] != '"') { string_data[copied_count++] = buf[start + i]; } } else { @@ 
-191,21 +191,15 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { } end_pos = pos; } else if ((buf[pos] == '"' || buf[pos] == '\'')) { + if (!start_pos.IsValid()) { + start_pos = pos; + } if (!input_state.escaped) { - if (!start_pos.IsValid()) { - //! Trim the start quote - start_pos = pos + 1; - } if (!SkipToCloseQuotes(input_state)) { return false; } - end_pos = pos - 1; - } else { - if (!start_pos.IsValid()) { - start_pos = pos; - } - end_pos = pos; } + end_pos = pos; } else if (buf[pos] == '{') { if (!start_pos.IsValid()) { start_pos = pos; @@ -239,7 +233,12 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { start_pos = optional_idx(); continue; } else if (buf[pos] == '\\') { - input_state.escaped = true; + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + input_state.escaped = true; + } } else if (!StringUtil::CharacterIsSpace(buf[pos])) { if (!start_pos.IsValid()) { start_pos = pos; diff --git a/test/sql/cast/string_to_list_cast.test b/test/sql/cast/string_to_list_cast.test index 8d9d8fee1879..504f416973c1 100644 --- a/test/sql/cast/string_to_list_cast.test +++ b/test/sql/cast/string_to_list_cast.test @@ -206,7 +206,7 @@ SELECT CAST($$['hello','world', '!']$$ AS VARCHAR[]); [hello, world, !] query I -SELECT CAST($$[''hello'',''world'', ''!'']$$ AS VARCHAR[]); +SELECT CAST($$[\'hello\',\'world\', \'!\']$$ AS VARCHAR[]); ---- ['hello', 'world', '!'] @@ -216,7 +216,7 @@ SELECT CAST($$[[ ['🦆, 🦆, 🦆']], [[duck, db, '🦆'] ]]$$ AS VARCHAR[][][ [[[🦆, 🦆, 🦆]], [[duck, db, 🦆]]] query I -SELECT CAST($$["can't", "you're", "i'm"]$$ AS VARCHAR[]); +SELECT CAST($$[can\'t, you\'re, i\'m]$$ AS VARCHAR[]); ---- [can't, you're, i'm] @@ -249,7 +249,7 @@ SELECT CAST('[ [ [12, 13,14], [8, 9 ] ],[[ 4 ] ], [[[12, 13, 14], [8, 9]], [[4]], [[2, 1, 0]]] query I -SELECT CAST($$[" hello"," ' world", "! "]$$ AS VARCHAR[]); +SELECT CAST($$[" hello"," \' world", "! "]$$ AS VARCHAR[]); ---- [ hello, ' world, ! 
] @@ -259,7 +259,7 @@ SELECT CAST('[ hello , world , ! ]' AS VARCHAR[]); [hello, world, !] query I -SELECT CAST($$[ [ " hello"] ,[" world" ],[ "! " ] ]$$ AS VARCHAR[][]); +SELECT CAST($$[ [ \" hello\"] ,[\" world\" ],[ \"! \" ] ]$$ AS VARCHAR[][]); ---- [[ hello], [ world], [! ]] From 8866325e21854f44fd0dbcc52007c12afe8734ae Mon Sep 17 00:00:00 2001 From: Tishj Date: Wed, 5 Feb 2025 18:17:01 +0100 Subject: [PATCH 017/142] leave the escapes in deeper list levels alone --- src/function/cast/vector_cast_helpers.cpp | 36 ++++++++++++++++++++--- test/sql/cast/string_to_list_cast.test | 2 +- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index 48719a2ec77d..7eca9420aba6 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -1,5 +1,6 @@ #include "duckdb/function/cast/vector_cast_helpers.hpp" #include "duckdb/common/typedefs.hpp" +#include "duckdb/common/stack.hpp" namespace { @@ -134,15 +135,42 @@ struct SplitStringListOperation { auto string_data = allocated_string.GetDataWriteable(); uint32_t copied_count = 0; bool escaped = false; + + bool quoted = false; + char quote_char; + stack scopes; for (idx_t i = 0; i < length; i++) { + auto current_char = buf[start + i]; if (!escaped) { - if (buf[start + i] == '\\') { + if (scopes.empty() && current_char == '\\') { + //! Start of escape escaped = true; - } else if (buf[start + i] != '\'' && buf[start + i] != '"') { - string_data[copied_count++] = buf[start + i]; + continue; + } + if (scopes.empty() && (current_char == '\'' || current_char == '"')) { + if (quoted && current_char == quote_char) { + quoted = false; + //! Skip the ending quote + continue; + } else if (!quoted) { + quoted = true; + quote_char = current_char; + //! Skip the starting quote + continue; + } + } + if (!quoted && !scopes.empty() && current_char == scopes.top()) { + //! 
Close scope + scopes.pop(); + } + if (!quoted && (current_char == '[' || current_char == '{')) { + //! New scope + scopes.push(current_char == '[' ? ']' : '}'); } + //! Regular character + string_data[copied_count++] = current_char; } else { - string_data[copied_count++] = buf[start + i]; + string_data[copied_count++] = current_char; escaped = false; } } diff --git a/test/sql/cast/string_to_list_cast.test b/test/sql/cast/string_to_list_cast.test index 504f416973c1..c6e58cd99338 100644 --- a/test/sql/cast/string_to_list_cast.test +++ b/test/sql/cast/string_to_list_cast.test @@ -259,7 +259,7 @@ SELECT CAST('[ hello , world , ! ]' AS VARCHAR[]); [hello, world, !] query I -SELECT CAST($$[ [ \" hello\"] ,[\" world\" ],[ \"! \" ] ]$$ AS VARCHAR[][]); +SELECT CAST($$[ [ " hello"] ,[" world" ],[ "! " ] ]$$ AS VARCHAR[][]); ---- [[ hello], [ world], [! ]] From 595e4a8d8a42a2d7a130db9d4fe3b61fc69eccb2 Mon Sep 17 00:00:00 2001 From: Tishj Date: Thu, 6 Feb 2025 09:51:46 +0100 Subject: [PATCH 018/142] add escaped doublequote to test --- test/sql/cast/string_to_list_cast.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/sql/cast/string_to_list_cast.test b/test/sql/cast/string_to_list_cast.test index c6e58cd99338..c64a950651cf 100644 --- a/test/sql/cast/string_to_list_cast.test +++ b/test/sql/cast/string_to_list_cast.test @@ -249,9 +249,9 @@ SELECT CAST('[ [ [12, 13,14], [8, 9 ] ],[[ 4 ] ], [[[12, 13, 14], [8, 9]], [[4]], [[2, 1, 0]]] query I -SELECT CAST($$[" hello"," \' world", "! "]$$ AS VARCHAR[]); +SELECT CAST($$[" hello"," \"' world", "! "]$$ AS VARCHAR[]); ---- -[ hello, ' world, ! ] +[ hello, "' world, ! ] query I SELECT CAST('[ hello , world , ! 
]' AS VARCHAR[]); From bb5ca2ec8eb2843ac33a52250e2493efe6ba7ca0 Mon Sep 17 00:00:00 2001 From: Tishj Date: Thu, 6 Feb 2025 11:37:09 +0100 Subject: [PATCH 019/142] more WIP, worked on supporting the same escaping in MAP --- src/function/cast/vector_cast_helpers.cpp | 283 ++++++++++++++------- test/sql/cast/string_to_map_cast.test_slow | 8 +- 2 files changed, 194 insertions(+), 97 deletions(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index 7eca9420aba6..ba4eeaa67427 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -85,6 +85,9 @@ static bool SkipToClose(StringCastInputState &input_state, idx_t &lvl, char clos lvl++; } else if (buf[idx] == brackets.back()) { if (buf[idx] == ']') { + if (lvl == 0) { + return false; + } lvl--; } brackets.pop_back(); @@ -114,6 +117,55 @@ struct CountPartOperation { } }; +static string_t HandleString(Vector &vec, const char *buf, idx_t start, idx_t end) { + D_ASSERT(start <= end); + auto length = end - start; + auto allocated_string = StringVector::EmptyString(vec, length); + auto string_data = allocated_string.GetDataWriteable(); + uint32_t copied_count = 0; + bool escaped = false; + + bool quoted = false; + char quote_char; + stack scopes; + for (idx_t i = 0; i < length; i++) { + auto current_char = buf[start + i]; + if (!escaped) { + if (scopes.empty() && current_char == '\\') { + //! Start of escape + escaped = true; + continue; + } + if (scopes.empty() && (current_char == '\'' || current_char == '"')) { + if (quoted && current_char == quote_char) { + quoted = false; + //! Skip the ending quote + continue; + } else if (!quoted) { + quoted = true; + quote_char = current_char; + //! Skip the starting quote + continue; + } + } + if (!quoted && !scopes.empty() && current_char == scopes.top()) { + //! Close scope + scopes.pop(); + } + if (!quoted && (current_char == '[' || current_char == '{')) { + //! 
New scope + scopes.push(current_char == '[' ? ']' : '}'); + } + //! Regular character + string_data[copied_count++] = current_char; + } else { + string_data[copied_count++] = current_char; + escaped = false; + } + } + return string_t((const char *)string_data, copied_count); // NOLINT +} + // ------- LIST SPLIT ------- struct SplitStringListOperation { public: @@ -129,52 +181,7 @@ struct SplitStringListOperation { entry_count++; return; } - D_ASSERT(start <= end); - auto length = end - start; - auto allocated_string = StringVector::EmptyString(child, length); - auto string_data = allocated_string.GetDataWriteable(); - uint32_t copied_count = 0; - bool escaped = false; - - bool quoted = false; - char quote_char; - stack scopes; - for (idx_t i = 0; i < length; i++) { - auto current_char = buf[start + i]; - if (!escaped) { - if (scopes.empty() && current_char == '\\') { - //! Start of escape - escaped = true; - continue; - } - if (scopes.empty() && (current_char == '\'' || current_char == '"')) { - if (quoted && current_char == quote_char) { - quoted = false; - //! Skip the ending quote - continue; - } else if (!quoted) { - quoted = true; - quote_char = current_char; - //! Skip the starting quote - continue; - } - } - if (!quoted && !scopes.empty() && current_char == scopes.top()) { - //! Close scope - scopes.pop(); - } - if (!quoted && (current_char == '[' || current_char == '{')) { - //! New scope - scopes.push(current_char == '[' ? ']' : '}'); - } - //! 
Regular character - string_data[copied_count++] = current_char; - } else { - string_data[copied_count++] = current_char; - escaped = false; - } - } - child_data[entry_count] = string_t((const char *)string_data, copied_count); // NOLINT + child_data[entry_count] = HandleString(child, buf, start, end); entry_count++; } @@ -206,6 +213,11 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { idx_t end_pos; bool seen_value = false; while (pos < len) { + if (pos == len) { + return false; + } + bool set_escaped = false; + if (buf[pos] == '[') { if (!start_pos.IsValid()) { start_pos = pos; @@ -253,19 +265,19 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { seen_value = true; } if (buf[pos] == ']') { + if (lvl == 0) { + return false; + } lvl--; break; } - pos++; - SkipWhitespace(input_state); start_pos = optional_idx(); - continue; } else if (buf[pos] == '\\') { if (!start_pos.IsValid()) { start_pos = pos; } if (!input_state.escaped) { - input_state.escaped = true; + set_escaped = true; } } else if (!StringUtil::CharacterIsSpace(buf[pos])) { if (!start_pos.IsValid()) { @@ -273,7 +285,9 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { } end_pos = pos; } + input_state.escaped = set_escaped; pos++; + SkipWhitespace(input_state); } pos++; SkipWhitespace(input_state); @@ -313,7 +327,7 @@ struct SplitStringMapOperation { child_start++; return false; } - child_key_data[child_start] = StringVector::AddString(varchar_key, buf + start_pos, pos - start_pos); + child_key_data[child_start] = HandleString(varchar_key, buf, start_pos, pos); return true; } @@ -324,50 +338,18 @@ struct SplitStringMapOperation { child_start++; return; } - child_val_data[child_start] = StringVector::AddString(varchar_val, buf + start_pos, pos - start_pos); + child_val_data[child_start] = HandleString(varchar_val, buf, start_pos, pos); child_start++; } }; -template -static bool FindKeyOrValueMap(StringCastInputState &input_state, OP 
&state, bool key) { - auto start_pos = input_state.pos; - idx_t lvl = 0; - - auto &buf = input_state.buf; - auto &len = input_state.len; - auto &pos = input_state.pos; - - while (pos < len) { - if (buf[pos] == '"' || buf[pos] == '\'') { - SkipToCloseQuotes(input_state); - } else if (buf[pos] == '{') { - SkipToClose(input_state, lvl, '}'); - } else if (buf[pos] == '[') { - SkipToClose(input_state, lvl, ']'); - } else if (key && buf[pos] == '=') { - // TODO: process the string - // idx_t end_pos = StringTrim(buf, start_pos, pos); - idx_t end_pos = pos; - return state.HandleKey(buf, start_pos, end_pos); // put string in KEY_child_vector - } else if (!key && (buf[pos] == ',' || buf[pos] == '}')) { - // TODO: process the string - // idx_t end_pos = StringTrim(buf, start_pos, pos); - idx_t end_pos = pos; - state.HandleValue(buf, start_pos, end_pos); // put string in VALUE_child_vector - return true; - } - pos++; - } - return false; -} - template static bool SplitStringMapInternal(const string_t &input, OP &state) { const char *buf = input.GetData(); idx_t len = input.GetSize(); idx_t pos = 0; StringCastInputState input_state(buf, pos, len); + idx_t lvl = 0; SkipWhitespace(input_state); if (pos == len || buf[pos] != '{') { @@ -378,24 +360,139 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { if (pos == len) { return false; } - if (buf[pos] == '}') { - pos++; - SkipWhitespace(input_state); - return (pos == len); - } + while (pos < len) { - if (!FindKeyOrValueMap(input_state, state, true)) { + optional_idx start_pos; + idx_t end_pos; + while (pos < len && (buf[pos] != '=' || input_state.escaped)) { + bool set_escaped = false; + if (buf[pos] == '"' || buf[pos] == '\'') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + if (!SkipToCloseQuotes(input_state)) { + return false; + } + } + end_pos = pos; + } else if (buf[pos] == '{') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + if 
(!SkipToClose(input_state, lvl, '}')) { + return false; + } + } + end_pos = pos; + } else if (buf[pos] == '[') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + lvl++; + if (!SkipToClose(input_state, lvl, ']')) { + return false; + } + } + end_pos = pos; + } else if (buf[pos] == '\\') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + set_escaped = true; + } + } else if (!StringUtil::CharacterIsSpace(buf[pos])) { + if (!start_pos.IsValid()) { + start_pos = pos; + } + end_pos = pos; + } + input_state.escaped = set_escaped; + pos++; + } + if (pos == len) { + return false; + } + if (!start_pos.IsValid()) { + //! Key can not be empty return false; } + auto key_substr = std::string(buf + start_pos.GetIndex(), buf + end_pos + 1); + if (!state.HandleKey(buf, start_pos.GetIndex(), end_pos + 1)) { + return false; + } + start_pos = optional_idx(); pos++; SkipWhitespace(input_state); - if (!FindKeyOrValueMap(input_state, state, false)) { + while (pos < len && ((buf[pos] != ',' && buf[pos] != '}') || input_state.escaped)) { + if (buf[pos] == '"' || buf[pos] == '\'') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + if (!SkipToCloseQuotes(input_state)) { + return false; + } + } + end_pos = pos; + } else if (buf[pos] == '{') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + if (!SkipToClose(input_state, lvl, '}')) { + return false; + } + } + end_pos = pos; + } else if (buf[pos] == '[') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + lvl++; + if (!SkipToClose(input_state, lvl, ']')) { + return false; + } + } + end_pos = pos; + } else if (buf[pos] == '\\') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + input_state.escaped = true; + } + } else if (!StringUtil::CharacterIsSpace(buf[pos])) { + if (!start_pos.IsValid()) { + start_pos = pos; + } + end_pos = pos; + 
} + pos++; + } + if (pos == len) { return false; } + if (!start_pos.IsValid()) { + //! Value is empty + state.HandleValue(buf, 0, 0); + } else { + auto value_substr = std::string(buf + start_pos.GetIndex(), buf + end_pos + 1); + state.HandleValue(buf, start_pos.GetIndex(), end_pos + 1); + } + if (buf[pos] == '}') { + break; + } pos++; SkipWhitespace(input_state); } - return true; + pos++; + SkipWhitespace(input_state); + return (pos == len && lvl == 0); } bool VectorStringToMap::SplitStringMap(const string_t &input, string_t *child_key_data, string_t *child_val_data, diff --git a/test/sql/cast/string_to_map_cast.test_slow b/test/sql/cast/string_to_map_cast.test_slow index 3a5cd142860e..2c896fdc61fa 100644 --- a/test/sql/cast/string_to_map_cast.test_slow +++ b/test/sql/cast/string_to_map_cast.test_slow @@ -76,12 +76,12 @@ SELECT CAST('{''hello''=2, ''world''=50, ''!''=12}' AS MAP(VARCHAR, INT)); {hello=2, world=50, !=12} query I -SELECT CAST('{''''hello''''=hello, ''''world''''=world, ''''!''''=!}' AS MAP(VARCHAR, VARCHAR)); +SELECT CAST($${\'hello\'=hello, \'world\'=world, \'!\'=!}$$ AS MAP(VARCHAR, VARCHAR)); ---- {'hello'=hello, 'world'=world, '!'=!} query I -SELECT CAST('{[[''🦆, 🦆, 🦆'']]=100, [[duck, db, ''🦆'']]=101}' AS MAP(VARCHAR[][], INT)); +SELECT CAST($${[[\'🦆, 🦆, 🦆\']]=100, [[duck, db, \'🦆\']]=101}$$ AS MAP(VARCHAR[][], INT)); ---- {[['🦆, 🦆, 🦆']]=100, [[duck, db, '🦆']]=101} @@ -114,8 +114,8 @@ SELECT CAST('{ [12, 13,14]=val, [ 8, 9 ] =val, [ 4 ]=v {[12, 13, 14]=val, [8, 9]=val, [4]=val} query I -SELECT CAST(' { { a:[2, 3], b: Duckster }= {50.0 =50}, {a : [9,1,4], b:Duck } - ={ 1 = 0} }' AS MAP(STRUCT(a INT[], b VARCHAR), MAP(INT, DOUBLE))); +SELECT CAST($$ { { a:[2, 3], b: Duckster }= {50.0 =50}, {a : [9,1,4], b:Duck } + ={ 1 = 0} }$$ AS MAP(STRUCT(a INT[], b VARCHAR), MAP(INT, DOUBLE))); ---- {{'a': [2, 3], 'b': Duckster}={50=50.0}, {'a': [9, 1, 4], 'b': Duck}={1=0.0}} From 8de6cd55917b430904ffd8a0db8e3cf4dfdd85f3 Mon Sep 17 00:00:00 2001 From: Tishj 
Date: Thu, 6 Feb 2025 12:51:23 +0100 Subject: [PATCH 020/142] map, struct and list should all work correctly now --- src/function/cast/vector_cast_helpers.cpp | 229 +++++++++++++++------- test/sql/cast/string_to_list_cast.test | 17 +- 2 files changed, 173 insertions(+), 73 deletions(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index ba4eeaa67427..b6048c12eb53 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -50,14 +50,17 @@ static bool SkipToCloseQuotes(StringCastInputState &input_state) { pos++; while (pos < len) { + bool set_escaped = false; if (buf[pos] == '\\') { - escaped = true; + if (!escaped) { + set_escaped = true; + } } else { if (buf[pos] == quote && !escaped) { return true; } - escaped = false; } + escaped = set_escaped; pos++; } return false; @@ -324,6 +327,7 @@ struct SplitStringMapOperation { StringCastInputState temp_state(buf, start_pos, pos); if (IsNull(temp_state)) { FlatVector::SetNull(varchar_val, child_start, true); + FlatVector::SetNull(varchar_key, child_start, true); child_start++; return false; } @@ -360,6 +364,11 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { if (pos == len) { return false; } + if (buf[pos] == '}') { + pos++; + SkipWhitespace(input_state); + return pos == len; + } while (pos < len) { optional_idx start_pos; @@ -508,49 +517,6 @@ idx_t VectorStringToMap::CountPartsMap(const string_t &input) { } // ------- STRUCT SPLIT ------- -static bool FindKeyStruct(const char *buf, idx_t len, idx_t &pos) { - while (pos < len) { - if (buf[pos] == ':') { - return true; - } - pos++; - } - return false; -} - -static bool FindValueStruct(StringCastInputState &input_state, Vector &varchar_child, idx_t &row_idx, - ValidityMask &child_mask) { - auto start_pos = input_state.pos; - idx_t lvl = 0; - - auto &len = input_state.len; - auto &pos = input_state.pos; - auto &buf = input_state.buf; - while (pos < 
len) { - if (buf[pos] == '"' || buf[pos] == '\'') { - SkipToCloseQuotes(input_state); - } else if (buf[pos] == '{') { - SkipToClose(input_state, lvl, '}'); - } else if (buf[pos] == '[') { - SkipToClose(input_state, lvl, ']'); - } else if (buf[pos] == ',' || buf[pos] == '}') { - // TODO: start_pos at first non-whitespace character - StringCastInputState temp_state(buf, start_pos, pos); - if (IsNull(temp_state)) { - FlatVector::SetNull(varchar_child, row_idx, true); - return true; - } - // TODO: copy the unescaped portion of the string - FlatVector::GetData(varchar_child)[row_idx] = - StringVector::AddString(varchar_child, buf + start_pos, pos - start_pos); - child_mask.SetValid(row_idx); // any child not set to valid will remain invalid - return true; - } - pos++; - } - return false; -} - bool VectorStringToStruct::SplitStruct(const string_t &input, vector> &varchar_vectors, idx_t &row_idx, string_map_t &child_names, vector> &child_masks) { @@ -558,7 +524,9 @@ bool VectorStringToStruct::SplitStruct(const string_t &input, vector= key_end) { - // empty key name unsupported - return false; - } - string_t found_key(buf + key_start, UnsafeNumericCast(key_end - key_start)); + SkipWhitespace(input_state); + return (pos == len); + } - auto it = child_names.find(found_key); - if (it == child_names.end()) { - return false; // false key + while (pos < len) { + optional_idx start_pos; + idx_t end_pos; + while (pos < len && (buf[pos] != ':' || input_state.escaped)) { + bool set_escaped = false; + if (buf[pos] == '"' || buf[pos] == '\'') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + if (!SkipToCloseQuotes(input_state)) { + return false; + } + } + end_pos = pos; + } else if (buf[pos] == '{') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + if (!SkipToClose(input_state, lvl, '}')) { + return false; + } + } + end_pos = pos; + } else if (buf[pos] == '[') { + if (!start_pos.IsValid()) { + start_pos = pos; + 
} + if (!input_state.escaped) { + lvl++; + if (!SkipToClose(input_state, lvl, ']')) { + return false; + } + } + end_pos = pos; + } else if (buf[pos] == '\\') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + set_escaped = true; + } + } else if (!StringUtil::CharacterIsSpace(buf[pos])) { + if (!start_pos.IsValid()) { + start_pos = pos; + } + end_pos = pos; } - child_idx = it->second; + input_state.escaped = set_escaped; pos++; - SkipWhitespace(input_state); - if (!FindValueStruct(input_state, *varchar_vectors[child_idx], row_idx, child_masks[child_idx].get())) { - return false; + } + if (pos == len) { + return false; + } + if (!start_pos.IsValid()) { + //! Key can not be empty + return false; + } + idx_t key_start = start_pos.GetIndex(); + end_pos++; + StringCastInputState key_temp_state(buf, key_start, end_pos); + if (IsNull(key_temp_state)) { + //! Key can not be NULL + return false; + } + auto child_name = HandleString(temp_vec, buf, key_start, end_pos); + auto it = child_names.find(child_name); + if (it == child_names.end()) { + return false; // false key + } + child_idx = it->second; + + start_pos = optional_idx(); + pos++; + SkipWhitespace(input_state); + while (pos < len && ((buf[pos] != ',' && buf[pos] != '}') || input_state.escaped)) { + if (buf[pos] == '"' || buf[pos] == '\'') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + if (!SkipToCloseQuotes(input_state)) { + return false; + } + } + end_pos = pos; + } else if (buf[pos] == '{') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + if (!SkipToClose(input_state, lvl, '}')) { + return false; + } + } + end_pos = pos; + } else if (buf[pos] == '[') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + lvl++; + if (!SkipToClose(input_state, lvl, ']')) { + return false; + } + } + end_pos = pos; + } else if (buf[pos] == '\\') { + if (!start_pos.IsValid()) { + start_pos = 
pos; + } + if (!input_state.escaped) { + input_state.escaped = true; + } + } else if (!StringUtil::CharacterIsSpace(buf[pos])) { + if (!start_pos.IsValid()) { + start_pos = pos; + } + end_pos = pos; } pos++; - SkipWhitespace(input_state); } + if (pos == len) { + return false; + } + auto &child_vec = *varchar_vectors[child_idx]; + auto string_data = FlatVector::GetData(child_vec); + auto &child_mask = child_masks[child_idx].get(); + + if (!start_pos.IsValid()) { + start_pos = 0; + end_pos = 0; + } else { + end_pos++; + } + auto value_start = start_pos.GetIndex(); + StringCastInputState value_temp_state(buf, value_start, end_pos); + if (IsNull(value_temp_state)) { + child_mask.SetInvalid(row_idx); + } else { + string_data[row_idx] = HandleString(child_vec, buf, value_start, end_pos); + child_mask.SetValid(row_idx); + } + + if (buf[pos] == '}') { + break; + } + pos++; + SkipWhitespace(input_state); } + pos++; SkipWhitespace(input_state); return (pos == len); } diff --git a/test/sql/cast/string_to_list_cast.test b/test/sql/cast/string_to_list_cast.test index c64a950651cf..37f5b24e4ea8 100644 --- a/test/sql/cast/string_to_list_cast.test +++ b/test/sql/cast/string_to_list_cast.test @@ -488,7 +488,7 @@ statement ok CREATE TABLE assorted_lists(col1 INT[], col2 VARCHAR[], col3 DATE[]); statement ok -COPY (SELECT [8,7,6], '[hello, Duck''DB]', '[2022-12-2, 1929-01-25]') TO '__TEST_DIR__/assorted_lists.csv' (Header 0); +COPY (SELECT [8,7,6], $$[hello, Duck\\'DB]$$, '[2022-12-2, 1929-01-25]') TO '__TEST_DIR__/assorted_lists.csv' (Header 0); statement ok COPY assorted_lists FROM '__TEST_DIR__/assorted_lists.csv'; @@ -507,19 +507,24 @@ select '[{"bar":"\""}]'::VARCHAR[]; ---- [{"bar":"\""}] -# escaped '\', does not count as an escape for " statement error select '[{"bar":"\\""}]'::VARCHAR[]; ---- +# escapes are only processed once the {} is cast as well +statement error +query I +select '[{"bar":"\\""}]'::STRUCT(bar VARCHAR)[]; +---- + # uneven amount of escapes does escape the " 
query I -select '[{"bar":"\\\""}]'::VARCHAR[]; +select '[{"bar":"\\\""}]'::STRUCT(bar VARCHAR)[]; ---- -[{"bar":"\\\""}] +[{'bar': \"}] # all are escaped except for the last one query I -select '[{"bar":"\"\"\\\"\"\"\\"}]'::VARCHAR[]; +select '[{"bar":"\"\"\\\"\"\"\\"}]'::STRUCT(bar VARCHAR)[]; ---- -[{"bar":"\"\"\\\"\"\"\\"}] +[{'bar': ""\"""\}] From 9b9df847038a72c11a0dbd82af74f03d1e1134ce Mon Sep 17 00:00:00 2001 From: Tishj Date: Thu, 6 Feb 2025 13:09:28 +0100 Subject: [PATCH 021/142] messed up one piece of escape handling logic --- src/function/cast/vector_cast_helpers.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index b6048c12eb53..347d7102bfa1 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -437,6 +437,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { pos++; SkipWhitespace(input_state); while (pos < len && ((buf[pos] != ',' && buf[pos] != '}') || input_state.escaped)) { + bool set_escaped = false; if (buf[pos] == '"' || buf[pos] == '\'') { if (!start_pos.IsValid()) { start_pos = pos; @@ -473,7 +474,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { start_pos = pos; } if (!input_state.escaped) { - input_state.escaped = true; + set_escaped = true; } } else if (!StringUtil::CharacterIsSpace(buf[pos])) { if (!start_pos.IsValid()) { @@ -481,6 +482,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { } end_pos = pos; } + input_state.escaped = set_escaped; pos++; } if (pos == len) { From fe5a76f12b08a49f68c5a18b35e63c8c1f74d996 Mon Sep 17 00:00:00 2001 From: pdet Date: Mon, 10 Feb 2025 13:42:38 -0300 Subject: [PATCH 022/142] Do duckdb_extract_statements to be able to execute pivot --- data/csv/flights.csv | 4 ++++ src/common/adbc/adbc.cpp | 40 ++++++++++++++++++++++++++++++++++++- test/api/adbc/test_adbc.cpp | 13 
++++++++++++ 3 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 data/csv/flights.csv diff --git a/data/csv/flights.csv b/data/csv/flights.csv new file mode 100644 index 000000000000..7e8e451da4d1 --- /dev/null +++ b/data/csv/flights.csv @@ -0,0 +1,4 @@ +FlightDate|UniqueCarrier|OriginCityName|DestCityName +1988-01-01|AA|New York, NY|Los Angeles, CA +1988-01-02|AA|New York, NY|Los Angeles, CA +1988-01-03|AA|New York, NY|Los Angeles, CA diff --git a/src/common/adbc/adbc.cpp b/src/common/adbc/adbc.cpp index 35ceb2f3406f..09ac3aa5d8dc 100644 --- a/src/common/adbc/adbc.cpp +++ b/src/common/adbc/adbc.cpp @@ -875,8 +875,46 @@ AdbcStatusCode StatementSetSqlQuery(struct AdbcStatement *statement, const char duckdb_destroy_prepare(&wrapper->statement); wrapper->statement = nullptr; } - auto res = duckdb_prepare(wrapper->connection, query, &wrapper->statement); + duckdb_extracted_statements extracted_statements; + auto extract_statements_size = duckdb_extract_statements(wrapper->connection, query, &extracted_statements); + auto error_msg_extract_statements = duckdb_extract_statements_error(extracted_statements); + if (error_msg_extract_statements != nullptr) { + // Things went wrong when executing internal prepared statement + delete extracted_statements; + SetError(error, error_msg_extract_statements); + return ADBC_STATUS_INTERNAL; + } + // Now lets loop over the statements, and execute every one + for (idx_t i = 0; i < extract_statements_size - 1; i++) { + duckdb_prepared_statement statement_internal; + auto res = + duckdb_prepare_extracted_statement(wrapper->connection, extracted_statements, i, &statement_internal); + auto error_msg = duckdb_prepare_error(statement_internal); + auto adbc_status = CheckResult(res, error, error_msg); + if (adbc_status != ADBC_STATUS_OK) { + // Things went wrong when executing internal prepared statement + delete extracted_statements; + delete statement_internal; + return adbc_status; + } + // Execute + duckdb_arrow 
out_result; + res = duckdb_execute_prepared_arrow(statement_internal, &out_result); + if (res != DuckDBSuccess) { + SetError(error, duckdb_query_arrow_error(out_result)); + delete out_result; + delete statement_internal; + delete extracted_statements; + return ADBC_STATUS_INVALID_ARGUMENT; + } + delete out_result; + delete statement_internal; + } + // Besides ze last, this one we return + auto res = duckdb_prepare_extracted_statement(wrapper->connection, extracted_statements, + extract_statements_size - 1, &wrapper->statement); auto error_msg = duckdb_prepare_error(wrapper->statement); + delete extracted_statements; return CheckResult(res, error, error_msg); } diff --git a/test/api/adbc/test_adbc.cpp b/test/api/adbc/test_adbc.cpp index afcbb596d073..213ed7710dd0 100644 --- a/test/api/adbc/test_adbc.cpp +++ b/test/api/adbc/test_adbc.cpp @@ -194,6 +194,19 @@ TEST_CASE("ADBC - Test ingestion - Lineitem", "[adbc]") { REQUIRE(db.QueryAndCheck("SELECT l_partkey, l_comment FROM lineitem WHERE l_orderkey=1 ORDER BY l_linenumber")); } +TEST_CASE("ADBC - Pivot", "[adbc]") { + if (!duckdb_lib) { + return; + } + ADBCTestDatabase db; + + auto input_data = db.QueryArrow("SELECT * FROM read_csv_auto(\'data/csv/flights.csv\')"); + + db.CreateTable("flights", input_data); + + REQUIRE(db.QueryAndCheck("PIVOT flights ON UniqueCarrier USING COUNT(1) GROUP BY OriginCityName;")); +} + TEST_CASE("Test Null Error/Database", "[adbc]") { if (!duckdb_lib) { return; From 1dbbb6c1370706afcc12c9f7f53663eca447ddba Mon Sep 17 00:00:00 2001 From: Richard Wesley Date: Mon, 10 Feb 2025 12:02:48 -0800 Subject: [PATCH 023/142] Issue #8265: AsOf Nested Loop * For small probe cardinalities, plan a nested loop join + aggregate * Add asof_loop_join_threshold setting to control its use. 
--- src/common/settings.json | 6 + .../physical_plan/plan_asof_join.cpp | 232 +++++++++++++++++- src/include/duckdb/main/client_config.hpp | 2 + src/include/duckdb/main/settings.hpp | 11 + src/main/config.cpp | 1 + src/main/settings/autogenerated_settings.cpp | 17 ++ test/sql/join/asof/test_asof_join.test | 5 +- .../join/asof/test_asof_join_merge.test_slow | 4 + .../join/asof/test_asof_join_pushdown.test | 3 +- 9 files changed, 277 insertions(+), 4 deletions(-) diff --git a/src/common/settings.json b/src/common/settings.json index 6b6718ea5474..ffa09591a398 100644 --- a/src/common/settings.json +++ b/src/common/settings.json @@ -114,6 +114,12 @@ "internal_setting": "arrow_use_list_view", "scope": "global" }, + { + "name": "asof_loop_join_threshold", + "description": "The maximum number of rows we need on the left side of an ASOF join to use a nested loop join", + "type": "UBIGINT", + "scope": "local" + }, { "name": "autoinstall_extension_repository", "description": "Overrides the custom endpoint for extension installation on autoloading", diff --git a/src/execution/physical_plan/plan_asof_join.cpp b/src/execution/physical_plan/plan_asof_join.cpp index 927defa4ff27..aa2df50d6313 100644 --- a/src/execution/physical_plan/plan_asof_join.cpp +++ b/src/execution/physical_plan/plan_asof_join.cpp @@ -1,8 +1,14 @@ +#include "duckdb/catalog/catalog_entry/aggregate_function_catalog_entry.hpp" +#include "duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp" +#include "duckdb/execution/operator/aggregate/physical_streaming_window.hpp" #include "duckdb/execution/operator/aggregate/physical_window.hpp" #include "duckdb/execution/operator/join/physical_asof_join.hpp" #include "duckdb/execution/operator/join/physical_iejoin.hpp" +#include "duckdb/execution/operator/join/physical_nested_loop_join.hpp" #include "duckdb/execution/operator/projection/physical_projection.hpp" +#include "duckdb/function/aggregate/distributive_function_utils.hpp" #include 
"duckdb/execution/physical_plan_generator.hpp" +#include "duckdb/function/function_binder.hpp" #include "duckdb/main/client_context.hpp" #include "duckdb/planner/expression/bound_constant_expression.hpp" #include "duckdb/planner/expression/bound_reference_expression.hpp" @@ -10,6 +16,223 @@ namespace duckdb { +static unique_ptr PlanAsOfLoopJoin(LogicalComparisonJoin &op, unique_ptr &probe, + unique_ptr &build, ClientContext &context) { + + // Plan a inverse nested loop join, then aggregate the values to choose the optimal match for each probe row. + // Use a row number primary key to handle duplicate probe values. + // aggregate the fields to produce at most one match per probe row, + // then project the columns back into the correct order and drop the primary key. + // + // ∠* \ pk + // | + // Γ pk;first(P),arg_xxx(B,inequality) + // | + // ∠*,inequality + // | + // ⨠swapped + // / \ + // B W pk:row_number + // | + // P + + LogicalComparisonJoin join_op(InverseJoinType(op.join_type)); + + join_op.types = op.children[1]->types; + const auto &probe_types = op.children[0]->types; + join_op.types.insert(join_op.types.end(), probe_types.begin(), probe_types.end()); + + // Fill in the projection maps to simplify the code below + // Since NLJ doesn't support projection, but ASOF does, + // we have to track this carefully... 
+ join_op.left_projection_map = op.right_projection_map; + if (join_op.left_projection_map.empty()) { + for (idx_t i = 0; i < op.children[1]->types.size(); ++i) { + join_op.left_projection_map.emplace_back(i); + } + } + + join_op.right_projection_map = op.left_projection_map; + if (join_op.right_projection_map.empty()) { + for (idx_t i = 0; i < op.children[0]->types.size(); ++i) { + join_op.right_projection_map.emplace_back(i); + } + } + + // Project pk + LogicalType pk_type = LogicalType::BIGINT; + join_op.types.emplace_back(pk_type); + + auto binder = Binder::CreateBinder(context); + FunctionBinder function_binder(*binder); + auto asof_idx = op.conditions.size(); + string arg_min_max; + for (idx_t i = 0; i < op.conditions.size(); ++i) { + const auto &cond = op.conditions[i]; + JoinCondition nested_cond; + nested_cond.left = cond.right->Copy(); + nested_cond.right = cond.left->Copy(); + if (!nested_cond.left || !nested_cond.right) { + return nullptr; + } + nested_cond.comparison = FlipComparisonExpression(cond.comparison); + join_op.conditions.emplace_back(std::move(nested_cond)); + switch (cond.comparison) { + case ExpressionType::COMPARE_GREATERTHANOREQUALTO: + case ExpressionType::COMPARE_GREATERTHAN: + D_ASSERT(asof_idx == op.conditions.size()); + asof_idx = i; + arg_min_max = "arg_max"; + break; + case ExpressionType::COMPARE_LESSTHANOREQUALTO: + case ExpressionType::COMPARE_LESSTHAN: + D_ASSERT(asof_idx == op.conditions.size()); + asof_idx = i; + arg_min_max = "arg_min"; + break; + default: + break; + } + } + + // NLJ does not support some join types + switch (join_op.join_type) { + case JoinType::SEMI: + case JoinType::ANTI: + case JoinType::MARK: + case JoinType::INNER: + case JoinType::RIGHT: + // Unfortunately, this does not check all the join types... 
+ if (!PhysicalNestedLoopJoin::IsSupported(op.conditions, op.join_type)) { + return nullptr; + } + break; + case JoinType::OUTER: + case JoinType::LEFT: + // RIGHT ASOF JOINs produce the entire build table and would require grouping on all build rows, + // which defeats the purpose of this optimisation. + default: + return nullptr; + } + + QueryErrorContext error_context; + auto arg_min_max_func = binder->GetCatalogEntry(CatalogType::SCALAR_FUNCTION_ENTRY, SYSTEM_CATALOG, DEFAULT_SCHEMA, + arg_min_max, OnEntryNotFound::RETURN_NULL, error_context); + // Can't find the arg_min/max aggregate we need, so give up before we break anything. + if (!arg_min_max_func || arg_min_max_func->type != CatalogType::AGGREGATE_FUNCTION_ENTRY) { + return nullptr; + } + auto &arg_min_max_entry = arg_min_max_func->Cast(); + + // PhysicalHashAggregate requires that the arguments to aggregate functions be bound references, + // so we Project the (shared) ordering argument on the end of the join results. + vector> comp_list; + for (const auto &col_type : join_op.types) { + const auto col_idx = comp_list.size(); + comp_list.emplace_back(make_uniq(col_type, col_idx)); + } + vector comp_types = join_op.types; + auto comp_expr = op.conditions[asof_idx].right->Copy(); + comp_types.emplace_back(comp_expr->return_type); + comp_list.emplace_back(std::move(comp_expr)); + + // Bind the aggregates first so we can abort safely if we can't find one. 
+ vector aggr_types(1, pk_type); + + // Wrap all the projected non-pk probe fields in `first` aggregates; + vector> aggregates; + for (const auto &i : join_op.right_projection_map) { + const auto col_idx = op.children[1]->types.size() + i; + const auto col_type = join_op.types[col_idx]; + aggr_types.emplace_back(col_type); + + vector> aggr_children; + auto col_ref = make_uniq(col_type, col_idx); + aggr_children.push_back(std::move(col_ref)); + + auto first_aggregate = FirstFunctionGetter::GetFunction(col_type); + auto aggr_expr = make_uniq(std::move(first_aggregate), std::move(aggr_children), + nullptr, nullptr, AggregateType::NON_DISTINCT); + D_ASSERT(col_type == aggr_expr->return_type); + aggregates.emplace_back(std::move(aggr_expr)); + } + + // Wrap all the projected build fields in `arg_max/min` aggregates using the inequality ordering; + // We are doing all this first in case we can't find a matching function. + for (const auto &col_idx : join_op.left_projection_map) { + const auto col_type = join_op.types[col_idx]; + aggr_types.emplace_back(col_type); + + vector> aggr_children; + auto col_ref = make_uniq(col_type, col_idx); + aggr_children.push_back(std::move(col_ref)); + auto comp_expr = make_uniq(comp_types.back(), comp_types.size() - 1); + aggr_children.push_back(std::move(comp_expr)); + vector child_types; + for (const auto &child : aggr_children) { + child_types.emplace_back(child->return_type); + } + + auto &func = arg_min_max_entry; + ErrorData error; + auto best_function = function_binder.BindFunction(func.name, func.functions, child_types, error); + if (!best_function.IsValid()) { + return nullptr; + } + auto bound_function = func.functions.GetFunctionByOffset(best_function.GetIndex()); + auto aggr_expr = function_binder.BindAggregateFunction(bound_function, std::move(aggr_children), nullptr, + AggregateType::NON_DISTINCT); + D_ASSERT(col_type == aggr_expr->return_type); + aggregates.emplace_back(std::move(aggr_expr)); + } + + // Add a synthetic 
primary integer key to the probe relation using streaming windowing. + vector> window_select; + auto pk = make_uniq(ExpressionType::WINDOW_ROW_NUMBER, pk_type, nullptr, nullptr); + pk->start = WindowBoundary::UNBOUNDED_PRECEDING; + pk->end = WindowBoundary::CURRENT_ROW_ROWS; + pk->alias = "row_number"; + window_select.emplace_back(std::move(pk)); + + auto window_types = probe->types; + window_types.emplace_back(pk_type); + + idx_t probe_cardinality = op.children[0]->EstimateCardinality(context); + auto window = make_uniq(window_types, std::move(window_select), probe_cardinality); + window->children.emplace_back(std::move(probe)); + + auto join = make_uniq(join_op, std::move(build), std::move(window), + std::move(join_op.conditions), join_op.join_type, probe_cardinality); + + // Plan a projection of the compare column + auto comp = make_uniq(std::move(comp_types), std::move(comp_list), probe_cardinality); + comp->children.emplace_back(std::move(join)); + + // Plan an aggregation on the output of the join, grouping by key; + // TODO: Can we make it perfect? 
+ // Note that the NLJ produced all fields, but only the projected ones were aggregated + vector> groups; + auto pk_ref = make_uniq(pk_type, join_op.types.size() - 1); + groups.emplace_back(std::move(pk_ref)); + auto aggr = make_uniq(context, aggr_types, std::move(aggregates), std::move(groups), + probe_cardinality); + aggr->children.emplace_back(std::move(comp)); + + // Project away primary/grouping key + // The aggregates were generated in the output order of the original ASOF, + // so we just have to shift away the pk + vector> project_list; + for (column_t i = 1; i < aggr->types.size(); ++i) { + auto col_ref = make_uniq(aggr->types[i], i); + project_list.emplace_back(std::move(col_ref)); + } + + auto proj = make_uniq(op.types, std::move(project_list), probe_cardinality); + proj->children.emplace_back(std::move(aggr)); + + return proj; +} + unique_ptr PhysicalPlanGenerator::PlanAsOfJoin(LogicalComparisonJoin &op) { // now visit the children D_ASSERT(op.children.size() == 2); @@ -42,7 +265,14 @@ unique_ptr PhysicalPlanGenerator::PlanAsOfJoin(LogicalComparis } D_ASSERT(asof_idx < op.conditions.size()); - if (!ClientConfig::GetConfig(context).force_asof_iejoin) { + auto &config = ClientConfig::GetConfig(context); + if (!config.force_asof_iejoin) { + if (op.children[0]->has_estimated_cardinality && lhs_cardinality <= config.asof_loop_join_threshold) { + auto result = PlanAsOfLoopJoin(op, left, right, context); + if (result) { + return result; + } + } return make_uniq(op, std::move(left), std::move(right)); } diff --git a/src/include/duckdb/main/client_config.hpp b/src/include/duckdb/main/client_config.hpp index 9cedb4074a49..4e73f8f922b8 100644 --- a/src/include/duckdb/main/client_config.hpp +++ b/src/include/duckdb/main/client_config.hpp @@ -101,6 +101,8 @@ struct ClientConfig { idx_t nested_loop_join_threshold = 5; //! The number of rows we need on either table to choose a merge join over an IE join idx_t merge_join_threshold = 1000; + //! 
The maximum number of rows to use the nested loop join implementation + idx_t asof_loop_join_threshold = 2048; //! The maximum amount of memory to keep buffered in a streaming query result. Default: 1mb. idx_t streaming_buffer_size = 1000000; diff --git a/src/include/duckdb/main/settings.hpp b/src/include/duckdb/main/settings.hpp index b9c979dcaa13..02ee9b2ee507 100644 --- a/src/include/duckdb/main/settings.hpp +++ b/src/include/duckdb/main/settings.hpp @@ -218,6 +218,17 @@ struct ArrowOutputListViewSetting { static Value GetSetting(const ClientContext &context); }; +struct AsofLoopJoinThresholdSetting { + using RETURN_TYPE = idx_t; + static constexpr const char *Name = "asof_loop_join_threshold"; + static constexpr const char *Description = + "The maximum number of rows we need on the left side of an ASOF join to use a nested loop join"; + static constexpr const char *InputType = "UBIGINT"; + static void SetLocal(ClientContext &context, const Value ¶meter); + static void ResetLocal(ClientContext &context); + static Value GetSetting(const ClientContext &context); +}; + struct AutoinstallExtensionRepositorySetting { using RETURN_TYPE = string; static constexpr const char *Name = "autoinstall_extension_repository"; diff --git a/src/main/config.cpp b/src/main/config.cpp index a69075c6d65a..2bebd3458093 100644 --- a/src/main/config.cpp +++ b/src/main/config.cpp @@ -72,6 +72,7 @@ static const ConfigurationOption internal_options[] = { DUCKDB_GLOBAL(ArrowLargeBufferSizeSetting), DUCKDB_GLOBAL(ArrowLosslessConversionSetting), DUCKDB_GLOBAL(ArrowOutputListViewSetting), + DUCKDB_LOCAL(AsofLoopJoinThresholdSetting), DUCKDB_GLOBAL(AutoinstallExtensionRepositorySetting), DUCKDB_GLOBAL(AutoinstallKnownExtensionsSetting), DUCKDB_GLOBAL(AutoloadKnownExtensionsSetting), diff --git a/src/main/settings/autogenerated_settings.cpp b/src/main/settings/autogenerated_settings.cpp index d007da71fc52..c7c71fd0c177 100644 --- a/src/main/settings/autogenerated_settings.cpp +++ 
b/src/main/settings/autogenerated_settings.cpp @@ -177,6 +177,23 @@ Value ArrowOutputListViewSetting::GetSetting(const ClientContext &context) { return Value::BOOLEAN(config.options.arrow_use_list_view); } +//===----------------------------------------------------------------------===// +// Asof Loop Join Threshold +//===----------------------------------------------------------------------===// +void AsofLoopJoinThresholdSetting::SetLocal(ClientContext &context, const Value &input) { + auto &config = ClientConfig::GetConfig(context); + config.asof_loop_join_threshold = input.GetValue(); +} + +void AsofLoopJoinThresholdSetting::ResetLocal(ClientContext &context) { + ClientConfig::GetConfig(context).asof_loop_join_threshold = ClientConfig().asof_loop_join_threshold; +} + +Value AsofLoopJoinThresholdSetting::GetSetting(const ClientContext &context) { + auto &config = ClientConfig::GetConfig(context); + return Value::UBIGINT(config.asof_loop_join_threshold); +} + //===----------------------------------------------------------------------===// // Autoinstall Extension Repository //===----------------------------------------------------------------------===// diff --git a/test/sql/join/asof/test_asof_join.test b/test/sql/join/asof/test_asof_join.test index 4f8e0ae0123e..a879c582fc8b 100644 --- a/test/sql/join/asof/test_asof_join.test +++ b/test/sql/join/asof/test_asof_join.test @@ -68,10 +68,11 @@ SELECT s1.starts as s1_starts, s2.starts as s2_starts, FROM samples AS s1 ASOF JOIN samples as s2 ON s2.ends >= (s1.ends - 5) -WHERE s1_starts <> s2_starts; +WHERE s1_starts <> s2_starts +ORDER BY ALL ---- -21 14 10 5 +21 14 # Use an ASOF join inside of a correlated subquery diff --git a/test/sql/join/asof/test_asof_join_merge.test_slow b/test/sql/join/asof/test_asof_join_merge.test_slow index 544deaad4cd5..12266d3747b7 100644 --- a/test/sql/join/asof/test_asof_join_merge.test_slow +++ b/test/sql/join/asof/test_asof_join_merge.test_slow @@ -11,6 +11,10 @@ PRAGMA threads=4 
statement ok SET temp_directory='__TEST_DIR__/temp.tmp' +# Force PhysicalAsOfJoin +statement ok +PRAGMA asof_loop_join_threshold = 0; + query II WITH build AS ( SELECT k, ('2021-01-01'::TIMESTAMP + INTERVAL (i) SECOND) AS t, i % 37 AS v diff --git a/test/sql/join/asof/test_asof_join_pushdown.test b/test/sql/join/asof/test_asof_join_pushdown.test index 1ef308a6eb91..9345d84b58f7 100644 --- a/test/sql/join/asof/test_asof_join_pushdown.test +++ b/test/sql/join/asof/test_asof_join_pushdown.test @@ -24,7 +24,8 @@ FROM right_pushdown d1 ASOF JOIN ( SELECT * FROM right_pushdown WHERE value is not NULL ) d2 - ON d1.time >= d2.time; + ON d1.time >= d2.time +ORDER BY ALL; ---- 0 0 0.0 0.0 1 0 NULL 0.0 From 5d8434ab6214d1a6646a7a1f0e02da12d231ebf0 Mon Sep 17 00:00:00 2001 From: Richard Wesley Date: Mon, 10 Feb 2025 12:22:35 -0800 Subject: [PATCH 024/142] Issue #8265: AsOf Nested Loop * Add asof_loop_join_threshold loops to tests. --- test/optimizer/joins/asof_join_adds_rows.test | 10 ++++- .../cross_join_and_unnest_dont_work.test | 10 ++++- test/sql/join/asof/test_asof_join.test | 9 +++- .../sql/join/asof/test_asof_join_doubles.test | 4 ++ .../asof/test_asof_join_inequalities.test | 8 ++++ .../join/asof/test_asof_join_integers.test | 11 +++-- .../asof/test_asof_join_missing.test_slow | 3 ++ .../join/asof/test_asof_join_pushdown.test | 42 +++++++++++-------- .../join/asof/test_asof_join_subquery.test | 8 ++++ .../join/asof/test_asof_join_timestamps.test | 8 ++++ .../sql/join/asof/test_asof_join_varchar.test | 8 ++++ 11 files changed, 97 insertions(+), 24 deletions(-) diff --git a/test/optimizer/joins/asof_join_adds_rows.test b/test/optimizer/joins/asof_join_adds_rows.test index 2b15fbcb45a3..2ebdfd145f3a 100644 --- a/test/optimizer/joins/asof_join_adds_rows.test +++ b/test/optimizer/joins/asof_join_adds_rows.test @@ -37,6 +37,12 @@ create table large_build as from values (1, '1992-03-22 01:02:19'::TIMESTAMP), (1, '1992-03-22 01:02:20'::TIMESTAMP) t(lb_const, b); +# Compare NLJ 
optimisation to operator +foreach threshold 0 32 + +statement ok +PRAGMA asof_loop_join_threshold = ${threshold}; + query I select a from (select * from small_probe, child_join where c=sp_const) asof join large_build on (lb_const = sp_const and a < b) order by a; ---- @@ -109,4 +115,6 @@ ORDER BY timepoint; ---- ID1 fqn1 fqn1 2021-01-01 00:00:00 -ID1 fqn2 fqn2 2021-03-03 00:00:00 \ No newline at end of file +ID1 fqn2 fqn2 2021-03-03 00:00:00 + +endloop diff --git a/test/optimizer/joins/cross_join_and_unnest_dont_work.test b/test/optimizer/joins/cross_join_and_unnest_dont_work.test index ee1eee4acd87..4c4c6bf5356e 100644 --- a/test/optimizer/joins/cross_join_and_unnest_dont_work.test +++ b/test/optimizer/joins/cross_join_and_unnest_dont_work.test @@ -37,6 +37,12 @@ create table large_build as from values (1, '1992-03-22 01:02:19'::TIMESTAMP), (1, '1992-03-22 01:02:20'::TIMESTAMP) t(lb_const, b); +# Compare NLJ optimisation to operator +foreach threshold 0 32 + +statement ok +PRAGMA asof_loop_join_threshold = ${threshold}; + query I select a from (select * from small_probe, child_join where c=sp_const) asof join large_build on (lb_const = sp_const and a < b) order by a; ---- @@ -109,4 +115,6 @@ ORDER BY timepoint; ---- ID1 fqn1 fqn1 2021-01-01 00:00:00 -ID1 fqn2 fqn2 2021-03-03 00:00:00 \ No newline at end of file +ID1 fqn2 fqn2 2021-03-03 00:00:00 + +endloop diff --git a/test/sql/join/asof/test_asof_join.test b/test/sql/join/asof/test_asof_join.test index a879c582fc8b..4f1dd370ce13 100644 --- a/test/sql/join/asof/test_asof_join.test +++ b/test/sql/join/asof/test_asof_join.test @@ -30,6 +30,12 @@ create table trades("when" timestamp, symbol int); statement ok insert into trades values ('2020-01-01 00:00:03', 1); +# Compare NLJ optimisation to operator +foreach threshold 0 32 + +statement ok +PRAGMA asof_loop_join_threshold = ${threshold}; + query III SELECT t.*, p.price FROM trades t ASOF JOIN prices p @@ -74,8 +80,7 @@ ORDER BY ALL 10 5 21 14 -# Use an ASOF join 
inside of a correlated subquery - +endloop # # Errors diff --git a/test/sql/join/asof/test_asof_join_doubles.test b/test/sql/join/asof/test_asof_join_doubles.test index e6c98a7aed3c..9fa3d24df568 100644 --- a/test/sql/join/asof/test_asof_join_doubles.test +++ b/test/sql/join/asof/test_asof_join_doubles.test @@ -9,6 +9,9 @@ PRAGMA enable_verification # Inequality only # +statement ok +PRAGMA asof_loop_join_threshold=0; + # Use doubles for readable infinities statement ok CREATE TABLE events0 (begin DOUBLE, value INTEGER); @@ -324,3 +327,4 @@ ASOF RIGHT JOIN USING (key, begin) ORDER BY 1 ASC NULLS FIRST, 2 ---- + diff --git a/test/sql/join/asof/test_asof_join_inequalities.test b/test/sql/join/asof/test_asof_join_inequalities.test index 8d5a3e1f312b..eef40824789c 100644 --- a/test/sql/join/asof/test_asof_join_inequalities.test +++ b/test/sql/join/asof/test_asof_join_inequalities.test @@ -37,6 +37,12 @@ foreach debug False True statement ok PRAGMA debug_asof_iejoin=${debug} +# Check NLJ results against both +foreach threshold 0 32 + +statement ok +PRAGMA asof_loop_join_threshold = ${threshold}; + # # Strictly Greater Than # @@ -229,3 +235,5 @@ NULL -infinity -1 NULL NULL -10 endloop + +endloop diff --git a/test/sql/join/asof/test_asof_join_integers.test b/test/sql/join/asof/test_asof_join_integers.test index 1bc4b0762a6b..77c414fb4631 100644 --- a/test/sql/join/asof/test_asof_join_integers.test +++ b/test/sql/join/asof/test_asof_join_integers.test @@ -5,7 +5,7 @@ statement ok PRAGMA enable_verification -# Join on a string range +# Join on an integer range statement ok CREATE TABLE events0 (begin INTEGER, value INTEGER); @@ -26,8 +26,11 @@ CREATE TABLE probe0 AS FROM range(0,10) ; -# This is not implemented yet because it requires a dedicated operator -# instead of LEAD(...infinity::INTEGER) +# Compare NLJ optimisation to operator +foreach threshold 0 32 + +statement ok +PRAGMA asof_loop_join_threshold = ${threshold}; # INNER ON inequality only query II @@ -134,3 +137,5 
@@ ORDER BY ALL 9 3 NULL -1 NULL 9 + +endloop diff --git a/test/sql/join/asof/test_asof_join_missing.test_slow b/test/sql/join/asof/test_asof_join_missing.test_slow index 57a2c64e5231..17355c981f99 100644 --- a/test/sql/join/asof/test_asof_join_missing.test_slow +++ b/test/sql/join/asof/test_asof_join_missing.test_slow @@ -13,6 +13,9 @@ PRAGMA enable_verification # * First payload bin empty # * Multiple scanned payload blocks +statement ok +PRAGMA asof_loop_join_threshold=0; + # Check results against IEJoin foreach debug False True diff --git a/test/sql/join/asof/test_asof_join_pushdown.test b/test/sql/join/asof/test_asof_join_pushdown.test index 9345d84b58f7..465b407ad850 100644 --- a/test/sql/join/asof/test_asof_join_pushdown.test +++ b/test/sql/join/asof/test_asof_join_pushdown.test @@ -14,6 +14,29 @@ INSERT INTO right_pushdown VALUES (1, NULL), ; +statement ok +CREATE TABLE issue13899(seq_no INT, amount DECIMAL(10,2)); + +statement ok +INSERT INTO issue13899 VALUES + (1,1.00), + (2,null), + (3,null), + (4,null), + (5,2.00), + (6,null), + (7,null), + (8,3.00), + (9,null), + (10,null), + (11,5.00); + +# Compare NLJ optimisation to operator +foreach threshold 0 32 + +statement ok +PRAGMA asof_loop_join_threshold = ${threshold}; + query IIII SELECT d1.time, @@ -104,23 +127,6 @@ ORDER BY ALL 5 6 10 11 -statement ok -CREATE TABLE issue13899(seq_no INT, amount DECIMAL(10,2)); - -statement ok -INSERT INTO issue13899 VALUES - (1,1.00), - (2,null), - (3,null), - (4,null), - (5,2.00), - (6,null), - (7,null), - (8,3.00), - (9,null), - (10,null), - (11,5.00); - query III select a.seq_no, @@ -143,3 +149,5 @@ ORDER BY 1 9 NULL 3.00 10 NULL 3.00 11 5.00 5.00 + +endloop diff --git a/test/sql/join/asof/test_asof_join_subquery.test b/test/sql/join/asof/test_asof_join_subquery.test index ec2f72687adb..f61c3f67e602 100644 --- a/test/sql/join/asof/test_asof_join_subquery.test +++ b/test/sql/join/asof/test_asof_join_subquery.test @@ -16,6 +16,12 @@ INSERT INTO events VALUES (8, 3) ; 
+# Compare NLJ optimisation to operator +foreach threshold 0 32 + +statement ok +PRAGMA asof_loop_join_threshold = ${threshold}; + query II SELECT begin, value IN ( SELECT e1.value @@ -34,3 +40,5 @@ ORDER BY ALL; 3.0 true 6.0 true 8.0 true + +endloop diff --git a/test/sql/join/asof/test_asof_join_timestamps.test b/test/sql/join/asof/test_asof_join_timestamps.test index 7e2a6ec0e69e..00aa276d793d 100644 --- a/test/sql/join/asof/test_asof_join_timestamps.test +++ b/test/sql/join/asof/test_asof_join_timestamps.test @@ -32,6 +32,12 @@ INSERT INTO probe0 VALUES ('infinity') ; +# Compare NLJ optimisation to operator +foreach threshold 0 32 + +statement ok +PRAGMA asof_loop_join_threshold = ${threshold}; + # INNER ON inequality only query II nosort SELECT p.begin, e.value @@ -204,3 +210,5 @@ ON p.begin >= e.begin ORDER BY p.begin ASC ---- 2023-03-21 12:00:00 + +endloop diff --git a/test/sql/join/asof/test_asof_join_varchar.test b/test/sql/join/asof/test_asof_join_varchar.test index ef5f31e7bfa4..b008c0e2816f 100644 --- a/test/sql/join/asof/test_asof_join_varchar.test +++ b/test/sql/join/asof/test_asof_join_varchar.test @@ -26,6 +26,12 @@ CREATE TABLE probe0 AS FROM range(0,10) ; +# Compare NLJ optimisation to operator +foreach threshold 0 32 + +statement ok +PRAGMA asof_loop_join_threshold = ${threshold}; + # INNER ON inequality only query II SELECT p.begin, e.value @@ -131,3 +137,5 @@ ORDER BY ALL 9 3 NULL -1 NULL 9 + +endloop From 110808fe6dac91c1ea0f8a699be2dd1ec756a127 Mon Sep 17 00:00:00 2001 From: Tishj Date: Tue, 11 Feb 2025 00:48:04 +0100 Subject: [PATCH 025/142] allow escaping whitespace --- src/function/cast/vector_cast_helpers.cpp | 26 +++++++++++++++++------ test/sql/cast/string_to_list_cast.test | 5 +++++ 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index 347d7102bfa1..d9d9d3c2717a 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ 
b/src/function/cast/vector_cast_helpers.cpp @@ -34,9 +34,21 @@ inline static void SkipWhitespace(StringCastInputState &input_state) { auto &buf = input_state.buf; auto &pos = input_state.pos; auto &len = input_state.len; - while (pos < len && StringUtil::CharacterIsSpace(buf[pos])) { + while (pos < len) { + bool set_escaped = false; + if (buf[pos] == '\\') { + if (!input_state.escaped) { + set_escaped = true; + } + } else if (StringUtil::CharacterIsSpace(buf[pos])) { + if (input_state.escaped) { + break; + } + } else { + break; + } pos++; - input_state.escaped = false; + input_state.escaped = set_escaped; } } @@ -282,7 +294,7 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { if (!input_state.escaped) { set_escaped = true; } - } else if (!StringUtil::CharacterIsSpace(buf[pos])) { + } else if (!StringUtil::CharacterIsSpace(buf[pos]) || input_state.escaped) { if (!start_pos.IsValid()) { start_pos = pos; } @@ -413,7 +425,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { if (!input_state.escaped) { set_escaped = true; } - } else if (!StringUtil::CharacterIsSpace(buf[pos])) { + } else if (!StringUtil::CharacterIsSpace(buf[pos]) || input_state.escaped) { if (!start_pos.IsValid()) { start_pos = pos; } @@ -476,7 +488,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { if (!input_state.escaped) { set_escaped = true; } - } else if (!StringUtil::CharacterIsSpace(buf[pos])) { + } else if (!StringUtil::CharacterIsSpace(buf[pos]) || input_state.escaped) { if (!start_pos.IsValid()) { start_pos = pos; } @@ -586,7 +598,7 @@ bool VectorStringToStruct::SplitStruct(const string_t &input, vector Date: Tue, 11 Feb 2025 01:53:39 +0100 Subject: [PATCH 026/142] better way of dealing with escaped spaces --- src/function/cast/vector_cast_helpers.cpp | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp 
index d9d9d3c2717a..29be5cc75e71 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -34,21 +34,12 @@ inline static void SkipWhitespace(StringCastInputState &input_state) { auto &buf = input_state.buf; auto &pos = input_state.pos; auto &len = input_state.len; - while (pos < len) { - bool set_escaped = false; - if (buf[pos] == '\\') { - if (!input_state.escaped) { - set_escaped = true; - } - } else if (StringUtil::CharacterIsSpace(buf[pos])) { - if (input_state.escaped) { - break; - } - } else { - break; - } + if (input_state.escaped) { + return; + } + while (pos < len && StringUtil::CharacterIsSpace(buf[pos])) { pos++; - input_state.escaped = set_escaped; + input_state.escaped = false; } } From f65c09760338f9aa031dad63d6f4d50cca3fb8c6 Mon Sep 17 00:00:00 2001 From: Tishj Date: Tue, 11 Feb 2025 14:16:33 +0100 Subject: [PATCH 027/142] yet another case of missing backslash escape logic --- src/function/cast/vector_cast_helpers.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index 29be5cc75e71..a51ff8ee1732 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -623,6 +623,7 @@ bool VectorStringToStruct::SplitStruct(const string_t &input, vector Date: Tue, 11 Feb 2025 14:38:24 +0100 Subject: [PATCH 028/142] don't trim escaped backslashes at the end of the input --- src/function/cast/vector_cast_helpers.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index a51ff8ee1732..9154681dcead 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -285,6 +285,7 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { if (!input_state.escaped) { set_escaped = true; } + end_pos = pos; } else if 
(!StringUtil::CharacterIsSpace(buf[pos]) || input_state.escaped) { if (!start_pos.IsValid()) { start_pos = pos; @@ -416,6 +417,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { if (!input_state.escaped) { set_escaped = true; } + end_pos = pos; } else if (!StringUtil::CharacterIsSpace(buf[pos]) || input_state.escaped) { if (!start_pos.IsValid()) { start_pos = pos; @@ -479,6 +481,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { if (!input_state.escaped) { set_escaped = true; } + end_pos = pos; } else if (!StringUtil::CharacterIsSpace(buf[pos]) || input_state.escaped) { if (!start_pos.IsValid()) { start_pos = pos; @@ -589,6 +592,7 @@ bool VectorStringToStruct::SplitStruct(const string_t &input, vector Date: Tue, 11 Feb 2025 10:53:00 -0300 Subject: [PATCH 029/142] whis is this leaking --- src/common/adbc/adbc.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/common/adbc/adbc.cpp b/src/common/adbc/adbc.cpp index 09ac3aa5d8dc..0a77b0e67b42 100644 --- a/src/common/adbc/adbc.cpp +++ b/src/common/adbc/adbc.cpp @@ -880,7 +880,7 @@ AdbcStatusCode StatementSetSqlQuery(struct AdbcStatement *statement, const char auto error_msg_extract_statements = duckdb_extract_statements_error(extracted_statements); if (error_msg_extract_statements != nullptr) { // Things went wrong when executing internal prepared statement - delete extracted_statements; + delete (reinterpret_cast (&extracted_statements)); SetError(error, error_msg_extract_statements); return ADBC_STATUS_INTERNAL; } @@ -893,7 +893,7 @@ AdbcStatusCode StatementSetSqlQuery(struct AdbcStatement *statement, const char auto adbc_status = CheckResult(res, error, error_msg); if (adbc_status != ADBC_STATUS_OK) { // Things went wrong when executing internal prepared statement - delete extracted_statements; + delete (reinterpret_cast (&extracted_statements)); delete statement_internal; return adbc_status; } @@ -904,7 +904,7 @@ AdbcStatusCode 
StatementSetSqlQuery(struct AdbcStatement *statement, const char SetError(error, duckdb_query_arrow_error(out_result)); delete out_result; delete statement_internal; - delete extracted_statements; + delete (reinterpret_cast (&extracted_statements)); return ADBC_STATUS_INVALID_ARGUMENT; } delete out_result; @@ -914,7 +914,7 @@ AdbcStatusCode StatementSetSqlQuery(struct AdbcStatement *statement, const char auto res = duckdb_prepare_extracted_statement(wrapper->connection, extracted_statements, extract_statements_size - 1, &wrapper->statement); auto error_msg = duckdb_prepare_error(wrapper->statement); - delete extracted_statements; + delete (reinterpret_cast (&extracted_statements)); return CheckResult(res, error, error_msg); } From fb59f61e0178288ba3adb2c4cb72e49f994f24d5 Mon Sep 17 00:00:00 2001 From: Tishj Date: Tue, 11 Feb 2025 17:20:44 +0100 Subject: [PATCH 030/142] add support for unnamed struct format to VARCHAR->STRUCT cast --- src/function/cast/vector_cast_helpers.cpp | 404 +++++++++++++------- test/sql/cast/string_to_unnamed_struct.test | 61 +++ 2 files changed, 326 insertions(+), 139 deletions(-) create mode 100644 test/sql/cast/string_to_unnamed_struct.test diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index 9154681dcead..572b03e563e7 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -86,6 +86,8 @@ static bool SkipToClose(StringCastInputState &input_state, idx_t &lvl, char clos } } else if (buf[idx] == '{') { brackets.push_back('}'); + } else if (buf[idx] == '(') { + brackets.push_back(')'); } else if (buf[idx] == '[') { brackets.push_back(']'); lvl++; @@ -538,171 +540,295 @@ bool VectorStringToStruct::SplitStruct(const string_t &input, vectorsecond; + + start_pos = optional_idx(); + pos++; + SkipWhitespace(input_state); + while (pos < len && ((buf[pos] != ',' && buf[pos] != '}') || input_state.escaped)) { + bool set_escaped = false; + if (buf[pos] 
== '"' || buf[pos] == '\'') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + if (!SkipToCloseQuotes(input_state)) { + return false; + } + } + end_pos = pos; + } else if (buf[pos] == '{') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + if (!SkipToClose(input_state, lvl, '}')) { + return false; + } + } + end_pos = pos; + } else if (buf[pos] == '(') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + if (!SkipToClose(input_state, lvl, ')')) { + return false; + } + } + end_pos = pos; + } else if (buf[pos] == '[') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + lvl++; + if (!SkipToClose(input_state, lvl, ']')) { + return false; + } + } + end_pos = pos; + } else if (buf[pos] == '\\') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + set_escaped = true; + } + end_pos = pos; + } else if (!StringUtil::CharacterIsSpace(buf[pos]) || input_state.escaped) { + if (!start_pos.IsValid()) { + start_pos = pos; + } + end_pos = pos; } - end_pos = pos; + pos++; + input_state.escaped = set_escaped; + } + if (pos == len) { + return false; + } + auto &child_vec = *varchar_vectors[child_idx]; + auto string_data = FlatVector::GetData(child_vec); + auto &child_mask = child_masks[child_idx].get(); + + if (!start_pos.IsValid()) { + start_pos = 0; + end_pos = 0; + } else { + end_pos++; + } + auto value_start = start_pos.GetIndex(); + StringCastInputState value_temp_state(buf, value_start, end_pos); + if (IsNull(value_temp_state)) { + child_mask.SetInvalid(row_idx); + } else { + string_data[row_idx] = HandleString(child_vec, buf, value_start, end_pos); + child_mask.SetValid(row_idx); + } + + if (buf[pos] == '}') { + break; } - input_state.escaped = set_escaped; pos++; + SkipWhitespace(input_state); } - if (pos == len) { - return false; - } - if (!start_pos.IsValid()) { - //! 
Key can not be empty - return false; - } - idx_t key_start = start_pos.GetIndex(); - end_pos++; - StringCastInputState key_temp_state(buf, key_start, end_pos); - if (IsNull(key_temp_state)) { - //! Key can not be NULL - return false; - } - auto child_name = HandleString(temp_vec, buf, key_start, end_pos); - auto it = child_names.find(child_name); - if (it == child_names.end()) { - return false; // false key - } - child_idx = it->second; + } else { + //! This is an unnamed struct in the form of `(value, value_2, ...)` + D_ASSERT(end_char == ')'); + idx_t child_idx = 0; + while (pos < len) { + if (child_idx == child_names.size()) { + return false; + } - start_pos = optional_idx(); - pos++; - SkipWhitespace(input_state); - while (pos < len && ((buf[pos] != ',' && buf[pos] != '}') || input_state.escaped)) { - bool set_escaped = false; - if (buf[pos] == '"' || buf[pos] == '\'') { - if (!start_pos.IsValid()) { - start_pos = pos; - } - if (!input_state.escaped) { - if (!SkipToCloseQuotes(input_state)) { - return false; + optional_idx start_pos; + idx_t end_pos; + while (pos < len && ((buf[pos] != ',' && buf[pos] != ')') || input_state.escaped)) { + bool set_escaped = false; + if (buf[pos] == '"' || buf[pos] == '\'') { + if (!start_pos.IsValid()) { + start_pos = pos; } - } - end_pos = pos; - } else if (buf[pos] == '{') { - if (!start_pos.IsValid()) { - start_pos = pos; - } - if (!input_state.escaped) { - if (!SkipToClose(input_state, lvl, '}')) { - return false; + if (!input_state.escaped) { + if (!SkipToCloseQuotes(input_state)) { + return false; + } } - } - end_pos = pos; - } else if (buf[pos] == '[') { - if (!start_pos.IsValid()) { - start_pos = pos; - } - if (!input_state.escaped) { - lvl++; - if (!SkipToClose(input_state, lvl, ']')) { - return false; + end_pos = pos; + } else if (buf[pos] == '{') { + if (!start_pos.IsValid()) { + start_pos = pos; } + if (!input_state.escaped) { + if (!SkipToClose(input_state, lvl, '}')) { + return false; + } + } + end_pos = pos; + } 
else if (buf[pos] == '(') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + if (!SkipToClose(input_state, lvl, ')')) { + return false; + } + } + end_pos = pos; + } else if (buf[pos] == '[') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + lvl++; + if (!SkipToClose(input_state, lvl, ']')) { + return false; + } + } + end_pos = pos; + } else if (buf[pos] == '\\') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!input_state.escaped) { + set_escaped = true; + } + end_pos = pos; + } else if (!StringUtil::CharacterIsSpace(buf[pos]) || input_state.escaped) { + if (!start_pos.IsValid()) { + start_pos = pos; + } + end_pos = pos; } - end_pos = pos; - } else if (buf[pos] == '\\') { - if (!start_pos.IsValid()) { - start_pos = pos; - } - if (!input_state.escaped) { - set_escaped = true; - } - end_pos = pos; - } else if (!StringUtil::CharacterIsSpace(buf[pos]) || input_state.escaped) { - if (!start_pos.IsValid()) { - start_pos = pos; - } - end_pos = pos; + pos++; + input_state.escaped = set_escaped; } - pos++; - input_state.escaped = set_escaped; - } - if (pos == len) { - return false; - } - auto &child_vec = *varchar_vectors[child_idx]; - auto string_data = FlatVector::GetData(child_vec); - auto &child_mask = child_masks[child_idx].get(); + if (pos == len) { + return false; + } + auto &child_vec = *varchar_vectors[child_idx]; + auto string_data = FlatVector::GetData(child_vec); + auto &child_mask = child_masks[child_idx].get(); - if (!start_pos.IsValid()) { - start_pos = 0; - end_pos = 0; - } else { - end_pos++; - } - auto value_start = start_pos.GetIndex(); - StringCastInputState value_temp_state(buf, value_start, end_pos); - if (IsNull(value_temp_state)) { - child_mask.SetInvalid(row_idx); - } else { - string_data[row_idx] = HandleString(child_vec, buf, value_start, end_pos); - child_mask.SetValid(row_idx); - } + if (!start_pos.IsValid()) { + start_pos = 0; + end_pos = 0; + } else { + 
end_pos++; + } + auto value_start = start_pos.GetIndex(); + StringCastInputState value_temp_state(buf, value_start, end_pos); + if (IsNull(value_temp_state)) { + child_mask.SetInvalid(row_idx); + } else { + string_data[row_idx] = HandleString(child_vec, buf, value_start, end_pos); + child_mask.SetValid(row_idx); + } - if (buf[pos] == '}') { - break; + if (buf[pos] == ')') { + break; + } + child_idx++; + pos++; + SkipWhitespace(input_state); } - pos++; - SkipWhitespace(input_state); } pos++; SkipWhitespace(input_state); diff --git a/test/sql/cast/string_to_unnamed_struct.test b/test/sql/cast/string_to_unnamed_struct.test new file mode 100644 index 000000000000..879def4a9948 --- /dev/null +++ b/test/sql/cast/string_to_unnamed_struct.test @@ -0,0 +1,61 @@ +# name: test/sql/cast/string_to_unnamed_struct.test +# group: [cast] + +# Basic single value struct +query I +select $$(abc)$$::STRUCT(a VARCHAR) +---- +{'a': abc} + +# Multiple values +query I +select $$(abc, def, ghi)$$::STRUCT(a VARCHAR, b VARCHAR, c VARCHAR) +---- +{'a': abc, 'b': def, 'c': ghi} + +# Empty unnamed struct +query I +select $$()$$::STRUCT(a VARCHAR) +---- +{'a': NULL} + +# Nested regular struct inside unnamed struct +query I +select $$({'amount': 42})$$::STRUCT(a STRUCT(amount INT)) +---- +{'a': {'amount': 42}} + +# Nested unnamed struct inside unnamed struct +query I +select $$((42))$$::STRUCT(a STRUCT(amount INT)) +---- +{'a': {'amount': 42}} + +# Nested unnamed struct AND regular struct inside unnamed struct +query I +select $$((42), {amount: 21})$$::STRUCT(a STRUCT(amount INT), b STRUCT(amount INT)) +---- +{'a': {'amount': 42}, 'b': {'amount': 21}} + +# List inside unnamed struct +query I +select $$([1,2,3], [4,5,6])$$::STRUCT(a INTEGER[], b INTEGER[]) +---- +{'a': [1, 2, 3], 'b': [4, 5, 6]} + +statement error +select $$([1,2,3],)$$::STRUCT(a INTEGER[]) +---- +can't be cast to the destination type STRUCT + +# Empty string in the second child of the unnamed struct +query I +select 
$$([1,2,3],)$$::STRUCT(a INTEGER[], b VARCHAR) +---- +{'a': [1, 2, 3], 'b': } + +# Empty string in the second child of a named struct +query I +select $${'a': [1,2,3],'b':}$$::STRUCT(a INTEGER[], b VARCHAR) +---- +{'a': [1, 2, 3], 'b': } From 1748b0794e2f8477e444f754c2fa032c4e444ef9 Mon Sep 17 00:00:00 2001 From: Tishj Date: Tue, 11 Feb 2025 17:29:46 +0100 Subject: [PATCH 031/142] need to recognize ( as a scope as well --- src/function/cast/vector_cast_helpers.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index 572b03e563e7..6a54b9435d42 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -160,9 +160,18 @@ static string_t HandleString(Vector &vec, const char *buf, idx_t start, idx_t en //! Close scope scopes.pop(); } - if (!quoted && (current_char == '[' || current_char == '{')) { + if (!quoted && (current_char == '[' || current_char == '{' || current_char == '(')) { //! New scope - scopes.push(current_char == '[' ? ']' : '}'); + char end_char; + if (current_char == '[') { + end_char = ']'; + } else if (current_char == '{') { + end_char = '}'; + } else { + D_ASSERT(current_char == '('); + end_char = ')'; + } + scopes.push(end_char); } //! 
Regular character string_data[copied_count++] = current_char; From 1da281bc7b055486a17d6443efe8b6d345236d5b Mon Sep 17 00:00:00 2001 From: Tishj Date: Tue, 11 Feb 2025 17:33:50 +0100 Subject: [PATCH 032/142] add another test, with escaped leading and trailing spaces, and escaped backslashes --- test/sql/cast/string_to_unnamed_struct.test | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/sql/cast/string_to_unnamed_struct.test b/test/sql/cast/string_to_unnamed_struct.test index 879def4a9948..58f548065789 100644 --- a/test/sql/cast/string_to_unnamed_struct.test +++ b/test/sql/cast/string_to_unnamed_struct.test @@ -59,3 +59,8 @@ query I select $${'a': [1,2,3],'b':}$$::STRUCT(a INTEGER[], b VARCHAR) ---- {'a': [1, 2, 3], 'b': } + +query I +select $$[(" test "), {'a': (\\ test \\)}]$$::STRUCT(a VARCHAR)[] +---- +[{'a': test }, {'a': (\\ test \\)}] From 8252d4de62a05bfd5518ce7b937d9e6b1cc50c4b Mon Sep 17 00:00:00 2001 From: Tishj Date: Tue, 11 Feb 2025 17:35:56 +0100 Subject: [PATCH 033/142] some more nesting --- test/sql/cast/string_to_unnamed_struct.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/sql/cast/string_to_unnamed_struct.test b/test/sql/cast/string_to_unnamed_struct.test index 58f548065789..ecf4c39ec468 100644 --- a/test/sql/cast/string_to_unnamed_struct.test +++ b/test/sql/cast/string_to_unnamed_struct.test @@ -61,6 +61,6 @@ select $${'a': [1,2,3],'b':}$$::STRUCT(a INTEGER[], b VARCHAR) {'a': [1, 2, 3], 'b': } query I -select $$[(" test "), {'a': (\\ test \\)}]$$::STRUCT(a VARCHAR)[] +select $$[((" test ")), {'a': (\\ test \\)}]$$::STRUCT(a STRUCT("inner" VARCHAR))[] ---- -[{'a': test }, {'a': (\\ test \\)}] +[{'a': {'inner': test }}, {'a': {'inner': \ test \}}] From d2a2d3dc5649eda69a5400f449cfa27b64204db9 Mon Sep 17 00:00:00 2001 From: Tishj Date: Tue, 11 Feb 2025 17:49:45 +0100 Subject: [PATCH 034/142] adjust tests --- test/sql/cast/string_to_nested_types_cast.test_slow | 6 +++--- 1 file changed, 3 insertions(+), 3 
deletions(-) diff --git a/test/sql/cast/string_to_nested_types_cast.test_slow b/test/sql/cast/string_to_nested_types_cast.test_slow index 90c97c24128d..b8f1346f59e4 100644 --- a/test/sql/cast/string_to_nested_types_cast.test_slow +++ b/test/sql/cast/string_to_nested_types_cast.test_slow @@ -78,7 +78,7 @@ SELECT CAST(LIST(timestamp_ns)::VARCHAR AS TIME[]) FROM test_all_types(); query I SELECT CAST(LIST(blob)::VARCHAR AS BLOB[]) FROM test_all_types(); ---- -[thisisalongblob\x00withnullbytes, \x00\x00\x00a, NULL] +[thisisalongblobx00withnullbytes, x00x00x00a, NULL] query I SELECT CAST(LIST(interval)::VARCHAR AS INTERVAL[]) FROM test_all_types(); @@ -191,8 +191,8 @@ SELECT CAST(struct_pack(A=>timestamp_ns)::VARCHAR AS STRUCT(A TIME)) FROM test_a query I SELECT CAST(struct_pack(A=>blob)::VARCHAR AS STRUCT(A BLOB)) FROM test_all_types(); ---- -{'A': thisisalongblob\x00withnullbytes} -{'A': \x00\x00\x00a} +{'A': thisisalongblobx00withnullbytes} +{'A': x00x00x00a} {'A': NULL} query I From a5173d3de01269d2a6d19dc00c73241c189e4762 Mon Sep 17 00:00:00 2001 From: Tishj Date: Tue, 11 Feb 2025 19:46:16 +0100 Subject: [PATCH 035/142] fix tidy issues --- src/function/cast/vector_cast_helpers.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index 6a54b9435d42..cb0feb7dce9f 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -1,18 +1,18 @@ #include "duckdb/function/cast/vector_cast_helpers.hpp" -#include "duckdb/common/typedefs.hpp" #include "duckdb/common/stack.hpp" +#include "duckdb/common/typedefs.hpp" namespace { struct StringCastInputState { public: - StringCastInputState(const char *buf, idx_t &pos, idx_t &len) : buf(buf), pos(pos), len(len) { + StringCastInputState(const char *buf, duckdb::idx_t &pos, duckdb::idx_t &len) : buf(buf), pos(pos), len(len) { } public: const char *buf; - idx_t &pos; - idx_t &len; + 
duckdb::idx_t &pos; + duckdb::idx_t &len; bool escaped = false; }; From 50613b65e0d55455a5922f9e55d57bb91d4435a0 Mon Sep 17 00:00:00 2001 From: Tishj Date: Wed, 12 Feb 2025 10:14:45 +0100 Subject: [PATCH 036/142] fix up test --- test/sql/cast/string_to_list_cast.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/sql/cast/string_to_list_cast.test b/test/sql/cast/string_to_list_cast.test index fdaa3fd72242..61f367326ac4 100644 --- a/test/sql/cast/string_to_list_cast.test +++ b/test/sql/cast/string_to_list_cast.test @@ -511,11 +511,11 @@ statement error select '[{"bar":"\\""}]'::VARCHAR[]; ---- -# escapes are only processed once the {} is cast as well +# Unescaped doublequote ends the quote early, leaving an uneven amount of `"`, causing an error statement error -query I select '[{"bar":"\\""}]'::STRUCT(bar VARCHAR)[]; ---- +can't be cast to the destination type LIST # uneven amount of escapes does escape the " query I From 384f5f01b5bf4f584975bbd2a66e0733756a499c Mon Sep 17 00:00:00 2001 From: Tishj Date: Wed, 12 Feb 2025 13:10:06 +0100 Subject: [PATCH 037/142] moved the escaped case to the start of the cases, reduces complexity of the bodies of the other cases --- src/function/cast/vector_cast_helpers.cpp | 37 ++-- test/sql/cast/string_to_list_escapes.test | 198 ++++++++++++++++++++++ 2 files changed, 215 insertions(+), 20 deletions(-) create mode 100644 test/sql/cast/string_to_list_escapes.test diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index cb0feb7dce9f..16cc40192bab 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -235,26 +235,27 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { } bool set_escaped = false; - if (buf[pos] == '[') { + if (input_state.escaped) { + if (!start_pos.IsValid()) { + start_pos = pos; + } + end_pos = pos; + } else if (buf[pos] == '[') { if (!start_pos.IsValid()) { start_pos = 
pos; } //! Start of a LIST - if (!input_state.escaped) { - lvl++; - if (!SkipToClose(input_state, lvl, ']')) { - return false; - } + lvl++; + if (!SkipToClose(input_state, lvl, ']')) { + return false; } end_pos = pos; } else if ((buf[pos] == '"' || buf[pos] == '\'')) { if (!start_pos.IsValid()) { start_pos = pos; } - if (!input_state.escaped) { - if (!SkipToCloseQuotes(input_state)) { - return false; - } + if (!SkipToCloseQuotes(input_state)) { + return false; } end_pos = pos; } else if (buf[pos] == '{') { @@ -262,14 +263,12 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { start_pos = pos; } //! Start of a STRUCT - if (!input_state.escaped) { - idx_t struct_lvl = 0; - if (!SkipToClose(input_state, struct_lvl, '}')) { - return false; - } + idx_t struct_lvl = 0; + if (!SkipToClose(input_state, struct_lvl, '}')) { + return false; } end_pos = pos; - } else if (buf[pos] == ',' || buf[pos] == ']') { + } else if ((buf[pos] == ',' || buf[pos] == ']')) { if (buf[pos] != ']' || start_pos.IsValid() || seen_value) { if (!start_pos.IsValid()) { state.HandleValue(buf, 0, 0); @@ -293,11 +292,9 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { if (!start_pos.IsValid()) { start_pos = pos; } - if (!input_state.escaped) { - set_escaped = true; - } + set_escaped = true; end_pos = pos; - } else if (!StringUtil::CharacterIsSpace(buf[pos]) || input_state.escaped) { + } else if (!StringUtil::CharacterIsSpace(buf[pos])) { if (!start_pos.IsValid()) { start_pos = pos; } diff --git a/test/sql/cast/string_to_list_escapes.test b/test/sql/cast/string_to_list_escapes.test new file mode 100644 index 000000000000..aca72ada840f --- /dev/null +++ b/test/sql/cast/string_to_list_escapes.test @@ -0,0 +1,198 @@ +# name: test/sql/cast/string_to_list_escapes.test +# group: [cast] + +query I +SELECT $$[hello, world]$$::VARCHAR[]; +---- +[hello, world] + +query I +SELECT $$[hello\ world, world]$$::VARCHAR[]; +---- +[hello world, world] + +query I +SELECT 
$$[hello\,world, test]$$::VARCHAR[]; +---- +[hello,world, test] + +query I +SELECT $$[hello\,, test]$$::VARCHAR[]; +---- +[hello,, test] + +query I +SELECT $$[hello\"quoted\"text, more]$$::VARCHAR[]; +---- +[hello"quoted"text, more] + +query I +SELECT $$[escaped\\backslash, test]$$::VARCHAR[]; +---- +[escaped\backslash, test] + +query I +SELECT $$[nested[brackets], test]$$::VARCHAR[]; +---- +[nested[brackets], test] + +query I +SELECT $$[quote\'in\'string, test]$$::VARCHAR[]; +---- +[quote'in'string, test] + +query I +SELECT $$[mix\ of\ special\,chars]$$::VARCHAR[]; +---- +[mix of special,chars] + +query I +SELECT $$["ends with space ", "trailing space "]$$::VARCHAR[]; +---- +[ends with space , trailing space ] + +query I +SELECT $$["ends with comma,", "another,"]$$::VARCHAR[]; +---- +[ends with comma,, another,] + +query I +SELECT $$["quote at end\"", "\""]$$::VARCHAR[]; +---- +[quote at end", "] + +query I +SELECT $$["ends with bracket]", "[bracket"]$$::VARCHAR[]; +---- +[ends with bracket], [bracket] + +query I +SELECT $$["backslash at end\\", "\\"]$$::VARCHAR[]; +---- +[backslash at end\, \] + +query I +SELECT $$[" space at start", " leading space"]$$::VARCHAR[]; +---- +[ space at start, leading space] + +query I +SELECT $$[",comma at start", ",leading comma"]$$::VARCHAR[]; +---- +[,comma at start, ,leading comma] + +query I +SELECT $$["\"quote at start", "\"leading quote"]$$::VARCHAR[]; +---- +["quote at start, "leading quote] + +query I +SELECT $$["[bracket at start", "[leading bracket"]$$::VARCHAR[]; +---- +[[bracket at start, [leading bracket] + +query I +SELECT $$["\\backslash at start", "\\leading backslash"]$$::VARCHAR[]; +---- +[\backslash at start, \leading backslash] + +query I +SELECT $$[" space at start and end ", " leading and trailing space "]$$::VARCHAR[]; +---- +[ space at start and end , leading and trailing space ] + +query I +SELECT $$[",comma at start and end,", ",leading and trailing comma,"]$$::VARCHAR[]; +---- +[,comma at start and end,, 
,leading and trailing comma,] + +query I +SELECT $$["\"quote at start and end\"", "\"leading and trailing quote\""]$$::VARCHAR[]; +---- +["quote at start and end", "leading and trailing quote"] + +query I +SELECT $$["[bracket at start and end]", "[leading and trailing bracket]"]$$::VARCHAR[]; +---- +[[bracket at start and end], [leading and trailing bracket]] + +query I +SELECT $$["\\backslash at start and end\\", "\\leading and trailing backslash\\"]$$::VARCHAR[]; +---- +[\backslash at start and end\, \leading and trailing backslash\] + + +query I +SELECT $$[" mix, of special\ characters " , "[various] \"combinations\" "]$$::VARCHAR[]; +---- +[ mix, of special characters , [various] "combinations" ] + +query I +SELECT $$[", starts and ends with ,", "[brackets] and ,commas,"]$$::VARCHAR[]; +---- +[, starts and ends with ,, [brackets] and ,commas,] + +query I +SELECT $$["\"quotes\" and \ spaces ", "\ leading and trailing \ "]$$::VARCHAR[]; +---- +["quotes" and spaces , leading and trailing ] + +query I +SELECT $$["[complex\ combination, of\" special]", "\\all cases covered\\"]$$::VARCHAR[]; +---- +[[complex combination, of" special], \all cases covered\] + +query I +SELECT $$["hello, world"]$$::VARCHAR[]; +---- +[hello, world] + +statement error +SELECT $$["missing quote]]$$::VARCHAR[]; -- Mismatched quotes +---- +can't be cast to the destination type + +statement error +SELECT $$["backslash at end\"]$$::VARCHAR[]; -- Improper escaping +---- +can't be cast to the destination type + +statement error +SELECT $$[unescaped[bracket]$$::VARCHAR[]; -- Unescaped bracket +---- +can't be cast to the destination type + +statement error +SELECT $$[unterminated string]"]$$::VARCHAR[]; +---- +can't be cast to the destination type + +query I +SELECT $$[]$$::VARCHAR[]; -- Empty list +---- +[] + +query I +SELECT $$[""]$$::VARCHAR[]; -- List with empty string +---- +[] + +query I +SELECT $$[" "]$$::VARCHAR[]; -- List with whitespace string +---- +[ ] + +query I +SELECT 
$$["\\"]$$::VARCHAR[]; -- List with only a backslash +---- +[\] + +query I +SELECT $$["\""]$$::VARCHAR[]; -- List with only a quote +---- +["] + +query I +SELECT $$[\,]$$::VARCHAR[]; -- List with only a comma +---- +[,] From 26e8d22a7433508af51c087e40c7683227bfcdf8 Mon Sep 17 00:00:00 2001 From: Tishj Date: Wed, 12 Feb 2025 13:14:36 +0100 Subject: [PATCH 038/142] give varchar->struct the same treatment, escaped case should be on top --- src/function/cast/vector_cast_helpers.cpp | 80 ++++++++++------------- 1 file changed, 36 insertions(+), 44 deletions(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index 16cc40192bab..b443ac7e858e 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -565,56 +565,52 @@ bool VectorStringToStruct::SplitStruct(const string_t &input, vector Date: Wed, 12 Feb 2025 12:04:26 +0100 Subject: [PATCH 039/142] Ensure MergeCollectionTask has a writer --- src/execution/operator/persistent/physical_batch_insert.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/execution/operator/persistent/physical_batch_insert.cpp b/src/execution/operator/persistent/physical_batch_insert.cpp index 2e546c477282..f898af2c7014 100644 --- a/src/execution/operator/persistent/physical_batch_insert.cpp +++ b/src/execution/operator/persistent/physical_batch_insert.cpp @@ -215,7 +215,9 @@ class MergeCollectionTask : public BatchInsertTask { auto &gstate = gstate_p.Cast(); auto &lstate = lstate_p.Cast(); // merge together the collections - D_ASSERT(lstate.writer); + if (!lstate.writer) { + lstate.writer = &gstate.table.GetStorage().CreateOptimisticWriter(context); + } auto final_collection = gstate.MergeCollections(context, std::move(merge_collections), *lstate.writer); // add the merged-together collection to the set of batch indexes lock_guard l(gstate.lock); From e78d96e43c1262bdb8e0ab694d8abc08ec5b721b Mon Sep 17 00:00:00 2001 From: 
Tishj Date: Wed, 12 Feb 2025 13:17:56 +0100 Subject: [PATCH 040/142] and varchar->map as well --- src/function/cast/vector_cast_helpers.cpp | 67 +++++++++++------------ 1 file changed, 31 insertions(+), 36 deletions(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index b443ac7e858e..bc2e11085291 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -387,46 +387,43 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { idx_t end_pos; while (pos < len && (buf[pos] != '=' || input_state.escaped)) { bool set_escaped = false; - if (buf[pos] == '"' || buf[pos] == '\'') { + if (input_state.escaped) { if (!start_pos.IsValid()) { start_pos = pos; } - if (!input_state.escaped) { - if (!SkipToCloseQuotes(input_state)) { - return false; - } + end_pos = pos; + } else if (buf[pos] == '"' || buf[pos] == '\'') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!SkipToCloseQuotes(input_state)) { + return false; } end_pos = pos; } else if (buf[pos] == '{') { if (!start_pos.IsValid()) { start_pos = pos; } - if (!input_state.escaped) { - if (!SkipToClose(input_state, lvl, '}')) { - return false; - } + if (!SkipToClose(input_state, lvl, '}')) { + return false; } end_pos = pos; } else if (buf[pos] == '[') { if (!start_pos.IsValid()) { start_pos = pos; } - if (!input_state.escaped) { - lvl++; - if (!SkipToClose(input_state, lvl, ']')) { - return false; - } + lvl++; + if (!SkipToClose(input_state, lvl, ']')) { + return false; } end_pos = pos; } else if (buf[pos] == '\\') { if (!start_pos.IsValid()) { start_pos = pos; } - if (!input_state.escaped) { - set_escaped = true; - } + set_escaped = true; end_pos = pos; - } else if (!StringUtil::CharacterIsSpace(buf[pos]) || input_state.escaped) { + } else if (!StringUtil::CharacterIsSpace(buf[pos])) { if (!start_pos.IsValid()) { start_pos = pos; } @@ -451,46 +448,44 @@ static bool SplitStringMapInternal(const 
string_t &input, OP &state) { SkipWhitespace(input_state); while (pos < len && ((buf[pos] != ',' && buf[pos] != '}') || input_state.escaped)) { bool set_escaped = false; - if (buf[pos] == '"' || buf[pos] == '\'') { + + if (input_state.escaped) { if (!start_pos.IsValid()) { start_pos = pos; } - if (!input_state.escaped) { - if (!SkipToCloseQuotes(input_state)) { - return false; - } + end_pos = pos; + } else if (buf[pos] == '"' || buf[pos] == '\'') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!SkipToCloseQuotes(input_state)) { + return false; } end_pos = pos; } else if (buf[pos] == '{') { if (!start_pos.IsValid()) { start_pos = pos; } - if (!input_state.escaped) { - if (!SkipToClose(input_state, lvl, '}')) { - return false; - } + if (!SkipToClose(input_state, lvl, '}')) { + return false; } end_pos = pos; } else if (buf[pos] == '[') { if (!start_pos.IsValid()) { start_pos = pos; } - if (!input_state.escaped) { - lvl++; - if (!SkipToClose(input_state, lvl, ']')) { - return false; - } + lvl++; + if (!SkipToClose(input_state, lvl, ']')) { + return false; } end_pos = pos; } else if (buf[pos] == '\\') { if (!start_pos.IsValid()) { start_pos = pos; } - if (!input_state.escaped) { - set_escaped = true; - } + set_escaped = true; end_pos = pos; - } else if (!StringUtil::CharacterIsSpace(buf[pos]) || input_state.escaped) { + } else if (!StringUtil::CharacterIsSpace(buf[pos])) { if (!start_pos.IsValid()) { start_pos = pos; } From 9bdf2d5bc601a0126206b3d5ec7bd9803bce6e76 Mon Sep 17 00:00:00 2001 From: Tishj Date: Wed, 12 Feb 2025 13:21:56 +0100 Subject: [PATCH 041/142] same for unnamed structs --- src/function/cast/vector_cast_helpers.cpp | 41 +++++++++++------------ 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index bc2e11085291..d679c7126816 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -35,6 +35,7 @@ 
inline static void SkipWhitespace(StringCastInputState &input_state) { auto &pos = input_state.pos; auto &len = input_state.len; if (input_state.escaped) { + //! Escaped whitespace should not be skipped return; } while (pos < len && StringUtil::CharacterIsSpace(buf[pos])) { @@ -735,56 +736,52 @@ bool VectorStringToStruct::SplitStruct(const string_t &input, vector Date: Wed, 12 Feb 2025 09:56:56 -0300 Subject: [PATCH 042/142] I think I should use duckdb_destroy_extracted instead of delete --- src/common/adbc/adbc.cpp | 8 ++++---- test/api/adbc/test_adbc.cpp | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/common/adbc/adbc.cpp b/src/common/adbc/adbc.cpp index 0a77b0e67b42..95cc90639d7a 100644 --- a/src/common/adbc/adbc.cpp +++ b/src/common/adbc/adbc.cpp @@ -880,7 +880,7 @@ AdbcStatusCode StatementSetSqlQuery(struct AdbcStatement *statement, const char auto error_msg_extract_statements = duckdb_extract_statements_error(extracted_statements); if (error_msg_extract_statements != nullptr) { // Things went wrong when executing internal prepared statement - delete (reinterpret_cast (&extracted_statements)); + duckdb_destroy_extracted(&extracted_statements); SetError(error, error_msg_extract_statements); return ADBC_STATUS_INTERNAL; } @@ -893,7 +893,7 @@ AdbcStatusCode StatementSetSqlQuery(struct AdbcStatement *statement, const char auto adbc_status = CheckResult(res, error, error_msg); if (adbc_status != ADBC_STATUS_OK) { // Things went wrong when executing internal prepared statement - delete (reinterpret_cast (&extracted_statements)); + duckdb_destroy_extracted(&extracted_statements); delete statement_internal; return adbc_status; } @@ -904,7 +904,7 @@ AdbcStatusCode StatementSetSqlQuery(struct AdbcStatement *statement, const char SetError(error, duckdb_query_arrow_error(out_result)); delete out_result; delete statement_internal; - delete (reinterpret_cast (&extracted_statements)); + duckdb_destroy_extracted(&extracted_statements); return 
ADBC_STATUS_INVALID_ARGUMENT; } delete out_result; @@ -914,7 +914,7 @@ AdbcStatusCode StatementSetSqlQuery(struct AdbcStatement *statement, const char auto res = duckdb_prepare_extracted_statement(wrapper->connection, extracted_statements, extract_statements_size - 1, &wrapper->statement); auto error_msg = duckdb_prepare_error(wrapper->statement); - delete (reinterpret_cast (&extracted_statements)); + duckdb_destroy_extracted(&extracted_statements); return CheckResult(res, error, error_msg); } diff --git a/test/api/adbc/test_adbc.cpp b/test/api/adbc/test_adbc.cpp index 213ed7710dd0..124f6472e6a3 100644 --- a/test/api/adbc/test_adbc.cpp +++ b/test/api/adbc/test_adbc.cpp @@ -19,7 +19,7 @@ bool SUCCESS(AdbcStatusCode status) { return status == ADBC_STATUS_OK; } -const char *duckdb_lib = std::getenv("DUCKDB_INSTALL_LIB"); +const char *duckdb_lib = "/Users/holanda/Documents/Projects/duckdb/cmake-build-debug/src/libduckdb.dylib"; class ADBCTestDatabase { public: From 3edeac6b5069ee4ed66dffa60477ea4843d0e57d Mon Sep 17 00:00:00 2001 From: Tishj Date: Wed, 12 Feb 2025 13:59:44 +0100 Subject: [PATCH 043/142] turn off RESPECT_SCOPES for struct keys, so escapes are interpreted even when they would otherwise be 'inside of a deeper scope' --- src/function/cast/vector_cast_helpers.cpp | 51 +++------- test/sql/cast/string_to_struct_escapes.test | 103 ++++++++++++++++++++ 2 files changed, 118 insertions(+), 36 deletions(-) create mode 100644 test/sql/cast/string_to_struct_escapes.test diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index d679c7126816..092c02d10c8a 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -126,6 +126,7 @@ struct CountPartOperation { } }; +template static string_t HandleString(Vector &vec, const char *buf, idx_t start, idx_t end) { D_ASSERT(start <= end); auto length = end - start; @@ -162,17 +163,20 @@ static string_t HandleString(Vector &vec, const 
char *buf, idx_t start, idx_t en scopes.pop(); } if (!quoted && (current_char == '[' || current_char == '{' || current_char == '(')) { - //! New scope - char end_char; - if (current_char == '[') { - end_char = ']'; - } else if (current_char == '{') { - end_char = '}'; - } else { - D_ASSERT(current_char == '('); - end_char = ')'; + if (RESPECT_SCOPES) { + //! 'RESPECT_SCOPES' is false in things like STRUCT keys, these are regular strings + //! New scope + char end_char; + if (current_char == '[') { + end_char = ']'; + } else if (current_char == '{') { + end_char = '}'; + } else { + D_ASSERT(current_char == '('); + end_char = ')'; + } + scopes.push(end_char); } - scopes.push(end_char); } //! Regular character string_data[copied_count++] = current_char; @@ -575,31 +579,6 @@ bool VectorStringToStruct::SplitStruct(const string_t &input, vector(temp_vec, buf, key_start, end_pos); auto it = child_names.find(child_name); if (it == child_names.end()) { return false; // false key diff --git a/test/sql/cast/string_to_struct_escapes.test b/test/sql/cast/string_to_struct_escapes.test new file mode 100644 index 000000000000..9d9f03d4eaac --- /dev/null +++ b/test/sql/cast/string_to_struct_escapes.test @@ -0,0 +1,103 @@ +# name: test/sql/cast/string_to_struct_escapes.test +# group: [cast] + +query I +SELECT $${name: value, age: 30}$$::STRUCT(name VARCHAR, age INT); +---- +{'name': value, 'age': 30} + +query I +SELECT $${name: John, city: "New York"}$$::STRUCT(name VARCHAR, city VARCHAR); +---- +{'name': John, 'city': New York} + +query I +SELECT $${quote_at_start: "\"test\"", age: 30}$$::STRUCT(quote_at_start VARCHAR, age INT); +---- +{'quote_at_start': "test", 'age': 30} + +query I +SELECT $${user_name: Alice, status: active}$$::STRUCT(user_name VARCHAR, status VARCHAR); +---- +{'user_name': Alice, 'status': active} + +query I +SELECT $${special_characters: "comma, semicolon; and backslash\\", age: 30}$$::STRUCT(special_characters VARCHAR, age INT); +---- +{'special_characters': 
comma, semicolon; and backslash\, 'age': 30} + +query I +SELECT $${a: 10, b: "hello world"}$$::STRUCT(a INT, b VARCHAR); +---- +{'a': 10, 'b': hello world} + +query I +SELECT $${first_name: "John", last_name: "Doe", age: 28}$$::STRUCT(first_name VARCHAR, last_name VARCHAR, age INT); +---- +{'first_name': John, 'last_name': Doe, 'age': 28} + +query I +SELECT $${first name: John, age: 30}$$::STRUCT("first name" VARCHAR, age INT); +---- +{'first name': John, 'age': 30} + +# Invalid: Value contains a quote that isn't escaped +statement error +SELECT $${name: "John "Doe"}$$::STRUCT(name VARCHAR); +---- +can't be cast to the destination type + +# Invalid: Value contains a comma that isn't escaped +statement error +SELECT $${name: John, age, 30}$$::STRUCT(name VARCHAR, age INT); +---- +can't be cast to the destination type + +# Name is free to contain `,`, only `:` is problematic +query I +SELECT $${user,name: Alice, age: 30}$$::STRUCT("user,name" VARCHAR, age INT); +---- +{'user,name': Alice, 'age': 30} + +# Invalid: Contains an unescaped closing bracket +statement error +SELECT $${name: Alice, age: 30})$$::STRUCT(name VARCHAR, age INT); +---- +can't be cast to the destination type + +# Invalid: Name contains a backslash +statement error +SELECT $${backslash\name: value}$$::STRUCT("backslash\name" VARCHAR); +---- +can't be cast to the destination type + +# first `:` is not escaped, won't match the "name:" struct key +statement error +SELECT $${name: test, value: 30}$$::STRUCT("name:" VARCHAR, value INT); +---- +can't be cast to the destination type + +# Name can contain escaped `:` +query I +SELECT $${name\:: test, value: 30}$$::STRUCT("name:" VARCHAR, value INT); +---- +{'name:': test, 'value': 30} + +# Name consists of `{}`, not a problem, with this syntax we expect a name, which is a plain string +# Only reserved character there is `:` (and quotes, and backslash of course) +query I +SELECT $${{name}: John, age: 3}$$::STRUCT("{name}" VARCHAR, age INT); +---- 
+{'{name}': John, 'age': 3} + +# Name has `{` which normally starts a bracket that disables interpreting escape characters +query I +SELECT $${{\"name\"}: John, age: 3}$$::STRUCT("{""name""}" VARCHAR, age INT); +---- +{'{"name"}': John, 'age': 3} + +# Invalid: Unterminated string value +statement error +SELECT $${name: "John, age: 30}$$::STRUCT(name VARCHAR, age INT); +---- +can't be cast to the destination type From da9ab9032a69d10d5a261183f9536c1312f2e2c8 Mon Sep 17 00:00:00 2001 From: Tishj Date: Wed, 12 Feb 2025 14:19:15 +0100 Subject: [PATCH 044/142] more tests --- test/sql/cast/string_to_struct_escapes.test | 103 +++++++++++++++++++- 1 file changed, 102 insertions(+), 1 deletion(-) diff --git a/test/sql/cast/string_to_struct_escapes.test b/test/sql/cast/string_to_struct_escapes.test index 9d9f03d4eaac..869a48bc35c6 100644 --- a/test/sql/cast/string_to_struct_escapes.test +++ b/test/sql/cast/string_to_struct_escapes.test @@ -47,7 +47,7 @@ SELECT $${name: "John "Doe"}$$::STRUCT(name VARCHAR); ---- can't be cast to the destination type -# Invalid: Value contains a comma that isn't escaped +# second key has no ending character (:) statement error SELECT $${name: John, age, 30}$$::STRUCT(name VARCHAR, age INT); ---- @@ -101,3 +101,104 @@ statement error SELECT $${name: "John, age: 30}$$::STRUCT(name VARCHAR, age INT); ---- can't be cast to the destination type + +query I +SELECT $${}$$::STRUCT(name VARCHAR, age INT); +---- +{'name': NULL, 'age': NULL} + +# STRUCT with whitespace around colon (escaped) +query I +SELECT $${name : John, age : 30}$$::STRUCT(name VARCHAR, age INT); +---- +{'name': John, 'age': 30} + +# STRUCT with escaped backslash in value +query I +SELECT $${path: "C:\\Users\\John"}$$::STRUCT(path VARCHAR); +---- +{'path': C:\Users\John} + +# STRUCT with special characters in value, properly escaped +query I +SELECT $${description: "Special characters: \\, \", ;, (, )"}$$::STRUCT(description VARCHAR); +---- +{'description': Special characters: \, ", 
;, (, )} + +# Valid: Name with escaped space +query I +SELECT $${first\ name: "John", age: 30}$$::STRUCT("first name" VARCHAR, age INT); +---- +{'first name': John, 'age': 30} + +# Valid: Name with escaped quote +query I +SELECT $${\"quote at start\": "value", age: 30}$$::STRUCT("""quote at start""" VARCHAR, age INT); +---- +{'"quote at start"': value, 'age': 30} + +# Valid: Name with escaped backslash +query I +SELECT $${backslash\\name: "John Doe", age: 30}$$::STRUCT("backslash\name" VARCHAR, age INT); +---- +{'backslash\name': John Doe, 'age': 30} + +# Valid: Name with escaped comma +query I +SELECT $${user\,name: "Alice", age: 25}$$::STRUCT("user,name" VARCHAR, age INT); +---- +{'user,name': Alice, 'age': 25} + +# Valid: Name with escaped parenthesis +query I +SELECT $${user\(name\): "Alice", status: "active"}$$::STRUCT("user(name)" VARCHAR, status VARCHAR); +---- +{'user(name)': Alice, 'status': active} + +# Valid: Name with unescaped parenthesis +query I +SELECT $${user(name): "Alice", status: "active"}$$::STRUCT("user(name)" VARCHAR, status VARCHAR); +---- +{'user(name)': Alice, 'status': active} + +# Valid: Name with escaped space at end +query I +SELECT $${user\ name\ : "Alice", age\ : 25}$$::STRUCT("user name " VARCHAR, "age " INT); +---- +{'user name ': Alice, 'age ': 25} + +# Invalid: Name contains unescaped quote +statement error +SELECT $${"quote"start": "value", age: 30}$$::STRUCT("quote""start" VARCHAR, age INT); +---- +can't be cast to the destination type + +# Invalid: Name contains unescaped backslash +statement error +SELECT $${backslash\name: "John", age: 30}$$::STRUCT("backslash\name" VARCHAR, age INT); +---- +can't be cast to the destination type + +# Valid: Name contains (unescaped) opening parenthesis +query I +SELECT $${user(name: "Alice", age: 25}$$::STRUCT("user(name" VARCHAR, age INT); +---- +{'user(name': Alice, 'age': 25} + +# Name is single double quote +query I +SELECT $${\": "value", age: 30}$$::STRUCT("""" VARCHAR, age INTEGER) 
+---- +{'"': value, 'age': 30} + +# Name with only a special character (escaped) +query I +SELECT $${\\: "escaped", age: 30}$$::STRUCT("\" VARCHAR, age INT); +---- +{'\': escaped, 'age': 30} + +# Name with only a special character (not escaped) +query I +SELECT $${@: "value", age: 30}$$::STRUCT("@" VARCHAR, age INT); +---- +{'@': value, 'age': 30} From ba1cb2eb64a68a6a24094f4fe8dfb9cf23f9c18f Mon Sep 17 00:00:00 2001 From: Tishj Date: Wed, 12 Feb 2025 14:49:59 +0100 Subject: [PATCH 045/142] add escape tests for maps, also fix a bug: map keys are allowed to be empty, should accept this in cast --- src/function/cast/vector_cast_helpers.cpp | 11 +- test/sql/cast/string_to_map_escapes.test | 141 ++++++++++++++++++++++ 2 files changed, 146 insertions(+), 6 deletions(-) create mode 100644 test/sql/cast/string_to_map_escapes.test diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index 092c02d10c8a..be2a21b195c8 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -280,7 +280,6 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { } else { auto start = start_pos.GetIndex(); auto end = (end_pos + 1) - start; - auto substr = std::string(buf + start, end); state.HandleValue(buf, start, end_pos + 1); } seen_value = true; @@ -441,11 +440,12 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { return false; } if (!start_pos.IsValid()) { - //! Key can not be empty - return false; + start_pos = 0; + end_pos = 0; + } else { + end_pos++; } - auto key_substr = std::string(buf + start_pos.GetIndex(), buf + end_pos + 1); - if (!state.HandleKey(buf, start_pos.GetIndex(), end_pos + 1)) { + if (!state.HandleKey(buf, start_pos.GetIndex(), end_pos)) { return false; } start_pos = optional_idx(); @@ -506,7 +506,6 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { //! 
Value is empty state.HandleValue(buf, 0, 0); } else { - auto value_substr = std::string(buf + start_pos.GetIndex(), buf + end_pos + 1); state.HandleValue(buf, start_pos.GetIndex(), end_pos + 1); } if (buf[pos] == '}') { diff --git a/test/sql/cast/string_to_map_escapes.test b/test/sql/cast/string_to_map_escapes.test new file mode 100644 index 000000000000..40cf7834451b --- /dev/null +++ b/test/sql/cast/string_to_map_escapes.test @@ -0,0 +1,141 @@ +# name: test/sql/cast/string_to_map_escapes.test +# group: [cast] + +# Valid: key and value with escaped space +query I +SELECT $${key\ with\ space = value\ with\ space}$$::MAP(VARCHAR, VARCHAR); +---- +{key with space=value with space} + +# Valid: key with escaped quote and value with escaped quote +query I +SELECT $${\"key\" = \"value\"}$$::MAP(VARCHAR, VARCHAR); +---- +{"key"="value"} + +# Valid: key with escaped backslash, value with escaped backslash +query I +SELECT $${key\ with\ backslash = value\ with\ backslash}$$::MAP(VARCHAR, VARCHAR); +---- +{key with backslash=value with backslash} + +# Valid: key with escaped comma, value with escaped comma +query I +SELECT $${key\ with\, comma = value\ with\, comma}$$::MAP(VARCHAR, VARCHAR); +---- +{key with, comma=value with, comma} + +# Valid: key and value with escaped colon +query I +SELECT $${key\ with\ colon\: = value\ with\ colon\:}$$::MAP(VARCHAR, VARCHAR); +---- +{key with colon:=value with colon:} + +## FIXME: not sure what to do here, maybe we shouldn't "respect scopes" if the child type is not nested +## Valid: key and value with parentheses +#query I +#SELECT $${key\ (with\ parens) = value\ (with\ parens)}$$::MAP(VARCHAR, VARCHAR); +#---- +#{key (with parens)=value (with parens)} + +# Valid: key contains unescaped space +query I +SELECT $${key with space = value with space}$$::MAP(VARCHAR, VARCHAR); +---- +{key with space=value with space} + +# Valid: key input contains quotes (the unescaped quotes are consumed by quote parsing) +query I +SELECT $${key"with"quote = value}$$::MAP(VARCHAR, VARCHAR); +---- 
+{keywithquote=value} + +# Valid: value input contains quotes +query I +SELECT $${key = value"with"quote}$$::MAP(VARCHAR, VARCHAR); +---- +{key=valuewithquote} + +# Valid: key contains unescaped comma +query I +SELECT $${key,with,comma = value}$$::MAP(VARCHAR, VARCHAR); +---- +{key,with,comma=value} + +# Invalid: value contains unescaped comma +statement error +SELECT $${key = value,with,comma}$$::MAP(VARCHAR, VARCHAR); +---- +can't be cast to the destination type MAP + +# Valid: key contains unescaped curly bracket +query I +SELECT $${key{with}bracket = value}$$::MAP(VARCHAR, VARCHAR); +---- +{key{with}bracket=value} + +# Valid: value contains unescaped curly bracket +query I +SELECT $${key = value{with}bracket}$$::MAP(VARCHAR, VARCHAR); +---- +{key=value{with}bracket} + +# Valid: key contains useless backslashes +query I +SELECT $${key\with\backslash = value}$$::MAP(VARCHAR, VARCHAR); +---- +{keywithbackslash=value} + +# Valid: value contains useless backslashes +query I +SELECT $${key = value\with\backslash}$$::MAP(VARCHAR, VARCHAR); +---- +{key=valuewithbackslash} + +# Valid: key/value contains unescaped equal sign +query II +SELECT $${key=with=equals = value}$$::MAP(VARCHAR, VARCHAR) a, a['key']; +---- +{key=with=equals = value} with=equals = value + +# Valid: key/value contains unescaped equal sign +query II +SELECT $${key\=with=equals = value}$$::MAP(VARCHAR, VARCHAR) a, a['key=with']; +---- +{key=with=equals = value} equals = value + +# Valid: key/value contains unescaped equal sign +query II +SELECT $${key\=with\=equals = value}$$::MAP(VARCHAR, VARCHAR) a, a['key=with=equals']; +---- +{key=with=equals=value} value + +# Edge Case: Empty MAP with no keys/values +query I +SELECT $${}$$::MAP(VARCHAR, VARCHAR); +---- +{} + +# Valid: MAP with empty key and value +query I +SELECT $${=}$$::MAP(VARCHAR, VARCHAR); +---- +{=} + +# Edge Case: MAP with special characters only (escaped) +query I +SELECT $${\{escaped\brace\} = \}escaped\brace\\}$$::MAP(VARCHAR, 
VARCHAR); +---- +{{escapedbrace}=}escapedbrace\} + +# Edge Case: MAP with only a key and no value +query I +SELECT $${key=}$$::MAP(VARCHAR, VARCHAR); +---- +{key=} + +# Valid: MAP with an empty key +query I +SELECT $${=value}$$::MAP(VARCHAR, VARCHAR); +---- +{=value} From 89eccf147cc017e9d7c7c279575b45da33208144 Mon Sep 17 00:00:00 2001 From: pdet Date: Wed, 12 Feb 2025 10:56:14 -0300 Subject: [PATCH 046/142] Woopsie --- test/api/adbc/test_adbc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/api/adbc/test_adbc.cpp b/test/api/adbc/test_adbc.cpp index 124f6472e6a3..213ed7710dd0 100644 --- a/test/api/adbc/test_adbc.cpp +++ b/test/api/adbc/test_adbc.cpp @@ -19,7 +19,7 @@ bool SUCCESS(AdbcStatusCode status) { return status == ADBC_STATUS_OK; } -const char *duckdb_lib = "/Users/holanda/Documents/Projects/duckdb/cmake-build-debug/src/libduckdb.dylib"; +const char *duckdb_lib = std::getenv("DUCKDB_INSTALL_LIB"); class ADBCTestDatabase { public: From f1179bf6b65ef8941c6fdd1416f9baa7ba9b5836 Mon Sep 17 00:00:00 2001 From: Tishj Date: Wed, 12 Feb 2025 15:31:51 +0100 Subject: [PATCH 047/142] fix unused variable --- src/function/cast/vector_cast_helpers.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index be2a21b195c8..f00bb8157249 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -279,7 +279,6 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { state.HandleValue(buf, 0, 0); } else { auto start = start_pos.GetIndex(); - auto end = (end_pos + 1) - start; state.HandleValue(buf, start, end_pos + 1); } seen_value = true; From c640ee19654b461dc3566987682c683ddb4c31b4 Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Wed, 12 Feb 2025 16:06:39 +0100 Subject: [PATCH 048/142] improve performance of hashing longer strings --- src/common/types/hash.cpp | 73 
+++++++++++---------------------------- 1 file changed, 21 insertions(+), 52 deletions(-) diff --git a/src/common/types/hash.cpp b/src/common/types/hash.cpp index 83a1ef22310e..160d5f3c2924 100644 --- a/src/common/types/hash.cpp +++ b/src/common/types/hash.cpp @@ -4,6 +4,7 @@ #include "duckdb/common/types/string_type.hpp" #include "duckdb/common/types/interval.hpp" #include "duckdb/common/types/uhugeint.hpp" +#include "duckdb/common/fast_mem.hpp" #include #include @@ -80,68 +81,36 @@ hash_t Hash(char *val) { return Hash(val); } -// MIT License -// Copyright (c) 2018-2021 Martin Ankerl -// https://github.com/martinus/robin-hood-hashing/blob/3.11.5/LICENSE -hash_t HashBytes(void *ptr, size_t len) noexcept { - static constexpr uint64_t M = UINT64_C(0xc6a4a7935bd1e995); - static constexpr uint64_t SEED = UINT64_C(0xe17a1465); - static constexpr unsigned int R = 47; +hash_t HashBytes(const_data_ptr_t ptr, const idx_t len) noexcept { + // This seed slightly improves bit distribution, taken from here: + // https://github.com/martinus/robin-hood-hashing/blob/3.11.5/LICENSE + // MIT License Copyright (c) 2018-2021 Martin Ankerl + hash_t h = 0xe17a1465U ^ (len * 0xc6a4a7935bd1e995U); - auto const *const data64 = static_cast(ptr); - uint64_t h = SEED ^ (len * M); - - size_t const n_blocks = len / 8; - for (size_t i = 0; i < n_blocks; ++i) { - auto k = Load(reinterpret_cast(data64 + i)); + // Hash/combine in blocks of 8 bytes + for (const auto end = ptr + len - (len & 7U); ptr != end; ptr += 8U) { + h ^= Load(ptr); + h *= 0xd6e8feb86659fd93U; + } - k *= M; - k ^= k >> R; - k *= M; + // XOR with remaining (<8) bytes + hash_t hr = 0; + FastMemcpy(&hr, ptr, len & 7U); + h ^= hr; - h ^= k; - h *= M; - } + // Finalize + h *= 0xd6e8feb86659fd93U; + h ^= h >> 32; - auto const *const data8 = reinterpret_cast(data64 + n_blocks); - switch (len & 7U) { - case 7: - h ^= static_cast(data8[6]) << 48U; - DUCKDB_EXPLICIT_FALLTHROUGH; - case 6: - h ^= static_cast(data8[5]) << 40U; - 
DUCKDB_EXPLICIT_FALLTHROUGH; - case 5: - h ^= static_cast(data8[4]) << 32U; - DUCKDB_EXPLICIT_FALLTHROUGH; - case 4: - h ^= static_cast(data8[3]) << 24U; - DUCKDB_EXPLICIT_FALLTHROUGH; - case 3: - h ^= static_cast(data8[2]) << 16U; - DUCKDB_EXPLICIT_FALLTHROUGH; - case 2: - h ^= static_cast(data8[1]) << 8U; - DUCKDB_EXPLICIT_FALLTHROUGH; - case 1: - h ^= static_cast(data8[0]); - h *= M; - DUCKDB_EXPLICIT_FALLTHROUGH; - default: - break; - } - h ^= h >> R; - h *= M; - h ^= h >> R; - return static_cast(h); + return h; } hash_t Hash(const char *val, size_t size) { - return HashBytes((void *)val, size); + return HashBytes(const_data_ptr_cast(val), size); } hash_t Hash(uint8_t *val, size_t size) { - return HashBytes((void *)val, size); + return HashBytes(const_data_ptr_cast(val), size); } } // namespace duckdb From f503fbad1dfb57da6964601c44714e94c84c3d56 Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Wed, 12 Feb 2025 16:07:28 +0100 Subject: [PATCH 049/142] implement bit-packing for RleBpEncoder and allow for larger dictionaries --- .../include/parquet_rle_bp_encoder.hpp | 139 ++++++++++-------- .../writer/templated_column_writer.hpp | 9 +- extension/parquet/parquet_extension.cpp | 12 +- .../parquet/writer/enum_column_writer.cpp | 9 +- .../writer/primitive_column_writer.cpp | 21 ++- 5 files changed, 107 insertions(+), 83 deletions(-) diff --git a/extension/parquet/include/parquet_rle_bp_encoder.hpp b/extension/parquet/include/parquet_rle_bp_encoder.hpp index af321c160c17..689d0fee132c 100644 --- a/extension/parquet/include/parquet_rle_bp_encoder.hpp +++ b/extension/parquet/include/parquet_rle_bp_encoder.hpp @@ -16,95 +16,116 @@ namespace duckdb { class RleBpEncoder { public: - explicit RleBpEncoder(uint32_t bit_width) - : byte_width((bit_width + 7) / 8), byte_count(idx_t(-1)), run_count(idx_t(-1)) { + explicit RleBpEncoder(uint32_t bit_width_p) : bit_width(bit_width_p), byte_width((bit_width + 7) / 8) { } public: - //! 
NOTE: Prepare is only required if a byte count is required BEFORE writing - //! This is the case with e.g. writing repetition/definition levels - //! If GetByteCount() is not required, prepare can be safely skipped - void BeginPrepare(uint32_t first_value) { - byte_count = 0; - run_count = 1; - current_run_count = 1; - last_value = first_value; - } - void PrepareValue(uint32_t value) { - if (value != last_value) { - FinishRun(); - last_value = value; - } else { - current_run_count++; - } - } - void FinishPrepare() { - FinishRun(); + void BeginWrite() { + rle_count = 0; + bp_block_count = 0; } - void BeginWrite(WriteStream &writer, uint32_t first_value) { - // start the RLE runs - last_value = first_value; - current_run_count = 1; - } void WriteValue(WriteStream &writer, uint32_t value) { - if (value != last_value) { + if (bp_block_count != 0) { + // We already committed to a BP run + D_ASSERT(rle_count == 0); + bp_block[bp_block_count++] = value; + if (bp_block_count == BP_BLOCK_SIZE) { + WriteRun(writer); + } + return; + } + + if (rle_count == 0) { + // Starting fresh, try for an RLE run first + rle_value = value; + rle_count = 1; + return; + } + + // We're trying for an RLE run + if (rle_value == value) { + // Same as current RLE value + rle_count++; + return; + } + + // Value differs from current RLE value + if (rle_count >= MINIMUM_RLE_COUNT) { + // We have enough values for an RLE run WriteRun(writer); - last_value = value; - } else { - current_run_count++; + rle_value = value; + rle_count = 1; + return; + } + + // Not enough values, convert and commit to a BP run + D_ASSERT(bp_block_count == 0); + for (idx_t i = 0; i < rle_count; i++) { + bp_block[bp_block_count++] = rle_value; } + bp_block[bp_block_count++] = value; + rle_count = 0; } + void FinishWrite(WriteStream &writer) { WriteRun(writer); } - idx_t GetByteCount() { - D_ASSERT(byte_count != idx_t(-1)); - return byte_count; - } - private: - //! meta information + //! 
Meta information + uint32_t bit_width; uint32_t byte_width; - //! RLE run information - idx_t byte_count; - idx_t run_count; - idx_t current_run_count; - uint32_t last_value; + + //! RLE stuff + static constexpr idx_t MINIMUM_RLE_COUNT = 4; + uint32_t rle_value; + idx_t rle_count; + + //! BP stuff + static constexpr idx_t BP_BLOCK_SIZE = BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE; + uint32_t bp_block[BP_BLOCK_SIZE] = {0}; + uint32_t bp_block_packed[BP_BLOCK_SIZE] = {0}; + idx_t bp_block_count; private: - void FinishRun() { - // last value, or value has changed - // write out the current run - byte_count += ParquetDecodeUtils::GetVarintSize(current_run_count << 1) + byte_width; - current_run_count = 1; - run_count++; - } void WriteRun(WriteStream &writer) { - // write the header of the run - ParquetDecodeUtils::VarintEncode(current_run_count << 1, writer); - // now write the value - D_ASSERT(last_value >> (byte_width * 8) == 0); + if (rle_count != 0) { + WriteCurrentBlockRLE(writer); + } else { + WriteCurrentBlockBP(writer); + } + } + + void WriteCurrentBlockRLE(WriteStream &writer) { + ParquetDecodeUtils::VarintEncode(rle_count << 1 | 0, writer); // (... | 0) signals RLE run + D_ASSERT(rle_value >> (byte_width * 8) == 0); switch (byte_width) { case 1: - writer.Write(last_value); + writer.Write(rle_value); break; case 2: - writer.Write(last_value); + writer.Write(rle_value); break; case 3: - writer.Write(last_value & 0xFF); - writer.Write((last_value >> 8) & 0xFF); - writer.Write((last_value >> 16) & 0xFF); + writer.Write(rle_value & 0xFF); + writer.Write((rle_value >> 8) & 0xFF); + writer.Write((rle_value >> 16) & 0xFF); break; case 4: - writer.Write(last_value); + writer.Write(rle_value); break; default: throw InternalException("unsupported byte width for RLE encoding"); } - current_run_count = 1; + rle_count = 0; + } + + void WriteCurrentBlockBP(WriteStream &writer) { + ParquetDecodeUtils::VarintEncode(BP_BLOCK_SIZE / 8 << 1 | 1, writer); // (... 
| 1) signals BP run + ParquetDecodeUtils::BitPackAligned(bp_block, data_ptr_cast(bp_block_packed), BP_BLOCK_SIZE, bit_width); + writer.WriteData(data_ptr_cast(bp_block_packed), BP_BLOCK_SIZE * bit_width / 8); + bp_block_count = 0; } }; diff --git a/extension/parquet/include/writer/templated_column_writer.hpp b/extension/parquet/include/writer/templated_column_writer.hpp index a4324a70b48a..d9dbbc4bb4c4 100644 --- a/extension/parquet/include/writer/templated_column_writer.hpp +++ b/extension/parquet/include/writer/templated_column_writer.hpp @@ -240,15 +240,12 @@ class StandardColumnWriter : public PrimitiveColumnWriter { auto &src_val = data_ptr[r]; auto value_index = page_state.dictionary.at(src_val); if (!page_state.dict_written_value) { - // first value - // write the bit-width as a one-byte entry + // first value: write the bit-width as a one-byte entry and initialize writer temp_writer.Write(page_state.dict_bit_width); - // now begin writing the actual value - page_state.dict_encoder.BeginWrite(temp_writer, value_index); + page_state.dict_encoder.BeginWrite(); page_state.dict_written_value = true; - } else { - page_state.dict_encoder.WriteValue(temp_writer, value_index); } + page_state.dict_encoder.WriteValue(temp_writer, value_index); } break; } diff --git a/extension/parquet/parquet_extension.cpp b/extension/parquet/parquet_extension.cpp index f93b3f04acbe..9545b3cb96c6 100644 --- a/extension/parquet/parquet_extension.cpp +++ b/extension/parquet/parquet_extension.cpp @@ -192,7 +192,12 @@ struct ParquetWriteBindData : public TableFunctionData { bool debug_use_openssl = true; //! After how many distinct values should we abandon dictionary compression and bloom filters? - idx_t dictionary_size_limit = row_group_size / 100; + idx_t dictionary_size_limit = row_group_size / 20; + + void SetToDefaultDictionarySizeLimit() { + // This depends on row group size so we should "reset" if the row group size is changed + dictionary_size_limit = row_group_size / 20; + } //! 
What false positive rate are we willing to accept for bloom filters double bloom_filter_false_positive_ratio = 0.01; @@ -1185,6 +1190,7 @@ unique_ptr ParquetWriteBind(ClientContext &context, CopyFunctionBi D_ASSERT(names.size() == sql_types.size()); bool row_group_size_bytes_set = false; bool compression_level_set = false; + bool dictionary_size_limit_set = false; auto bind_data = make_uniq(); for (auto &option : input.info.options) { const auto loption = StringUtil::Lower(option.first); @@ -1194,6 +1200,9 @@ unique_ptr ParquetWriteBind(ClientContext &context, CopyFunctionBi } if (loption == "row_group_size" || loption == "chunk_size") { bind_data->row_group_size = option.second[0].GetValue(); + if (!dictionary_size_limit_set) { + bind_data->SetToDefaultDictionarySizeLimit(); + } } else if (loption == "row_group_size_bytes") { auto roption = option.second[0]; if (roption.GetTypeMutable().id() == LogicalTypeId::VARCHAR) { @@ -1269,6 +1278,7 @@ unique_ptr ParquetWriteBind(ClientContext &context, CopyFunctionBi throw BinderException("dictionary_size_limit must be greater than 0 or 0 to disable"); } bind_data->dictionary_size_limit = val; + dictionary_size_limit_set = true; } else if (loption == "bloom_filter_false_positive_ratio") { auto val = option.second[0].GetValue(); if (val <= 0) { diff --git a/extension/parquet/writer/enum_column_writer.cpp b/extension/parquet/writer/enum_column_writer.cpp index 51a2959cf36c..8518019efedd 100644 --- a/extension/parquet/writer/enum_column_writer.cpp +++ b/extension/parquet/writer/enum_column_writer.cpp @@ -36,15 +36,12 @@ void EnumColumnWriter::WriteEnumInternal(WriteStream &temp_writer, Vector &input for (idx_t r = chunk_start; r < chunk_end; r++) { if (mask.RowIsValid(r)) { if (!page_state.written_value) { - // first value - // write the bit-width as a one-byte entry + // first value: write the bit-width as a one-byte entry and initialize writer temp_writer.Write(bit_width); - // now begin writing the actual value - 
page_state.encoder.BeginWrite(temp_writer, ptr[r]); + page_state.encoder.BeginWrite(); page_state.written_value = true; - } else { - page_state.encoder.WriteValue(temp_writer, ptr[r]); } + page_state.encoder.WriteValue(temp_writer, ptr[r]); } } } diff --git a/extension/parquet/writer/primitive_column_writer.cpp b/extension/parquet/writer/primitive_column_writer.cpp index 627605fa23fa..d69504717365 100644 --- a/extension/parquet/writer/primitive_column_writer.cpp +++ b/extension/parquet/writer/primitive_column_writer.cpp @@ -123,22 +123,21 @@ void PrimitiveColumnWriter::WriteLevels(WriteStream &temp_writer, const unsafe_v } // write the levels using the RLE-BP encoding - auto bit_width = RleBpDecoder::ComputeBitWidth((max_value)); + const auto bit_width = RleBpDecoder::ComputeBitWidth((max_value)); RleBpEncoder rle_encoder(bit_width); - rle_encoder.BeginPrepare(levels[offset]); - for (idx_t i = offset + 1; i < offset + count; i++) { - rle_encoder.PrepareValue(levels[i]); + // have to write to an intermediate stream first because we need to know the size + MemoryStream intermediate_stream(Allocator::DefaultAllocator()); + rle_encoder.BeginWrite(); + for (idx_t i = offset; i < offset + count; i++) { + rle_encoder.WriteValue(intermediate_stream, levels[i]); } - rle_encoder.FinishPrepare(); + rle_encoder.FinishWrite(intermediate_stream); // start off by writing the byte count as a uint32_t - temp_writer.Write(rle_encoder.GetByteCount()); - rle_encoder.BeginWrite(temp_writer, levels[offset]); - for (idx_t i = offset + 1; i < offset + count; i++) { - rle_encoder.WriteValue(temp_writer, levels[i]); - } - rle_encoder.FinishWrite(temp_writer); + temp_writer.Write(NumericCast(intermediate_stream.GetPosition())); + // copy over the written data + temp_writer.WriteData(intermediate_stream.GetData(), intermediate_stream.GetPosition()); } void PrimitiveColumnWriter::NextPage(PrimitiveColumnWriterState &state) { From c8e5916cb4f41dc5c31559af8fc4b1b1c6198e48 Mon Sep 17 00:00:00 
2001 From: Laurens Kuiper Date: Wed, 12 Feb 2025 16:21:16 +0100 Subject: [PATCH 050/142] init primitive dictionary --- extension/parquet/column_writer.cpp | 6 +- .../writer/primitive_column_writer.hpp | 5 +- .../writer/templated_column_writer.hpp | 15 ++- .../writer/primitive_column_writer.cpp | 2 +- .../duckdb/common/primitive_dictionary.hpp | 106 ++++++++++++++++++ 5 files changed, 122 insertions(+), 12 deletions(-) create mode 100644 src/include/duckdb/common/primitive_dictionary.hpp diff --git a/extension/parquet/column_writer.cpp b/extension/parquet/column_writer.cpp index 9693724d2c35..178d2fe91d0b 100644 --- a/extension/parquet/column_writer.cpp +++ b/extension/parquet/column_writer.cpp @@ -209,8 +209,8 @@ void ColumnWriter::HandleDefineLevels(ColumnWriterState &state, ColumnWriterStat // GeoParquet files. class WKBColumnWriterState final : public StandardColumnWriterState { public: - WKBColumnWriterState(ClientContext &context, duckdb_parquet::RowGroup &row_group, idx_t col_idx) - : StandardColumnWriterState(row_group, col_idx), geo_data(), geo_data_writer(context) { + WKBColumnWriterState(ParquetWriter &writer, duckdb_parquet::RowGroup &row_group, idx_t col_idx) + : StandardColumnWriterState(writer, row_group, col_idx), geo_data(), geo_data_writer(writer.GetContext()) { } GeoParquetColumnMetadata geo_data; @@ -228,7 +228,7 @@ class WKBColumnWriter final : public StandardColumnWriter InitializeWriteState(duckdb_parquet::RowGroup &row_group) override { - auto result = make_uniq(context, row_group, row_group.columns.size()); + auto result = make_uniq(writer, row_group, row_group.columns.size()); result->encoding = Encoding::RLE_DICTIONARY; RegisterToRowGroup(row_group); return std::move(result); diff --git a/extension/parquet/include/writer/primitive_column_writer.hpp b/extension/parquet/include/writer/primitive_column_writer.hpp index 0a97064e918a..4e9e55436a6d 100644 --- a/extension/parquet/include/writer/primitive_column_writer.hpp +++ 
b/extension/parquet/include/writer/primitive_column_writer.hpp @@ -36,12 +36,13 @@ struct PageWriteInformation { class PrimitiveColumnWriterState : public ColumnWriterState { public: - PrimitiveColumnWriterState(duckdb_parquet::RowGroup &row_group, idx_t col_idx) - : row_group(row_group), col_idx(col_idx) { + PrimitiveColumnWriterState(ParquetWriter &writer_p, duckdb_parquet::RowGroup &row_group, idx_t col_idx) + : writer(writer_p), row_group(row_group), col_idx(col_idx) { page_info.emplace_back(); } ~PrimitiveColumnWriterState() override = default; + ParquetWriter &writer; duckdb_parquet::RowGroup &row_group; idx_t col_idx; vector page_info; diff --git a/extension/parquet/include/writer/templated_column_writer.hpp b/extension/parquet/include/writer/templated_column_writer.hpp index d9dbbc4bb4c4..5adfbd832e94 100644 --- a/extension/parquet/include/writer/templated_column_writer.hpp +++ b/extension/parquet/include/writer/templated_column_writer.hpp @@ -13,6 +13,7 @@ #include "parquet_dbp_encoder.hpp" #include "parquet_dlba_encoder.hpp" #include "parquet_rle_bp_encoder.hpp" +#include "duckdb/common/primitive_dictionary.hpp" namespace duckdb { @@ -34,8 +35,10 @@ static void TemplatedWritePlain(Vector &col, ColumnWriterStatistics *stats, cons template class StandardColumnWriterState : public PrimitiveColumnWriterState { public: - StandardColumnWriterState(duckdb_parquet::RowGroup &row_group, idx_t col_idx) - : PrimitiveColumnWriterState(row_group, col_idx) { + StandardColumnWriterState(ParquetWriter &writer, duckdb_parquet::RowGroup &row_group, idx_t col_idx) + : PrimitiveColumnWriterState(writer, row_group, col_idx), + dictionary(BufferAllocator::Get(writer.GetContext()), writer.DictionarySizeLimit(), + 2e6) { // TODO: make size configurable } ~StandardColumnWriterState() override = default; @@ -44,7 +47,7 @@ class StandardColumnWriterState : public PrimitiveColumnWriterState { idx_t total_string_size = 0; uint32_t key_bit_width = 0; - unordered_map dictionary; + 
PrimitiveDictionary dictionary; duckdb_parquet::Encoding::type encoding; }; @@ -53,7 +56,7 @@ class StandardWriterPageState : public ColumnWriterPageState { public: explicit StandardWriterPageState(const idx_t total_value_count, const idx_t total_string_size, duckdb_parquet::Encoding::type encoding_p, - const unordered_map &dictionary_p) + const PrimitiveDictionary &dictionary_p) : encoding(encoding_p), dbp_initialized(false), dbp_encoder(total_value_count), dlba_initialized(false), dlba_encoder(total_value_count, total_string_size), bss_encoder(total_value_count, sizeof(TGT)), dictionary(dictionary_p), dict_written_value(false), @@ -69,7 +72,7 @@ class StandardWriterPageState : public ColumnWriterPageState { BssEncoder bss_encoder; - const unordered_map &dictionary; + const PrimitiveDictionary &dictionary; bool dict_written_value; uint32_t dict_bit_width; RleBpEncoder dict_encoder; @@ -86,7 +89,7 @@ class StandardColumnWriter : public PrimitiveColumnWriter { public: unique_ptr InitializeWriteState(duckdb_parquet::RowGroup &row_group) override { - auto result = make_uniq>(row_group, row_group.columns.size()); + auto result = make_uniq>(writer, row_group, row_group.columns.size()); result->encoding = duckdb_parquet::Encoding::RLE_DICTIONARY; RegisterToRowGroup(row_group); return std::move(result); diff --git a/extension/parquet/writer/primitive_column_writer.cpp b/extension/parquet/writer/primitive_column_writer.cpp index d69504717365..675379873809 100644 --- a/extension/parquet/writer/primitive_column_writer.cpp +++ b/extension/parquet/writer/primitive_column_writer.cpp @@ -13,7 +13,7 @@ PrimitiveColumnWriter::PrimitiveColumnWriter(ParquetWriter &writer, idx_t schema } unique_ptr PrimitiveColumnWriter::InitializeWriteState(duckdb_parquet::RowGroup &row_group) { - auto result = make_uniq(row_group, row_group.columns.size()); + auto result = make_uniq(writer, row_group, row_group.columns.size()); RegisterToRowGroup(row_group); return std::move(result); } diff --git 
a/src/include/duckdb/common/primitive_dictionary.hpp b/src/include/duckdb/common/primitive_dictionary.hpp new file mode 100644 index 000000000000..db77cad84410 --- /dev/null +++ b/src/include/duckdb/common/primitive_dictionary.hpp @@ -0,0 +1,106 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// duckdb/common/primitive_dictionary.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "duckdb/common/types/string_type.hpp" +#include "duckdb/common/allocator.hpp" + +namespace duckdb { + +template +class PrimitiveDictionary { +private: + static constexpr uint32_t INVALID_OFFSET = static_cast(-1); + + struct primitive_dictionary_entry_t { + T value; + uint32_t offset; + }; + +public: + PrimitiveDictionary(Allocator &allocator, idx_t maximum_size_p, idx_t plain_capacity_p) + : maximum_size(maximum_size_p), size(0), capacity(NextPowerOfTwo(maximum_size * 2)), + capacity_mask(capacity - 1), plain_capacity(plain_capacity_p), plain_offset(0), + allocated_dictionary(allocator.Allocate(capacity * sizeof(primitive_dictionary_entry_t))), + allocated_plain(allocator.Allocate(std::is_same::value ? 
plain_capacity : capacity * sizeof(T))), + dictionary(reinterpret_cast(allocated_dictionary.get())), + plain(allocated_plain.get()) { + // Initialize empty + for (idx_t i = 0; i < capacity; i++) { + dictionary[i].offset = INVALID_OFFSET; + } + } + +public: + bool Insert(T value, uint32_t &offset) { + auto &entry = Lookup(value); + bool success = size < capacity; + if (entry.offset == INVALID_OFFSET) { + success &= AddToPlain(value); + entry.value = value; + entry.offset = size++; + } + offset = entry.offset; + return success; + } + + uint32_t GetOffset(const T &value) const { + return Lookup(value).offset; + } + +private: + primitive_dictionary_entry_t &Lookup(const T &value) const { + return dictionary[Hash(value) & capacity_mask]; + } + + bool AddToPlain(const T &value) { + static_cast(plain)[plain_offset++] = value; + return true; + } + + bool AddToPlain(string_t &value) { + if (plain_offset + sizeof(uint32_t) + value.GetSize() > plain_capacity) { + return false; // Out of capacity + } + + // Store string length and increment offset + Store(UnsafeNumericCast(value.GetSize()), plain + plain_offset); + plain_offset += sizeof(uint32_t); + + // Copy over string data to plain, update "value" to point to it, and increment offset + memcpy(plain + plain_offset, value.GetData(), value.GetSize()); + value = string_t(char_ptr_cast(plain + plain_offset), value.GetSize()); + plain_offset += value.GetSize(); + + return true; + } + +private: + //! Maximum size and current size + const idx_t maximum_size; + idx_t size; + + //! Capacity (power of two) and corresponding mask + const idx_t capacity; + const idx_t capacity_mask; + + //! Capacity/offset of plain encoded data + const idx_t plain_capacity; + idx_t plain_offset; + + //! Allocated regions for dictionary/plain + AllocatedData allocated_dictionary; + AllocatedData allocated_plain; + + //! 
Pointers to allocated regions for convenience + primitive_dictionary_entry_t *const dictionary; + data_ptr_t const plain; +}; + +} // namespace duckdb From 1e5d01dee7555f01ce403d576a06d08c893a6802 Mon Sep 17 00:00:00 2001 From: Tishj Date: Wed, 12 Feb 2025 17:34:55 +0100 Subject: [PATCH 051/142] support unnamed structs to appear in the other casts (MAP KEY+VALUE, (named)STRUCT VALUE, LIST VALUE) --- src/function/cast/vector_cast_helpers.cpp | 26 +++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index f00bb8157249..0600eb1de4ce 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -273,6 +273,16 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { return false; } end_pos = pos; + } else if (buf[pos] == '(') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + //! Start of an (unnamed) STRUCT + idx_t struct_lvl = 0; + if (!SkipToClose(input_state, struct_lvl, ')')) { + return false; + } + end_pos = pos; } else if ((buf[pos] == ',' || buf[pos] == ']')) { if (buf[pos] != ']' || start_pos.IsValid() || seen_value) { if (!start_pos.IsValid()) { @@ -411,6 +421,14 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { return false; } end_pos = pos; + } else if (buf[pos] == '(') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!SkipToClose(input_state, lvl, ')')) { + return false; + } + end_pos = pos; } else if (buf[pos] == '[') { if (!start_pos.IsValid()) { start_pos = pos; @@ -474,6 +492,14 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { return false; } end_pos = pos; + } else if (buf[pos] == '(') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!SkipToClose(input_state, lvl, ')')) { + return false; + } + end_pos = pos; } else if (buf[pos] == '[') { if (!start_pos.IsValid()) { start_pos = pos; From 
7642be6e9a8b18e0989ae98fff831dcd224c808e Mon Sep 17 00:00:00 2001 From: Richard Wesley Date: Wed, 12 Feb 2025 11:04:59 -0800 Subject: [PATCH 052/142] Issue #8265: AsOf Nested Loop * Set default asof_loop_join_threshold based on tuning tests. --- src/include/duckdb/main/client_config.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/duckdb/main/client_config.hpp b/src/include/duckdb/main/client_config.hpp index 4e73f8f922b8..4398a788585c 100644 --- a/src/include/duckdb/main/client_config.hpp +++ b/src/include/duckdb/main/client_config.hpp @@ -102,7 +102,7 @@ struct ClientConfig { //! The number of rows we need on either table to choose a merge join over an IE join idx_t merge_join_threshold = 1000; //! The maximum number of rows to use the nested loop join implementation - idx_t asof_loop_join_threshold = 2048; + idx_t asof_loop_join_threshold = 64; //! The maximum amount of memory to keep buffered in a streaming query result. Default: 1mb. idx_t streaming_buffer_size = 1000000; From 86292911e02543a6b81cd89e059b89b9c4394ddf Mon Sep 17 00:00:00 2001 From: pdet Date: Wed, 12 Feb 2025 17:46:06 -0300 Subject: [PATCH 053/142] Also use destroy_statement --- src/common/adbc/adbc.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/common/adbc/adbc.cpp b/src/common/adbc/adbc.cpp index 95cc90639d7a..b83bcbc67585 100644 --- a/src/common/adbc/adbc.cpp +++ b/src/common/adbc/adbc.cpp @@ -875,7 +875,7 @@ AdbcStatusCode StatementSetSqlQuery(struct AdbcStatement *statement, const char duckdb_destroy_prepare(&wrapper->statement); wrapper->statement = nullptr; } - duckdb_extracted_statements extracted_statements; + duckdb_extracted_statements extracted_statements = nullptr; auto extract_statements_size = duckdb_extract_statements(wrapper->connection, query, &extracted_statements); auto error_msg_extract_statements = duckdb_extract_statements_error(extracted_statements); if (error_msg_extract_statements != nullptr) { @@ 
-886,7 +886,7 @@ AdbcStatusCode StatementSetSqlQuery(struct AdbcStatement *statement, const char } // Now lets loop over the statements, and execute every one for (idx_t i = 0; i < extract_statements_size - 1; i++) { - duckdb_prepared_statement statement_internal; + duckdb_prepared_statement statement_internal = nullptr; auto res = duckdb_prepare_extracted_statement(wrapper->connection, extracted_statements, i, &statement_internal); auto error_msg = duckdb_prepare_error(statement_internal); @@ -894,7 +894,7 @@ AdbcStatusCode StatementSetSqlQuery(struct AdbcStatement *statement, const char if (adbc_status != ADBC_STATUS_OK) { // Things went wrong when executing internal prepared statement duckdb_destroy_extracted(&extracted_statements); - delete statement_internal; + duckdb_destroy_prepare(&statement_internal); return adbc_status; } // Execute @@ -903,12 +903,12 @@ AdbcStatusCode StatementSetSqlQuery(struct AdbcStatement *statement, const char if (res != DuckDBSuccess) { SetError(error, duckdb_query_arrow_error(out_result)); delete out_result; - delete statement_internal; + duckdb_destroy_prepare(&statement_internal); duckdb_destroy_extracted(&extracted_statements); return ADBC_STATUS_INVALID_ARGUMENT; } delete out_result; - delete statement_internal; + duckdb_destroy_prepare(&statement_internal); } // Besides ze last, this one we return auto res = duckdb_prepare_extracted_statement(wrapper->connection, extracted_statements, From 5649f9141c88a2139300025955fd329a76eeebf4 Mon Sep 17 00:00:00 2001 From: Richard Wesley <13156216+hawkfish@users.noreply.github.com> Date: Wed, 12 Feb 2025 15:34:47 -0800 Subject: [PATCH 054/142] Issue #8265: AsOf Nested Loop * Fix ASCII art final baskslash silliness with an NBS... 
--- src/execution/physical_plan/plan_asof_join.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/execution/physical_plan/plan_asof_join.cpp b/src/execution/physical_plan/plan_asof_join.cpp index aa2df50d6313..10fc0d47d059 100644 --- a/src/execution/physical_plan/plan_asof_join.cpp +++ b/src/execution/physical_plan/plan_asof_join.cpp @@ -31,7 +31,7 @@ static unique_ptr PlanAsOfLoopJoin(LogicalComparisonJoin &op, // ∠*,inequality // | // ⨠swapped - // / \ + // / \  // B W pk:row_number // | // P From de5a838119946e42ab773950218ab79afbe61966 Mon Sep 17 00:00:00 2001 From: Richard Wesley <13156216+hawkfish@users.noreply.github.com> Date: Wed, 12 Feb 2025 18:38:03 -0800 Subject: [PATCH 055/142] Issue #8265: AsOf Nested Loop * Make test deterministic --- test/sql/join/asof/test_asof_join_pushdown.test | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/sql/join/asof/test_asof_join_pushdown.test b/test/sql/join/asof/test_asof_join_pushdown.test index 465b407ad850..8a55a0ebd400 100644 --- a/test/sql/join/asof/test_asof_join_pushdown.test +++ b/test/sql/join/asof/test_asof_join_pushdown.test @@ -63,7 +63,8 @@ FROM right_pushdown d1 ASOF LEFT JOIN ( SELECT * FROM right_pushdown WHERE value is not NULL ) d2 - ON d1.time >= d2.time; + ON d1.time >= d2.time +ORDER BY ALL; ---- 0 0 0.0 0.0 1 0 NULL 0.0 From e136bc76034e84f0788a39edc8985c2e74f8ab35 Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Thu, 13 Feb 2025 11:51:43 +0100 Subject: [PATCH 056/142] integrate PrimitiveDictionary into Parquet writer and improve writing levels --- extension/parquet/column_writer.cpp | 68 +++++------- .../include/parquet_rle_bp_encoder.hpp | 15 ++- .../writer/primitive_column_writer.hpp | 2 +- .../writer/templated_column_writer.hpp | 50 +++------ .../writer/primitive_column_writer.cpp | 15 ++- src/common/serializer/memory_stream.cpp | 4 + src/common/types/hash.cpp | 5 + .../duckdb/common/primitive_dictionary.hpp | 104 ++++++++++++++---- 
.../common/serializer/memory_stream.hpp | 41 +++---- src/include/duckdb/common/types/datetime.hpp | 8 ++ src/include/duckdb/common/types/hash.hpp | 3 + 11 files changed, 195 insertions(+), 120 deletions(-) diff --git a/extension/parquet/column_writer.cpp b/extension/parquet/column_writer.cpp index 178d2fe91d0b..4841aca02355 100644 --- a/extension/parquet/column_writer.cpp +++ b/extension/parquet/column_writer.cpp @@ -191,10 +191,14 @@ void ColumnWriter::HandleDefineLevels(ColumnWriterState &state, ColumnWriterStat } } else { // no parent: set definition levels only from this validity mask - for (idx_t i = 0; i < count; i++) { - const auto is_null = !validity.RowIsValid(i); - state.definition_levels.emplace_back(is_null ? null_value : define_value); - state.null_count += is_null; + if (validity.AllValid()) { + state.definition_levels.insert(state.definition_levels.end(), count, define_value); + } else { + for (idx_t i = 0; i < count; i++) { + const auto is_null = !validity.RowIsValid(i); + state.definition_levels.emplace_back(is_null ? 
null_value : define_value); + state.null_count += is_null; + } } if (!can_have_nulls && state.null_count != 0) { throw IOException("Parquet writer: map key column is not allowed to contain NULL values"); @@ -219,10 +223,10 @@ class WKBColumnWriterState final : public StandardColumnWriterState { class WKBColumnWriter final : public StandardColumnWriter { public: - WKBColumnWriter(ClientContext &context_p, ParquetWriter &writer, idx_t schema_idx, vector schema_path_p, - idx_t max_repeat, idx_t max_define, bool can_have_nulls, string name) + WKBColumnWriter(ParquetWriter &writer, idx_t schema_idx, vector schema_path_p, idx_t max_repeat, + idx_t max_define, bool can_have_nulls, string name) : StandardColumnWriter(writer, schema_idx, std::move(schema_path_p), max_repeat, max_define, can_have_nulls), - column_name(std::move(name)), context(context_p) { + column_name(std::move(name)) { this->writer.GetGeoParquetData().RegisterGeometryColumn(column_name); } @@ -253,7 +257,6 @@ class WKBColumnWriter final : public StandardColumnWriter ColumnWriter::CreateWriterRecursive(ClientContext &cont schema_path.push_back(name); if (type.id() == LogicalTypeId::BLOB && type.GetAlias() == "WKB_BLOB" && GeoParquetFileMetadata::IsGeoParquetConversionEnabled(context)) { - return make_uniq(context, writer, schema_idx, std::move(schema_path), max_repeat, max_define, + return make_uniq(writer, schema_idx, std::move(schema_path), max_repeat, max_define, can_have_nulls, name); } @@ -584,41 +587,30 @@ struct NumericLimits { } }; -} // namespace duckdb - -namespace std { template <> -struct hash { - size_t operator()(const duckdb::ParquetIntervalTargetType &val) const { - return duckdb::Hash(duckdb::const_char_ptr_cast(val.bytes), - duckdb::ParquetIntervalTargetType::PARQUET_INTERVAL_SIZE); - } -}; +hash_t Hash(ParquetIntervalTargetType val) { + return Hash(const_char_ptr_cast(val.bytes), ParquetIntervalTargetType::PARQUET_INTERVAL_SIZE); +} template <> -struct hash { - size_t operator()(const 
duckdb::ParquetUUIDTargetType &val) const { - return duckdb::Hash(duckdb::const_char_ptr_cast(val.bytes), duckdb::ParquetUUIDTargetType::PARQUET_UUID_SIZE); - } -}; +hash_t Hash(ParquetUUIDTargetType val) { + return Hash(const_char_ptr_cast(val.bytes), ParquetUUIDTargetType::PARQUET_UUID_SIZE); +} template <> -struct hash { - size_t operator()(const duckdb::float_na_equal &val) const { - if (std::isnan(val.val)) { - return duckdb::Hash(std::numeric_limits::quiet_NaN()); - } - return duckdb::Hash(val.val); +hash_t Hash(float_na_equal val) { + if (std::isnan(val.val)) { + return Hash(std::numeric_limits::quiet_NaN()); } -}; + return Hash(val.val); +} template <> -struct hash { - inline size_t operator()(const duckdb::double_na_equal &val) const { - if (std::isnan(val.val)) { - return duckdb::Hash(std::numeric_limits::quiet_NaN()); - } - return duckdb::Hash(val.val); +hash_t Hash(double_na_equal val) { + if (std::isnan(val.val)) { + return Hash(std::numeric_limits::quiet_NaN()); } -}; -} // namespace std + return Hash(val.val); +} + +} // namespace duckdb diff --git a/extension/parquet/include/parquet_rle_bp_encoder.hpp b/extension/parquet/include/parquet_rle_bp_encoder.hpp index 689d0fee132c..b0fd130a33f4 100644 --- a/extension/parquet/include/parquet_rle_bp_encoder.hpp +++ b/extension/parquet/include/parquet_rle_bp_encoder.hpp @@ -8,9 +8,7 @@ #pragma once -#include "parquet_types.h" -#include "thrift_tools.hpp" -#include "resizable_buffer.hpp" +#include "decode_utils.hpp" namespace duckdb { @@ -25,7 +23,7 @@ class RleBpEncoder { bp_block_count = 0; } - void WriteValue(WriteStream &writer, uint32_t value) { + void WriteValue(WriteStream &writer, const uint32_t &value) { if (bp_block_count != 0) { // We already committed to a BP run D_ASSERT(rle_count == 0); @@ -68,6 +66,15 @@ class RleBpEncoder { rle_count = 0; } + void WriteMany(WriteStream &writer, uint32_t value, idx_t count) { + D_ASSERT(bp_block_count == 0); + if (rle_count != 0) { + WriteRun(writer); + } + 
rle_value = value; + rle_count = count; + } + void FinishWrite(WriteStream &writer) { WriteRun(writer); } diff --git a/extension/parquet/include/writer/primitive_column_writer.hpp b/extension/parquet/include/writer/primitive_column_writer.hpp index 4e9e55436a6d..ccaa02f79503 100644 --- a/extension/parquet/include/writer/primitive_column_writer.hpp +++ b/extension/parquet/include/writer/primitive_column_writer.hpp @@ -82,7 +82,7 @@ class PrimitiveColumnWriter : public ColumnWriter { protected: static void WriteLevels(WriteStream &temp_writer, const unsafe_vector &levels, idx_t max_value, - idx_t start_offset, idx_t count); + idx_t start_offset, idx_t count, optional_idx null_count = optional_idx()); virtual duckdb_parquet::Encoding::type GetEncoding(PrimitiveColumnWriterState &state); diff --git a/extension/parquet/include/writer/templated_column_writer.hpp b/extension/parquet/include/writer/templated_column_writer.hpp index 5adfbd832e94..ca3b3ca8a804 100644 --- a/extension/parquet/include/writer/templated_column_writer.hpp +++ b/extension/parquet/include/writer/templated_column_writer.hpp @@ -38,7 +38,8 @@ class StandardColumnWriterState : public PrimitiveColumnWriterState { StandardColumnWriterState(ParquetWriter &writer, duckdb_parquet::RowGroup &row_group, idx_t col_idx) : PrimitiveColumnWriterState(writer, row_group, col_idx), dictionary(BufferAllocator::Get(writer.GetContext()), writer.DictionarySizeLimit(), - 2e6) { // TODO: make size configurable + 2097152), // TODO: make size configurable + encoding(duckdb_parquet::Encoding::PLAIN) { } ~StandardColumnWriterState() override = default; @@ -60,7 +61,7 @@ class StandardWriterPageState : public ColumnWriterPageState { : encoding(encoding_p), dbp_initialized(false), dbp_encoder(total_value_count), dlba_initialized(false), dlba_encoder(total_value_count, total_string_size), bss_encoder(total_value_count, sizeof(TGT)), dictionary(dictionary_p), dict_written_value(false), - 
dict_bit_width(RleBpDecoder::ComputeBitWidth(dictionary.size())), dict_encoder(dict_bit_width) { + dict_bit_width(RleBpDecoder::ComputeBitWidth(dictionary.GetSize())), dict_encoder(dict_bit_width) { } duckdb_parquet::Encoding::type encoding; @@ -152,7 +153,6 @@ class StandardColumnWriter : public PrimitiveColumnWriter { auto data_ptr = FlatVector::GetData(vector); idx_t vector_index = 0; - uint32_t new_value_index = state.dictionary.size(); const bool check_parent_empty = parent && !parent->is_empty.empty(); const idx_t parent_index = state.definition_levels.size(); @@ -168,12 +168,7 @@ class StandardColumnWriter : public PrimitiveColumnWriter { } if (validity.RowIsValid(vector_index)) { const auto &src_value = data_ptr[vector_index]; - if (state.dictionary.size() <= writer.DictionarySizeLimit()) { - if (state.dictionary.find(src_value) == state.dictionary.end()) { - state.dictionary[src_value] = new_value_index; - new_value_index++; - } - } + state.dictionary.Insert(src_value); state.total_value_count++; state.total_string_size += dlba_encoder::GetDlbaStringSize(src_value); } @@ -185,7 +180,7 @@ class StandardColumnWriter : public PrimitiveColumnWriter { const auto type = writer.GetType(schema_idx); auto &state = state_p.Cast>(); - if (state.dictionary.size() == 0 || state.dictionary.size() > writer.DictionarySizeLimit()) { + if (state.dictionary.GetSize() == 0 || state.dictionary.IsFull()) { if (writer.GetParquetVersion() == ParquetVersion::V1) { // Can't do the cool stuff for V1 state.encoding = duckdb_parquet::Encoding::PLAIN; @@ -207,9 +202,8 @@ class StandardColumnWriter : public PrimitiveColumnWriter { state.encoding = duckdb_parquet::Encoding::PLAIN; } } - state.dictionary.clear(); } else { - state.key_bit_width = RleBpDecoder::ComputeBitWidth(state.dictionary.size()); + state.key_bit_width = RleBpDecoder::ComputeBitWidth(state.dictionary.GetSize()); } } @@ -224,7 +218,7 @@ class StandardColumnWriter : public PrimitiveColumnWriter { idx_t 
DictionarySize(PrimitiveColumnWriterState &state_p) override { auto &state = state_p.Cast>(); - return state.dictionary.size(); + return state.dictionary.GetSize(); } void WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats, ColumnWriterPageState *page_state_p, @@ -240,14 +234,14 @@ class StandardColumnWriter : public PrimitiveColumnWriter { if (!mask.RowIsValid(r)) { continue; } - auto &src_val = data_ptr[r]; - auto value_index = page_state.dictionary.at(src_val); if (!page_state.dict_written_value) { // first value: write the bit-width as a one-byte entry and initialize writer temp_writer.Write(page_state.dict_bit_width); page_state.dict_encoder.BeginWrite(); page_state.dict_written_value = true; } + const auto &src_value = data_ptr[r]; + const auto value_index = page_state.dictionary.GetIndex(src_value); page_state.dict_encoder.WriteValue(temp_writer, value_index); } break; @@ -329,34 +323,22 @@ class StandardColumnWriter : public PrimitiveColumnWriter { void FlushDictionary(PrimitiveColumnWriterState &state_p, ColumnWriterStatistics *stats) override { auto &state = state_p.Cast>(); - D_ASSERT(state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY); - // first we need to sort the values in index order - auto values = vector(state.dictionary.size()); - for (const auto &entry : state.dictionary) { - values[entry.second] = entry.first; - } - state.bloom_filter = - make_uniq(state.dictionary.size(), writer.BloomFilterFalsePositiveRatio()); - - // first write the contents of the dictionary page to a temporary buffer - auto temp_writer = make_uniq( - Allocator::Get(writer.GetContext()), MaxValue(NextPowerOfTwo(state.dictionary.size() * sizeof(TGT)), - MemoryStream::DEFAULT_INITIAL_CAPACITY)); - for (idx_t r = 0; r < values.size(); r++) { - const TGT target_value = OP::template Operation(values[r]); + make_uniq(state.dictionary.GetSize(), writer.BloomFilterFalsePositiveRatio()); + + state.dictionary.IterateValues([&](const SRC &value) { + const TGT 
target_value = OP::template Operation(value); // update the statistics OP::template HandleStats(stats, target_value); // update the bloom filter auto hash = OP::template XXHash64(target_value); state.bloom_filter->FilterInsert(hash); - // actually write the dictionary value - OP::template WriteToStream(target_value, *temp_writer); - } + }); + // flush the dictionary page and add it to the to-be-written pages - WriteDictionary(state, std::move(temp_writer), values.size()); + WriteDictionary(state, state.dictionary.GetPlainMemoryStream(), state.dictionary.GetSize()); // bloom filter will be queued for writing in ParquetWriter::BufferBloomFilter one level up } diff --git a/extension/parquet/writer/primitive_column_writer.cpp b/extension/parquet/writer/primitive_column_writer.cpp index 675379873809..589f94ab316f 100644 --- a/extension/parquet/writer/primitive_column_writer.cpp +++ b/extension/parquet/writer/primitive_column_writer.cpp @@ -117,7 +117,7 @@ void PrimitiveColumnWriter::BeginWrite(ColumnWriterState &state_p) { } void PrimitiveColumnWriter::WriteLevels(WriteStream &temp_writer, const unsafe_vector &levels, - idx_t max_value, idx_t offset, idx_t count) { + idx_t max_value, idx_t offset, idx_t count, optional_idx null_count) { if (levels.empty() || count == 0) { return; } @@ -128,9 +128,15 @@ void PrimitiveColumnWriter::WriteLevels(WriteStream &temp_writer, const unsafe_v // have to write to an intermediate stream first because we need to know the size MemoryStream intermediate_stream(Allocator::DefaultAllocator()); + rle_encoder.BeginWrite(); - for (idx_t i = offset; i < offset + count; i++) { - rle_encoder.WriteValue(intermediate_stream, levels[i]); + if (null_count.IsValid() && null_count.GetIndex() == 0 || null_count.GetIndex() == count) { + // All are NULL or none are NULL + rle_encoder.WriteMany(intermediate_stream, levels[0], count); + } else { + for (idx_t i = offset; i < offset + count; i++) { + rle_encoder.WriteValue(intermediate_stream, levels[i]); 
+ } } rle_encoder.FinishWrite(intermediate_stream); @@ -159,7 +165,8 @@ void PrimitiveColumnWriter::NextPage(PrimitiveColumnWriterState &state) { WriteLevels(temp_writer, state.repetition_levels, max_repeat, page_info.offset, page_info.row_count); // write the definition levels - WriteLevels(temp_writer, state.definition_levels, max_define, page_info.offset, page_info.row_count); + WriteLevels(temp_writer, state.definition_levels, max_define, page_info.offset, page_info.row_count, + state.null_count); } void PrimitiveColumnWriter::FlushPage(PrimitiveColumnWriterState &state) { diff --git a/src/common/serializer/memory_stream.cpp b/src/common/serializer/memory_stream.cpp index 92419b8d04c8..d608392e8cf7 100644 --- a/src/common/serializer/memory_stream.cpp +++ b/src/common/serializer/memory_stream.cpp @@ -102,4 +102,8 @@ idx_t MemoryStream::GetCapacity() const { return capacity; } +void MemoryStream::SetPosition(idx_t position_p) { + position = position_p; +} + } // namespace duckdb diff --git a/src/common/types/hash.cpp b/src/common/types/hash.cpp index 160d5f3c2924..f54295af4656 100644 --- a/src/common/types/hash.cpp +++ b/src/common/types/hash.cpp @@ -66,6 +66,11 @@ hash_t Hash(interval_t val) { return Hash(days) ^ Hash(months) ^ Hash(micros); } +template <> +hash_t Hash(dtime_tz_t val) { + return Hash(val.bits); +} + template <> hash_t Hash(const char *str) { return Hash(str, strlen(str)); diff --git a/src/include/duckdb/common/primitive_dictionary.hpp b/src/include/duckdb/common/primitive_dictionary.hpp index db77cad84410..0976882a2c8d 100644 --- a/src/include/duckdb/common/primitive_dictionary.hpp +++ b/src/include/duckdb/common/primitive_dictionary.hpp @@ -10,72 +10,132 @@ #include "duckdb/common/types/string_type.hpp" #include "duckdb/common/allocator.hpp" +#include "duckdb/common/serializer/memory_stream.hpp" namespace duckdb { template class PrimitiveDictionary { private: - static constexpr uint32_t INVALID_OFFSET = static_cast(-1); + static constexpr idx_t 
LOAD_FACTOR = 2; + static constexpr uint32_t INVALID_INDEX = static_cast(-1); struct primitive_dictionary_entry_t { T value; - uint32_t offset; + uint32_t index; + bool IsEmpty() const { + return index == INVALID_INDEX; + } }; public: + //! PrimitiveDictionary is a fixed-size linear probing hash table for primitive types + //! It is used to dictionary-encode data in, e.g., Parquet files PrimitiveDictionary(Allocator &allocator, idx_t maximum_size_p, idx_t plain_capacity_p) - : maximum_size(maximum_size_p), size(0), capacity(NextPowerOfTwo(maximum_size * 2)), + : maximum_size(maximum_size_p), size(0), capacity(NextPowerOfTwo(maximum_size * LOAD_FACTOR)), capacity_mask(capacity - 1), plain_capacity(plain_capacity_p), plain_offset(0), allocated_dictionary(allocator.Allocate(capacity * sizeof(primitive_dictionary_entry_t))), allocated_plain(allocator.Allocate(std::is_same::value ? plain_capacity : capacity * sizeof(T))), dictionary(reinterpret_cast(allocated_dictionary.get())), - plain(allocated_plain.get()) { + plain(reinterpret_cast(allocated_plain.get())), plain_raw(allocated_plain.get()), full(false) { // Initialize empty for (idx_t i = 0; i < capacity; i++) { - dictionary[i].offset = INVALID_OFFSET; + dictionary[i].index = INVALID_INDEX; } } public: - bool Insert(T value, uint32_t &offset) { + //! Insert value into dictionary (if not full) + void Insert(T value) { + if (full | (size == capacity)) { + full = true; + return; + } auto &entry = Lookup(value); - bool success = size < capacity; - if (entry.offset == INVALID_OFFSET) { - success &= AddToPlain(value); + if (entry.IsEmpty()) { + if (!AddToPlain(value)) { + full = true; + return; + } entry.value = value; - entry.offset = size++; + entry.index = size++; } - offset = entry.offset; - return success; } - uint32_t GetOffset(const T &value) const { - return Lookup(value).offset; + //! 
Get dictionary index of an already inserted value + uint32_t GetIndex(const T &value) const { + const auto &entry = Lookup(value); + D_ASSERT(!entry.IsEmpty()); + return entry.index; + } + + //! Iterates over inserted values + template ::value, int>::type = 0> + void IterateValues(const std::function &fun) const { + for (idx_t i = 0; i < size; i++) { + fun(plain[i]); + } + } + + //! Specialized template to iterate over string_t values + template ::value, int>::type = 0> + void IterateValues(const std::function &fun) const { + for (idx_t i = 0; i < capacity; i++) { + auto &entry = dictionary[i]; + if (entry.IsEmpty()) { + continue; + } + fun(entry.value); + } + } + + //! Get the number of unique values in the dictionary + idx_t GetSize() const { + return size; + } + + //! If any of the inserts caused the dictionary to be full, this returns true + bool IsFull() const { + return full; + } + + //! Get the plain written values as a memory stream (zero-copy) + unique_ptr GetPlainMemoryStream() const { + auto result = make_uniq(plain_raw, plain_capacity); + result->SetPosition(plain_offset); + return result; } private: + //! Looks up a value in the dictionary using linear probing primitive_dictionary_entry_t &Lookup(const T &value) const { - return dictionary[Hash(value) & capacity_mask]; + auto offset = Hash(value) & capacity_mask; + while (!dictionary[offset].IsEmpty() && dictionary[offset].value != value) { + ++offset &= capacity_mask; + } + return dictionary[offset]; } + //! Writes a value to the plain data bool AddToPlain(const T &value) { - static_cast(plain)[plain_offset++] = value; + plain[size] = value; + plain_offset += sizeof(T); return true; } + //! 
Specialized template to add a string_t value to the plain data bool AddToPlain(string_t &value) { if (plain_offset + sizeof(uint32_t) + value.GetSize() > plain_capacity) { return false; // Out of capacity } // Store string length and increment offset - Store(UnsafeNumericCast(value.GetSize()), plain + plain_offset); + Store(UnsafeNumericCast(value.GetSize()), plain_raw + plain_offset); plain_offset += sizeof(uint32_t); // Copy over string data to plain, update "value" to point to it, and increment offset - memcpy(plain + plain_offset, value.GetData(), value.GetSize()); - value = string_t(char_ptr_cast(plain + plain_offset), value.GetSize()); + memcpy(plain_raw + plain_offset, value.GetData(), value.GetSize()); + value = string_t(char_ptr_cast(plain_raw + plain_offset), value.GetSize()); plain_offset += value.GetSize(); return true; @@ -100,7 +160,11 @@ class PrimitiveDictionary { //! Pointers to allocated regions for convenience primitive_dictionary_entry_t *const dictionary; - data_ptr_t const plain; + T *const plain; + data_ptr_t const plain_raw; + + //! More values inserted than possible + bool full; }; } // namespace duckdb diff --git a/src/include/duckdb/common/serializer/memory_stream.hpp b/src/include/duckdb/common/serializer/memory_stream.hpp index a735ad1aa90e..f5cd1c153049 100644 --- a/src/include/duckdb/common/serializer/memory_stream.hpp +++ b/src/include/duckdb/common/serializer/memory_stream.hpp @@ -25,20 +25,20 @@ class MemoryStream : public WriteStream, public ReadStream { public: static constexpr idx_t DEFAULT_INITIAL_CAPACITY = 512; - // Create a new owning MemoryStream with an internal backing buffer with the specified capacity. The stream will - // own the backing buffer, resize it when needed and free its memory when the stream is destroyed + //! Create a new owning MemoryStream with an internal backing buffer with the specified capacity. The stream will + //! 
own the backing buffer, resize it when needed and free its memory when the stream is destroyed explicit MemoryStream(Allocator &allocator, idx_t capacity = DEFAULT_INITIAL_CAPACITY); - // Create a new owning MemoryStream with an internal backing buffer with the specified capacity. The stream will - // own the backing buffer, resize it when needed and free its memory when the stream is destroyed + //! Create a new owning MemoryStream with an internal backing buffer with the specified capacity. The stream will + //! own the backing buffer, resize it when needed and free its memory when the stream is destroyed explicit MemoryStream(idx_t capacity = DEFAULT_INITIAL_CAPACITY); - // Create a new non-owning MemoryStream over the specified external buffer and capacity. The stream will not take - // ownership of the backing buffer, will not attempt to resize it and will not free the memory when the stream - // is destroyed + //! Create a new non-owning MemoryStream over the specified external buffer and capacity. The stream will not take + //! ownership of the backing buffer, will not attempt to resize it and will not free the memory when the stream + //! is destroyed explicit MemoryStream(data_ptr_t buffer, idx_t capacity); - // Cant copy! + //! Cant copy! MemoryStream(const MemoryStream &) = delete; MemoryStream &operator=(const MemoryStream &) = delete; @@ -47,30 +47,33 @@ class MemoryStream : public WriteStream, public ReadStream { ~MemoryStream() override; - // Write data to the stream. - // Throws if the write would exceed the capacity of the stream and the backing buffer is not owned by the stream + //! Write data to the stream. + //! Throws if the write would exceed the capacity of the stream and the backing buffer is not owned by the stream void WriteData(const_data_ptr_t buffer, idx_t write_size) override; - // Read data from the stream. - // Throws if the read would exceed the capacity of the stream + //! Read data from the stream. + //! 
Throws if the read would exceed the capacity of the stream void ReadData(data_ptr_t buffer, idx_t read_size) override; - // Rewind the stream to the start, keeping the capacity and the backing buffer intact + //! Rewind the stream to the start, keeping the capacity and the backing buffer intact void Rewind(); - // Release ownership of the backing buffer and turn a owning stream into a non-owning one. - // The stream will no longer be responsible for freeing the data. - // The stream will also no longer attempt to automatically resize the buffer when the capacity is reached. + //! Release ownership of the backing buffer and turn a owning stream into a non-owning one. + //! The stream will no longer be responsible for freeing the data. + //! The stream will also no longer attempt to automatically resize the buffer when the capacity is reached. void Release(); - // Get a pointer to the underlying backing buffer + //! Get a pointer to the underlying backing buffer data_ptr_t GetData() const; - // Get the current position in the stream + //! Get the current position in the stream idx_t GetPosition() const; - // Get the capacity of the stream + //! Get the capacity of the stream idx_t GetCapacity() const; + + //! 
Set the position in the stream + void SetPosition(idx_t position); }; } // namespace duckdb diff --git a/src/include/duckdb/common/types/datetime.hpp b/src/include/duckdb/common/types/datetime.hpp index ccbb95524244..d52edd57379c 100644 --- a/src/include/duckdb/common/types/datetime.hpp +++ b/src/include/duckdb/common/types/datetime.hpp @@ -1,3 +1,11 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// duckdb/common/types/datetime.hpp +// +// +//===----------------------------------------------------------------------===// + #pragma once #include "duckdb/common/common.hpp" diff --git a/src/include/duckdb/common/types/hash.hpp b/src/include/duckdb/common/types/hash.hpp index b213f249e402..128ee634dcd7 100644 --- a/src/include/duckdb/common/types/hash.hpp +++ b/src/include/duckdb/common/types/hash.hpp @@ -10,6 +10,7 @@ #include "duckdb/common/common.hpp" #include "duckdb/common/types.hpp" +#include "duckdb/common/types/datetime.hpp" namespace duckdb { @@ -63,6 +64,8 @@ template <> DUCKDB_API hash_t Hash(string_t val); template <> DUCKDB_API hash_t Hash(interval_t val); +template <> +DUCKDB_API hash_t Hash(dtime_tz_t val); DUCKDB_API hash_t Hash(const char *val, size_t size); DUCKDB_API hash_t Hash(uint8_t *val, size_t size); From 29172987ada96ea87651bd8b493b720b813f1ac6 Mon Sep 17 00:00:00 2001 From: pdet Date: Thu, 13 Feb 2025 08:06:41 -0300 Subject: [PATCH 057/142] more destroy --- src/common/adbc/adbc.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/common/adbc/adbc.cpp b/src/common/adbc/adbc.cpp index b83bcbc67585..17618c66ec55 100644 --- a/src/common/adbc/adbc.cpp +++ b/src/common/adbc/adbc.cpp @@ -892,9 +892,8 @@ AdbcStatusCode StatementSetSqlQuery(struct AdbcStatement *statement, const char auto error_msg = duckdb_prepare_error(statement_internal); auto adbc_status = CheckResult(res, error, error_msg); if (adbc_status != ADBC_STATUS_OK) { - // Things went wrong when 
executing internal prepared statement - duckdb_destroy_extracted(&extracted_statements); duckdb_destroy_prepare(&statement_internal); + duckdb_destroy_extracted(&extracted_statements); return adbc_status; } // Execute @@ -902,15 +901,16 @@ AdbcStatusCode StatementSetSqlQuery(struct AdbcStatement *statement, const char res = duckdb_execute_prepared_arrow(statement_internal, &out_result); if (res != DuckDBSuccess) { SetError(error, duckdb_query_arrow_error(out_result)); - delete out_result; + duckdb_destroy_arrow(&out_result); duckdb_destroy_prepare(&statement_internal); duckdb_destroy_extracted(&extracted_statements); return ADBC_STATUS_INVALID_ARGUMENT; } - delete out_result; + duckdb_destroy_arrow(&out_result); duckdb_destroy_prepare(&statement_internal); } - // Besides ze last, this one we return + + // Final statement (returned to caller) auto res = duckdb_prepare_extracted_statement(wrapper->connection, extracted_statements, extract_statements_size - 1, &wrapper->statement); auto error_msg = duckdb_prepare_error(wrapper->statement); From 61be508a733a7d31ee1c58174dee0317e328fb3f Mon Sep 17 00:00:00 2001 From: Tishj Date: Thu, 13 Feb 2025 12:18:51 +0100 Subject: [PATCH 058/142] simplify SkipToClose --- src/function/cast/vector_cast_helpers.cpp | 41 ++++++++++------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index 0600eb1de4ce..64d9780312f5 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -70,15 +70,15 @@ static bool SkipToCloseQuotes(StringCastInputState &input_state) { return false; } -static bool SkipToClose(StringCastInputState &input_state, idx_t &lvl, char close_bracket) { +static bool SkipToClose(StringCastInputState &input_state, idx_t &lvl) { auto &idx = input_state.pos; auto &buf = input_state.buf; auto &len = input_state.len; auto &escaped = input_state.escaped; - idx++; + + 
D_ASSERT(buf[idx] == '{' || buf[idx] == '[' || buf[idx] == '('); vector brackets; - brackets.push_back(close_bracket); while (idx < len) { if (!escaped) { if (buf[idx] == '"' || buf[idx] == '\'') { @@ -250,8 +250,7 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { start_pos = pos; } //! Start of a LIST - lvl++; - if (!SkipToClose(input_state, lvl, ']')) { + if (!SkipToClose(input_state, lvl)) { return false; } end_pos = pos; @@ -269,7 +268,7 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { } //! Start of a STRUCT idx_t struct_lvl = 0; - if (!SkipToClose(input_state, struct_lvl, '}')) { + if (!SkipToClose(input_state, struct_lvl)) { return false; } end_pos = pos; @@ -279,7 +278,7 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { } //! Start of an (unnamed) STRUCT idx_t struct_lvl = 0; - if (!SkipToClose(input_state, struct_lvl, ')')) { + if (!SkipToClose(input_state, struct_lvl)) { return false; } end_pos = pos; @@ -417,7 +416,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { if (!start_pos.IsValid()) { start_pos = pos; } - if (!SkipToClose(input_state, lvl, '}')) { + if (!SkipToClose(input_state, lvl)) { return false; } end_pos = pos; @@ -425,7 +424,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { if (!start_pos.IsValid()) { start_pos = pos; } - if (!SkipToClose(input_state, lvl, ')')) { + if (!SkipToClose(input_state, lvl)) { return false; } end_pos = pos; @@ -433,8 +432,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { if (!start_pos.IsValid()) { start_pos = pos; } - lvl++; - if (!SkipToClose(input_state, lvl, ']')) { + if (!SkipToClose(input_state, lvl)) { return false; } end_pos = pos; @@ -488,7 +486,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { if (!start_pos.IsValid()) { start_pos = pos; } - if (!SkipToClose(input_state, lvl, '}')) { + if (!SkipToClose(input_state, lvl)) { 
return false; } end_pos = pos; @@ -496,7 +494,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { if (!start_pos.IsValid()) { start_pos = pos; } - if (!SkipToClose(input_state, lvl, ')')) { + if (!SkipToClose(input_state, lvl)) { return false; } end_pos = pos; @@ -504,8 +502,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { if (!start_pos.IsValid()) { start_pos = pos; } - lvl++; - if (!SkipToClose(input_state, lvl, ']')) { + if (!SkipToClose(input_state, lvl)) { return false; } end_pos = pos; @@ -662,7 +659,7 @@ bool VectorStringToStruct::SplitStruct(const string_t &input, vector Date: Thu, 13 Feb 2025 12:52:56 +0100 Subject: [PATCH 059/142] trigger a cast to unnamed struct --- src/function/cast/string_cast.cpp | 3 -- src/function/cast/vector_cast_helpers.cpp | 4 +- test/sql/cast/string_to_unnamed_struct.test | 54 +++++++++++++-------- 3 files changed, 36 insertions(+), 25 deletions(-) diff --git a/src/function/cast/string_cast.cpp b/src/function/cast/string_cast.cpp index f3a19c4273bf..b39f6623e7da 100644 --- a/src/function/cast/string_cast.cpp +++ b/src/function/cast/string_cast.cpp @@ -233,9 +233,6 @@ bool VectorStringToStruct::StringToNestedTypeCastLoop(const string_t *source_dat result_mask.SetInvalid(i); continue; } - if (is_unnamed) { - throw ConversionException("Casting strings to unnamed structs is unsupported"); - } if (!VectorStringToStruct::SplitStruct(source_data[idx], child_vectors, i, child_names, child_masks)) { string text = "Type VARCHAR with value '" + source_data[idx].GetString() + "' can't be cast to the destination type STRUCT"; diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index 64d9780312f5..e5dac2b37201 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -580,6 +580,7 @@ bool VectorStringToStruct::SplitStruct(const string_t &input, vector Date: Thu, 13 Feb 2025 13:07:56 +0100 Subject: 
[PATCH 060/142] no need for a 'lvl' list nesting tracker --- src/function/cast/vector_cast_helpers.cpp | 52 ++++++++--------------- 1 file changed, 18 insertions(+), 34 deletions(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index e5dac2b37201..53364abbfc1a 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -70,7 +70,7 @@ static bool SkipToCloseQuotes(StringCastInputState &input_state) { return false; } -static bool SkipToClose(StringCastInputState &input_state, idx_t &lvl) { +static bool SkipToClose(StringCastInputState &input_state) { auto &idx = input_state.pos; auto &buf = input_state.buf; auto &len = input_state.len; @@ -91,14 +91,7 @@ static bool SkipToClose(StringCastInputState &input_state, idx_t &lvl) { brackets.push_back(')'); } else if (buf[idx] == '[') { brackets.push_back(']'); - lvl++; } else if (buf[idx] == brackets.back()) { - if (buf[idx] == ']') { - if (lvl == 0) { - return false; - } - lvl--; - } brackets.pop_back(); if (brackets.empty()) { return true; @@ -217,7 +210,6 @@ template static bool SplitStringListInternal(const string_t &input, OP &state) { const char *buf = input.GetData(); idx_t len = input.GetSize(); - idx_t lvl = 1; idx_t pos = 0; StringCastInputState input_state(buf, pos, len); @@ -250,7 +242,7 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { start_pos = pos; } //! Start of a LIST - if (!SkipToClose(input_state, lvl)) { + if (!SkipToClose(input_state)) { return false; } end_pos = pos; @@ -267,8 +259,7 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { start_pos = pos; } //! Start of a STRUCT - idx_t struct_lvl = 0; - if (!SkipToClose(input_state, struct_lvl)) { + if (!SkipToClose(input_state)) { return false; } end_pos = pos; @@ -277,8 +268,7 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { start_pos = pos; } //! 
Start of an (unnamed) STRUCT - idx_t struct_lvl = 0; - if (!SkipToClose(input_state, struct_lvl)) { + if (!SkipToClose(input_state)) { return false; } end_pos = pos; @@ -293,10 +283,6 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { seen_value = true; } if (buf[pos] == ']') { - if (lvl == 0) { - return false; - } - lvl--; break; } start_pos = optional_idx(); @@ -318,7 +304,7 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { } pos++; SkipWhitespace(input_state); - return (pos == len && lvl == 0); + return (pos == len); } bool VectorStringToList::SplitStringList(const string_t &input, string_t *child_data, idx_t &child_start, @@ -377,7 +363,6 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { idx_t len = input.GetSize(); idx_t pos = 0; StringCastInputState input_state(buf, pos, len); - idx_t lvl = 0; SkipWhitespace(input_state); if (pos == len || buf[pos] != '{') { @@ -416,7 +401,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { if (!start_pos.IsValid()) { start_pos = pos; } - if (!SkipToClose(input_state, lvl)) { + if (!SkipToClose(input_state)) { return false; } end_pos = pos; @@ -424,7 +409,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { if (!start_pos.IsValid()) { start_pos = pos; } - if (!SkipToClose(input_state, lvl)) { + if (!SkipToClose(input_state)) { return false; } end_pos = pos; @@ -432,7 +417,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { if (!start_pos.IsValid()) { start_pos = pos; } - if (!SkipToClose(input_state, lvl)) { + if (!SkipToClose(input_state)) { return false; } end_pos = pos; @@ -486,7 +471,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { if (!start_pos.IsValid()) { start_pos = pos; } - if (!SkipToClose(input_state, lvl)) { + if (!SkipToClose(input_state)) { return false; } end_pos = pos; @@ -494,7 +479,7 @@ static bool SplitStringMapInternal(const string_t 
&input, OP &state) { if (!start_pos.IsValid()) { start_pos = pos; } - if (!SkipToClose(input_state, lvl)) { + if (!SkipToClose(input_state)) { return false; } end_pos = pos; @@ -502,7 +487,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { if (!start_pos.IsValid()) { start_pos = pos; } - if (!SkipToClose(input_state, lvl)) { + if (!SkipToClose(input_state)) { return false; } end_pos = pos; @@ -538,7 +523,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { } pos++; SkipWhitespace(input_state); - return (pos == len && lvl == 0); + return (pos == len); } bool VectorStringToMap::SplitStringMap(const string_t &input, string_t *child_key_data, string_t *child_val_data, @@ -561,7 +546,6 @@ bool VectorStringToStruct::SplitStruct(const string_t &input, vector Date: Thu, 13 Feb 2025 13:14:09 +0100 Subject: [PATCH 061/142] remove temp_state, IsNull just needs a buf, start and end --- src/function/cast/vector_cast_helpers.cpp | 24 ++++++++--------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index 53364abbfc1a..1e62089fbf79 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -21,10 +21,8 @@ struct StringCastInputState { namespace duckdb { // ------- Helper functions for splitting string nested types ------- -static bool IsNull(StringCastInputState &input_state) { - auto &buf = input_state.buf; - auto &pos = input_state.pos; - if (input_state.pos + 4 != input_state.len) { +static bool IsNull(const char *buf, idx_t pos, idx_t end_pos) { + if (pos + 4 != end_pos) { return false; } return StringUtil::CIEquals(string(buf + pos, buf + pos + 4), "null"); @@ -190,8 +188,7 @@ struct SplitStringListOperation { public: void HandleValue(const char *buf, idx_t start, idx_t end) { - StringCastInputState temp_state(buf, start, end); - if (IsNull(temp_state)) { + if (IsNull(buf, 
start, end)) { FlatVector::SetNull(child, entry_count, true); entry_count++; return; @@ -334,8 +331,7 @@ struct SplitStringMapOperation { Vector &varchar_val; bool HandleKey(const char *buf, idx_t start_pos, idx_t pos) { - StringCastInputState temp_state(buf, start_pos, pos); - if (IsNull(temp_state)) { + if (IsNull(buf, start_pos, pos)) { FlatVector::SetNull(varchar_val, child_start, true); FlatVector::SetNull(varchar_key, child_start, true); child_start++; @@ -346,8 +342,7 @@ struct SplitStringMapOperation { } void HandleValue(const char *buf, idx_t start_pos, idx_t pos) { - StringCastInputState temp_state(buf, start_pos, pos); - if (IsNull(temp_state)) { + if (IsNull(buf, start_pos, pos)) { FlatVector::SetNull(varchar_val, child_start, true); child_start++; return; @@ -609,8 +604,7 @@ bool VectorStringToStruct::SplitStruct(const string_t &input, vector Date: Thu, 13 Feb 2025 13:18:24 +0100 Subject: [PATCH 062/142] use random seeds for bernoulli sample when parallel is enabled --- .../helper/physical_streaming_sample.cpp | 23 +++++--- src/execution/physical_plan/plan_sample.cpp | 3 +- .../helper/physical_streaming_sample.hpp | 9 +-- .../prepared_statement_in_pushdown.test | 0 test/sql/sample/bernoulli_sampling.test | 57 +++++++++++++++++++ 5 files changed, 77 insertions(+), 15 deletions(-) create mode 100644 test/optimizer/filter_pushdown/prepared_statement_in_pushdown.test create mode 100644 test/sql/sample/bernoulli_sampling.test diff --git a/src/execution/operator/helper/physical_streaming_sample.cpp b/src/execution/operator/helper/physical_streaming_sample.cpp index 309256244927..1062deb27ed7 100644 --- a/src/execution/operator/helper/physical_streaming_sample.cpp +++ b/src/execution/operator/helper/physical_streaming_sample.cpp @@ -5,10 +5,11 @@ namespace duckdb { -PhysicalStreamingSample::PhysicalStreamingSample(vector types, SampleMethod method, double percentage, - int64_t seed, idx_t estimated_cardinality) - : 
PhysicalOperator(PhysicalOperatorType::STREAMING_SAMPLE, std::move(types), estimated_cardinality), method(method), - percentage(percentage / 100), seed(seed) { +PhysicalStreamingSample::PhysicalStreamingSample(vector types, unique_ptr options, + idx_t estimated_cardinality) + : PhysicalOperator(PhysicalOperatorType::STREAMING_SAMPLE, std::move(types), estimated_cardinality), + sample_options(std::move(options)) { + percentage = sample_options->sample_size.GetValue() / 100; } //===--------------------------------------------------------------------===// @@ -49,13 +50,21 @@ void PhysicalStreamingSample::BernoulliSample(DataChunk &input, DataChunk &resul } } +bool PhysicalStreamingSample::ParallelOperator() const { + return !sample_options->repeatable; +} + unique_ptr PhysicalStreamingSample::GetOperatorState(ExecutionContext &context) const { - return make_uniq(seed); + if (!ParallelOperator()) { + return make_uniq(static_cast(sample_options->seed.GetIndex())); + } + RandomEngine random; + return make_uniq(static_cast(random.NextRandomInteger64())); } OperatorResultType PhysicalStreamingSample::Execute(ExecutionContext &context, DataChunk &input, DataChunk &chunk, GlobalOperatorState &gstate, OperatorState &state) const { - switch (method) { + switch (sample_options->method) { case SampleMethod::BERNOULLI_SAMPLE: BernoulliSample(input, chunk, state); break; @@ -70,7 +79,7 @@ OperatorResultType PhysicalStreamingSample::Execute(ExecutionContext &context, D InsertionOrderPreservingMap PhysicalStreamingSample::ParamsToString() const { InsertionOrderPreservingMap result; - result["Sample Method"] = EnumUtil::ToString(method) + ": " + to_string(100 * percentage) + "%"; + result["Sample Method"] = EnumUtil::ToString(sample_options->method) + ": " + to_string(100 * percentage) + "%"; return result; } diff --git a/src/execution/physical_plan/plan_sample.cpp b/src/execution/physical_plan/plan_sample.cpp index be55784779fb..2ccfacb8ac8c 100644 --- 
a/src/execution/physical_plan/plan_sample.cpp +++ b/src/execution/physical_plan/plan_sample.cpp @@ -29,8 +29,7 @@ unique_ptr PhysicalPlanGenerator::CreatePlan(LogicalSample &op EnumUtil::ToString(op.sample_options->method)); } sample = make_uniq( - op.types, op.sample_options->method, op.sample_options->sample_size.GetValue(), - static_cast(op.sample_options->seed.GetIndex()), op.estimated_cardinality); + op.types, std::move(op.sample_options), op.estimated_cardinality); break; default: throw InternalException("Unimplemented sample method"); diff --git a/src/include/duckdb/execution/operator/helper/physical_streaming_sample.hpp b/src/include/duckdb/execution/operator/helper/physical_streaming_sample.hpp index dafaf849f556..68df848fec9f 100644 --- a/src/include/duckdb/execution/operator/helper/physical_streaming_sample.hpp +++ b/src/include/duckdb/execution/operator/helper/physical_streaming_sample.hpp @@ -19,12 +19,11 @@ class PhysicalStreamingSample : public PhysicalOperator { static constexpr const PhysicalOperatorType TYPE = PhysicalOperatorType::STREAMING_SAMPLE; public: - PhysicalStreamingSample(vector types, SampleMethod method, double percentage, int64_t seed, + PhysicalStreamingSample(vector types, unique_ptr options, idx_t estimated_cardinality); - SampleMethod method; + unique_ptr sample_options; double percentage; - int64_t seed; public: // Operator interface @@ -32,9 +31,7 @@ class PhysicalStreamingSample : public PhysicalOperator { OperatorResultType Execute(ExecutionContext &context, DataChunk &input, DataChunk &chunk, GlobalOperatorState &gstate, OperatorState &state) const override; - bool ParallelOperator() const override { - return true; - } + bool ParallelOperator() const override; InsertionOrderPreservingMap ParamsToString() const override; diff --git a/test/optimizer/filter_pushdown/prepared_statement_in_pushdown.test b/test/optimizer/filter_pushdown/prepared_statement_in_pushdown.test new file mode 100644 index 000000000000..e69de29bb2d1 diff 
--git a/test/sql/sample/bernoulli_sampling.test b/test/sql/sample/bernoulli_sampling.test new file mode 100644 index 000000000000..a00ff7311325 --- /dev/null +++ b/test/sql/sample/bernoulli_sampling.test @@ -0,0 +1,57 @@ +# name: test/sql/sample/bernoulli_sampling.test +# description: Test reservoir sample crash on large data sets +# group: [sample] + + +statement ok +create table output (num_rows INT); + +statement ok +select setseed(0.3); + +loop i 0 500 + +statement ok +WITH some_tab AS ( + SELECT UNNEST(range(1000)) AS id +), +some_tab_unq AS ( + SELECT distinct(id) AS id FROM some_tab +), +sampled AS ( + select id from some_tab_unq + USING SAMPLE 1% (bernoulli) +) +INSERT INTO output select count(*) as n_rows FROM sampled; + +endloop + + +query III +select min(num_rows) > 0, max(num_rows) < 25, count(*) FILTER (num_rows = 0) = 0 from output; +---- +true true true + +query III +select avg(rowid), min(rowid), max(rowid) from output where num_rows = 0; +---- +NULL NULL NULL + + + +statement ok +create table t1 as select range id from range(1000); + +statement ok +select setseed(0.6); + +query I nosort result_1 +select id from t1 USING SAMPLE 1% (bernoulli, 5); +---- + +query I nosort result_1 +select id from t1 USING SAMPLE 1% (bernoulli, 5); +---- + + + From 981b4c2f65671b0ebac8449af6c97fda925dc1d2 Mon Sep 17 00:00:00 2001 From: Tishj Date: Thu, 13 Feb 2025 13:29:51 +0100 Subject: [PATCH 063/142] deduplicate some logic --- src/function/cast/vector_cast_helpers.cpp | 171 ++++++++-------------- 1 file changed, 65 insertions(+), 106 deletions(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index 1e62089fbf79..d016ebeaa0d5 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -224,9 +224,6 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { idx_t end_pos; bool seen_value = false; while (pos < len) { - if (pos == len) { - return false; - } 
bool set_escaped = false; if (input_state.escaped) { @@ -352,6 +349,67 @@ struct SplitStringMapOperation { } }; +static inline bool MapKeyOrValueStateTransition(StringCastInputState &input_state, optional_idx &start_pos, + idx_t &end_pos) { + auto &buf = input_state.buf; + auto &pos = input_state.pos; + + bool set_escaped = false; + if (input_state.escaped) { + if (!start_pos.IsValid()) { + start_pos = pos; + } + end_pos = pos; + } else if (buf[pos] == '"' || buf[pos] == '\'') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!SkipToCloseQuotes(input_state)) { + return false; + } + end_pos = pos; + } else if (buf[pos] == '{') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!SkipToClose(input_state)) { + return false; + } + end_pos = pos; + } else if (buf[pos] == '(') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!SkipToClose(input_state)) { + return false; + } + end_pos = pos; + } else if (buf[pos] == '[') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!SkipToClose(input_state)) { + return false; + } + end_pos = pos; + } else if (buf[pos] == '\\') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + set_escaped = true; + end_pos = pos; + } else if (!StringUtil::CharacterIsSpace(buf[pos])) { + if (!start_pos.IsValid()) { + start_pos = pos; + } + end_pos = pos; + } + input_state.escaped = set_escaped; + pos++; + + return true; +} + template static bool SplitStringMapInternal(const string_t &input, OP &state) { const char *buf = input.GetData(); @@ -378,58 +436,9 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { optional_idx start_pos; idx_t end_pos; while (pos < len && (buf[pos] != '=' || input_state.escaped)) { - bool set_escaped = false; - if (input_state.escaped) { - if (!start_pos.IsValid()) { - start_pos = pos; - } - end_pos = pos; - } else if (buf[pos] == '"' || buf[pos] == '\'') { - if (!start_pos.IsValid()) { - start_pos = pos; - } - if (!SkipToCloseQuotes(input_state)) { - 
return false; - } - end_pos = pos; - } else if (buf[pos] == '{') { - if (!start_pos.IsValid()) { - start_pos = pos; - } - if (!SkipToClose(input_state)) { - return false; - } - end_pos = pos; - } else if (buf[pos] == '(') { - if (!start_pos.IsValid()) { - start_pos = pos; - } - if (!SkipToClose(input_state)) { - return false; - } - end_pos = pos; - } else if (buf[pos] == '[') { - if (!start_pos.IsValid()) { - start_pos = pos; - } - if (!SkipToClose(input_state)) { - return false; - } - end_pos = pos; - } else if (buf[pos] == '\\') { - if (!start_pos.IsValid()) { - start_pos = pos; - } - set_escaped = true; - end_pos = pos; - } else if (!StringUtil::CharacterIsSpace(buf[pos])) { - if (!start_pos.IsValid()) { - start_pos = pos; - } - end_pos = pos; + if (!MapKeyOrValueStateTransition(input_state, start_pos, end_pos)) { + return false; } - input_state.escaped = set_escaped; - pos++; } if (pos == len) { return false; @@ -447,59 +456,9 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { pos++; SkipWhitespace(input_state); while (pos < len && ((buf[pos] != ',' && buf[pos] != '}') || input_state.escaped)) { - bool set_escaped = false; - - if (input_state.escaped) { - if (!start_pos.IsValid()) { - start_pos = pos; - } - end_pos = pos; - } else if (buf[pos] == '"' || buf[pos] == '\'') { - if (!start_pos.IsValid()) { - start_pos = pos; - } - if (!SkipToCloseQuotes(input_state)) { - return false; - } - end_pos = pos; - } else if (buf[pos] == '{') { - if (!start_pos.IsValid()) { - start_pos = pos; - } - if (!SkipToClose(input_state)) { - return false; - } - end_pos = pos; - } else if (buf[pos] == '(') { - if (!start_pos.IsValid()) { - start_pos = pos; - } - if (!SkipToClose(input_state)) { - return false; - } - end_pos = pos; - } else if (buf[pos] == '[') { - if (!start_pos.IsValid()) { - start_pos = pos; - } - if (!SkipToClose(input_state)) { - return false; - } - end_pos = pos; - } else if (buf[pos] == '\\') { - if (!start_pos.IsValid()) { - start_pos = 
pos; - } - set_escaped = true; - end_pos = pos; - } else if (!StringUtil::CharacterIsSpace(buf[pos])) { - if (!start_pos.IsValid()) { - start_pos = pos; - } - end_pos = pos; + if (!MapKeyOrValueStateTransition(input_state, start_pos, end_pos)) { + return false; } - input_state.escaped = set_escaped; - pos++; } if (pos == len) { return false; From 90ff46c233f1ab8f5ca2a5033c87f32ade14c2d2 Mon Sep 17 00:00:00 2001 From: Tishj Date: Thu, 13 Feb 2025 13:38:46 +0100 Subject: [PATCH 064/142] use the same function in the list value --- src/function/cast/vector_cast_helpers.cpp | 208 +++++++++------------- 1 file changed, 80 insertions(+), 128 deletions(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index d016ebeaa0d5..750db4b0e2c7 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -203,6 +203,66 @@ struct SplitStringListOperation { Vector &child; }; +static inline bool ValueStateTransition(StringCastInputState &input_state, optional_idx &start_pos, idx_t &end_pos) { + auto &buf = input_state.buf; + auto &pos = input_state.pos; + + bool set_escaped = false; + if (input_state.escaped) { + if (!start_pos.IsValid()) { + start_pos = pos; + } + end_pos = pos; + } else if (buf[pos] == '"' || buf[pos] == '\'') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!SkipToCloseQuotes(input_state)) { + return false; + } + end_pos = pos; + } else if (buf[pos] == '{') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!SkipToClose(input_state)) { + return false; + } + end_pos = pos; + } else if (buf[pos] == '(') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!SkipToClose(input_state)) { + return false; + } + end_pos = pos; + } else if (buf[pos] == '[') { + if (!start_pos.IsValid()) { + start_pos = pos; + } + if (!SkipToClose(input_state)) { + return false; + } + end_pos = pos; + } else if (buf[pos] == '\\') { + if (!start_pos.IsValid()) { + 
start_pos = pos; + } + set_escaped = true; + end_pos = pos; + } else if (!StringUtil::CharacterIsSpace(buf[pos])) { + if (!start_pos.IsValid()) { + start_pos = pos; + } + end_pos = pos; + } + input_state.escaped = set_escaped; + pos++; + + return true; +} + template static bool SplitStringListInternal(const string_t &input, OP &state) { const char *buf = input.GetData(); @@ -220,79 +280,32 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { //! Skip the '[' pos++; SkipWhitespace(input_state); - optional_idx start_pos; - idx_t end_pos; bool seen_value = false; while (pos < len) { - bool set_escaped = false; + optional_idx start_pos; + idx_t end_pos; - if (input_state.escaped) { - if (!start_pos.IsValid()) { - start_pos = pos; - } - end_pos = pos; - } else if (buf[pos] == '[') { - if (!start_pos.IsValid()) { - start_pos = pos; - } - //! Start of a LIST - if (!SkipToClose(input_state)) { - return false; - } - end_pos = pos; - } else if ((buf[pos] == '"' || buf[pos] == '\'')) { - if (!start_pos.IsValid()) { - start_pos = pos; - } - if (!SkipToCloseQuotes(input_state)) { - return false; - } - end_pos = pos; - } else if (buf[pos] == '{') { - if (!start_pos.IsValid()) { - start_pos = pos; - } - //! Start of a STRUCT - if (!SkipToClose(input_state)) { - return false; - } - end_pos = pos; - } else if (buf[pos] == '(') { - if (!start_pos.IsValid()) { - start_pos = pos; - } - //! 
Start of an (unnamed) STRUCT - if (!SkipToClose(input_state)) { + while (pos < len && ((buf[pos] != ',' && buf[pos] != ']') || input_state.escaped)) { + if (!ValueStateTransition(input_state, start_pos, end_pos)) { return false; } - end_pos = pos; - } else if ((buf[pos] == ',' || buf[pos] == ']')) { - if (buf[pos] != ']' || start_pos.IsValid() || seen_value) { - if (!start_pos.IsValid()) { - state.HandleValue(buf, 0, 0); - } else { - auto start = start_pos.GetIndex(); - state.HandleValue(buf, start, end_pos + 1); - } - seen_value = true; - } - if (buf[pos] == ']') { - break; - } - start_pos = optional_idx(); - } else if (buf[pos] == '\\') { - if (!start_pos.IsValid()) { - start_pos = pos; - } - set_escaped = true; - end_pos = pos; - } else if (!StringUtil::CharacterIsSpace(buf[pos])) { + } + if (pos == len) { + return false; + } + if (buf[pos] != ']' || start_pos.IsValid() || seen_value) { if (!start_pos.IsValid()) { - start_pos = pos; + state.HandleValue(buf, 0, 0); + } else { + auto start = start_pos.GetIndex(); + state.HandleValue(buf, start, end_pos + 1); } - end_pos = pos; + seen_value = true; } - input_state.escaped = set_escaped; + if (buf[pos] == ']') { + break; + } + pos++; SkipWhitespace(input_state); } @@ -349,67 +362,6 @@ struct SplitStringMapOperation { } }; -static inline bool MapKeyOrValueStateTransition(StringCastInputState &input_state, optional_idx &start_pos, - idx_t &end_pos) { - auto &buf = input_state.buf; - auto &pos = input_state.pos; - - bool set_escaped = false; - if (input_state.escaped) { - if (!start_pos.IsValid()) { - start_pos = pos; - } - end_pos = pos; - } else if (buf[pos] == '"' || buf[pos] == '\'') { - if (!start_pos.IsValid()) { - start_pos = pos; - } - if (!SkipToCloseQuotes(input_state)) { - return false; - } - end_pos = pos; - } else if (buf[pos] == '{') { - if (!start_pos.IsValid()) { - start_pos = pos; - } - if (!SkipToClose(input_state)) { - return false; - } - end_pos = pos; - } else if (buf[pos] == '(') { - if 
(!start_pos.IsValid()) { - start_pos = pos; - } - if (!SkipToClose(input_state)) { - return false; - } - end_pos = pos; - } else if (buf[pos] == '[') { - if (!start_pos.IsValid()) { - start_pos = pos; - } - if (!SkipToClose(input_state)) { - return false; - } - end_pos = pos; - } else if (buf[pos] == '\\') { - if (!start_pos.IsValid()) { - start_pos = pos; - } - set_escaped = true; - end_pos = pos; - } else if (!StringUtil::CharacterIsSpace(buf[pos])) { - if (!start_pos.IsValid()) { - start_pos = pos; - } - end_pos = pos; - } - input_state.escaped = set_escaped; - pos++; - - return true; -} - template static bool SplitStringMapInternal(const string_t &input, OP &state) { const char *buf = input.GetData(); @@ -436,7 +388,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { optional_idx start_pos; idx_t end_pos; while (pos < len && (buf[pos] != '=' || input_state.escaped)) { - if (!MapKeyOrValueStateTransition(input_state, start_pos, end_pos)) { + if (!ValueStateTransition(input_state, start_pos, end_pos)) { return false; } } @@ -456,7 +408,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { pos++; SkipWhitespace(input_state); while (pos < len && ((buf[pos] != ',' && buf[pos] != '}') || input_state.escaped)) { - if (!MapKeyOrValueStateTransition(input_state, start_pos, end_pos)) { + if (!ValueStateTransition(input_state, start_pos, end_pos)) { return false; } } From 201ba3fc4acf679f6cb7573f9283b0280b2353ce Mon Sep 17 00:00:00 2001 From: Tishj Date: Thu, 13 Feb 2025 13:41:35 +0100 Subject: [PATCH 065/142] also use the same function in struct value --- src/function/cast/vector_cast_helpers.cpp | 54 +---------------------- 1 file changed, 2 insertions(+), 52 deletions(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index 750db4b0e2c7..b5aa34682d0c 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -530,59 +530,9 @@ 
bool VectorStringToStruct::SplitStruct(const string_t &input, vector Date: Thu, 13 Feb 2025 13:43:09 +0100 Subject: [PATCH 066/142] also use the same function in unnamed struct cast --- src/function/cast/vector_cast_helpers.cpp | 54 +---------------------- 1 file changed, 2 insertions(+), 52 deletions(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index b5aa34682d0c..e7c87164c955 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -573,59 +573,9 @@ bool VectorStringToStruct::SplitStruct(const string_t &input, vector Date: Thu, 13 Feb 2025 14:01:46 +0100 Subject: [PATCH 067/142] improve cast error message for VARCHAR -> nested type --- src/function/cast/string_cast.cpp | 31 +++++++++---------- src/function/cast/vector_cast_helpers.cpp | 1 - test/sql/cast/string_to_list_cast.test | 2 +- test/sql/cast/string_to_struct_cast.test | 14 ++++++++- test/sql/cast/string_to_unnamed_struct.test | 2 +- test/sql/types/nested/array/array_cast.test | 6 +++- .../types/nested/array/array_try_cast.test | 4 +-- .../types/struct/unnamed_struct_casts.test | 5 +-- 8 files changed, 40 insertions(+), 25 deletions(-) diff --git a/src/function/cast/string_cast.cpp b/src/function/cast/string_cast.cpp index b39f6623e7da..07375bf7bd37 100644 --- a/src/function/cast/string_cast.cpp +++ b/src/function/cast/string_cast.cpp @@ -160,9 +160,9 @@ bool VectorStringToList::StringToNestedTypeCastLoop(const string_t *source_data, list_data[i].offset = total; if (!VectorStringToList::SplitStringList(source_data[idx], child_data, total, varchar_vector)) { - string text = "Type VARCHAR with value '" + source_data[idx].GetString() + - "' can't be cast to the destination type LIST"; - HandleVectorCastError::Operation(text, result_mask, i, vector_cast_data); + auto error = StringUtil::Format("Type VARCHAR with value '%s' can't be cast to the destination type %s", + source_data[idx].GetString(), 
result.GetType().ToString()); + HandleVectorCastError::Operation(error, result_mask, i, vector_cast_data); } list_data[i].length = total - list_data[i].offset; // length is the amount of parts coming from this string } @@ -234,12 +234,12 @@ bool VectorStringToStruct::StringToNestedTypeCastLoop(const string_t *source_dat continue; } if (!VectorStringToStruct::SplitStruct(source_data[idx], child_vectors, i, child_names, child_masks)) { - string text = "Type VARCHAR with value '" + source_data[idx].GetString() + - "' can't be cast to the destination type STRUCT"; + auto error = StringUtil::Format("Type VARCHAR with value '%s' can't be cast to the destination type %s", + source_data[idx].GetString(), result.GetType().ToString()); for (auto &child_mask : child_masks) { child_mask.get().SetInvalid(i); // some values may have already been found and set valid } - HandleVectorCastError::Operation(text, result_mask, i, vector_cast_data); + HandleVectorCastError::Operation(error, result_mask, i, vector_cast_data); } } @@ -316,10 +316,10 @@ bool VectorStringToMap::StringToNestedTypeCastLoop(const string_t *source_data, list_data[i].offset = total; if (!VectorStringToMap::SplitStringMap(source_data[idx], child_key_data, child_val_data, total, varchar_key_vector, varchar_val_vector)) { - string text = "Type VARCHAR with value '" + source_data[idx].GetString() + - "' can't be cast to the destination type MAP"; + auto error = StringUtil::Format("Type VARCHAR with value '%s' can't be cast to the destination type %s", + source_data[idx].GetString(), result.GetType().ToString()); FlatVector::SetNull(result, i, true); - HandleVectorCastError::Operation(text, result_mask, i, vector_cast_data); + HandleVectorCastError::Operation(error, result_mask, i, vector_cast_data); } list_data[i].length = total - list_data[i].offset; } @@ -379,10 +379,9 @@ bool VectorStringToArray::StringToNestedTypeCastLoop(const string_t *source_data if (array_size != str_array_size) { if (all_lengths_match) { 
all_lengths_match = false; - auto msg = - StringUtil::Format("Type VARCHAR with value '%s' can't be cast to the destination type ARRAY[%u]" - ", the size of the array must match the destination type", - source_data[idx].GetString(), array_size); + auto msg = StringUtil::Format("Type VARCHAR with value '%s' can't be cast to the destination type %s" + ", the size of the array must match the destination type", + source_data[idx].GetString(), result.GetType().ToString()); if (parameters.strict) { throw ConversionException(msg); } @@ -418,9 +417,9 @@ bool VectorStringToArray::StringToNestedTypeCastLoop(const string_t *source_data } if (!VectorStringToList::SplitStringList(source_data[idx], child_data, total, varchar_vector)) { - auto text = StringUtil::Format("Type VARCHAR with value '%s' can't be cast to the destination type ARRAY", - source_data[idx].GetString()); - HandleVectorCastError::Operation(text, result_mask, i, vector_cast_data); + auto error = StringUtil::Format("Type VARCHAR with value '%s' can't be cast to the destination type %s", + source_data[idx].GetString(), result.GetType().ToString()); + HandleVectorCastError::Operation(error, result_mask, i, vector_cast_data); } } D_ASSERT(total == child_count); diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index e7c87164c955..1d654da9b735 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -470,7 +470,6 @@ bool VectorStringToStruct::SplitStruct(const string_t &input, vector BLOB) # Should be able to cast from NULL query I @@ -138,11 +140,13 @@ NULL statement error SELECT (['2', 'abc', '3']::VARCHAR[3])::INT[] ---- +Could not convert string 'abc' to INT32 # Should not be able to cast to unrelated type statement error SELECT ([1,2,3]::INT[3])::INT; ---- +Unimplemented type for cast (INTEGER[3] -> INTEGER) # Should not be able to cast to list if child types fail query I @@ -166,4 +170,4 @@ SELECT '[1, 2, 
3]'::INTEGER[3] query I SELECT TRY_CAST(l AS INTEGER[][3]) FROM VALUES (['foo']) as v(l); ---- -NULL \ No newline at end of file +NULL diff --git a/test/sql/types/nested/array/array_try_cast.test b/test/sql/types/nested/array/array_try_cast.test index d3398b54fa3d..4e90f308d6ff 100644 --- a/test/sql/types/nested/array/array_try_cast.test +++ b/test/sql/types/nested/array/array_try_cast.test @@ -53,7 +53,7 @@ NULL statement error SELECT CAST('[1,2]' as INTEGER[3]); ---- -Conversion Error: Type VARCHAR with value '[1,2]' can't be cast to the destination type ARRAY[3], the size of the array must match the destination type +Conversion Error: Type VARCHAR with value '[1,2]' can't be cast to the destination type INTEGER[3], the size of the array must match the destination type query I SELECT CAST('[NULL, [1], [NULL]]' as INTEGER[1][3]); @@ -78,7 +78,7 @@ SELECT CAST('[NULL, [1,NULL,3], [1,2,3]]' as INTEGER[3][3]); statement error SELECT CAST('[NULL, [1,NULL,3], [1,2]]' as INTEGER[3][3]); ---- -Conversion Error: Type VARCHAR with value '[1,2]' can't be cast to the destination type ARRAY[3], the size of the array must match the destination type +Conversion Error: Type VARCHAR with value '[1,2]' can't be cast to the destination type INTEGER[3], the size of the array must match the destination type query I SELECT TRY_CAST('[NULL, [1,NULL,3], [1,2]]' as INTEGER[3][3]); diff --git a/test/sql/types/struct/unnamed_struct_casts.test b/test/sql/types/struct/unnamed_struct_casts.test index 9ab777d37ffb..5b8d3d6b4a25 100644 --- a/test/sql/types/struct/unnamed_struct_casts.test +++ b/test/sql/types/struct/unnamed_struct_casts.test @@ -10,7 +10,8 @@ select row(42, 'hello') union all select '{'': 42,'': hello}'; ---- Conversion Error -statement error +query I select row(42, 'hello') union all select '(84, world)'; ---- -unsupported +(42, hello) +(84, world) From 591f090ebcd624fc6251711134e0e06ecc41da2e Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Thu, 13 Feb 2025 14:04:54 +0100 
Subject: [PATCH 068/142] make string_dictionary_page_size_limit for Parquet writer configurable --- .../include/parquet_rle_bp_encoder.hpp | 17 ++++++++-- extension/parquet/include/parquet_writer.hpp | 8 +++-- .../writer/templated_column_writer.hpp | 3 +- extension/parquet/parquet_extension.cpp | 20 ++++++++++-- extension/parquet/parquet_writer.cpp | 5 +-- .../writer/primitive_column_writer.cpp | 2 +- src/common/types/hash.cpp | 2 +- .../duckdb/common/primitive_dictionary.hpp | 7 +++-- test/sql/copy/parquet/bloom_filters.test | 31 ++++++++++++++++++- 9 files changed, 78 insertions(+), 17 deletions(-) diff --git a/extension/parquet/include/parquet_rle_bp_encoder.hpp b/extension/parquet/include/parquet_rle_bp_encoder.hpp index b0fd130a33f4..f29c85edcf79 100644 --- a/extension/parquet/include/parquet_rle_bp_encoder.hpp +++ b/extension/parquet/include/parquet_rle_bp_encoder.hpp @@ -67,12 +67,23 @@ class RleBpEncoder { } void WriteMany(WriteStream &writer, uint32_t value, idx_t count) { - D_ASSERT(bp_block_count == 0); if (rle_count != 0) { - WriteRun(writer); + // If an RLE run is going on, write a single value to either finish it or convert to BP + WriteValue(writer, value); + count--; } + + if (bp_block_count != 0) { + // If a BP run is going on, finish it + while (bp_block_count != 0 && count > 0) { + WriteValue(writer, value); + count--; + } + } + + // Set remaining as current RLE run rle_value = value; - rle_count = count; + rle_count += count; } void FinishWrite(WriteStream &writer) { diff --git a/extension/parquet/include/parquet_writer.hpp b/extension/parquet/include/parquet_writer.hpp index 1ad586489067..8af50765e50f 100644 --- a/extension/parquet/include/parquet_writer.hpp +++ b/extension/parquet/include/parquet_writer.hpp @@ -79,8 +79,8 @@ class ParquetWriter { vector names, duckdb_parquet::CompressionCodec::type codec, ChildFieldIDs field_ids, const vector> &kv_metadata, shared_ptr encryption_config, idx_t dictionary_size_limit, - double 
bloom_filter_false_positive_ratio, int64_t compression_level, bool debug_use_openssl, - ParquetVersion parquet_version); + idx_t string_dictionary_page_size_limit, double bloom_filter_false_positive_ratio, + int64_t compression_level, bool debug_use_openssl, ParquetVersion parquet_version); public: void PrepareRowGroup(ColumnDataCollection &buffer, PreparedRowGroup &result); @@ -116,6 +116,9 @@ class ParquetWriter { idx_t DictionarySizeLimit() const { return dictionary_size_limit; } + idx_t StringDictionaryPageSizeLimit() const { + return string_dictionary_page_size_limit; + } double BloomFilterFalsePositiveRatio() const { return bloom_filter_false_positive_ratio; } @@ -149,6 +152,7 @@ class ParquetWriter { ChildFieldIDs field_ids; shared_ptr encryption_config; idx_t dictionary_size_limit; + idx_t string_dictionary_page_size_limit; double bloom_filter_false_positive_ratio; int64_t compression_level; bool debug_use_openssl; diff --git a/extension/parquet/include/writer/templated_column_writer.hpp b/extension/parquet/include/writer/templated_column_writer.hpp index ca3b3ca8a804..544ced9c04df 100644 --- a/extension/parquet/include/writer/templated_column_writer.hpp +++ b/extension/parquet/include/writer/templated_column_writer.hpp @@ -20,7 +20,6 @@ namespace duckdb { template static void TemplatedWritePlain(Vector &col, ColumnWriterStatistics *stats, const idx_t chunk_start, const idx_t chunk_end, const ValidityMask &mask, WriteStream &ser) { - const auto *ptr = FlatVector::GetData(col); for (idx_t r = chunk_start; r < chunk_end; r++) { if (!mask.RowIsValid(r)) { @@ -38,7 +37,7 @@ class StandardColumnWriterState : public PrimitiveColumnWriterState { StandardColumnWriterState(ParquetWriter &writer, duckdb_parquet::RowGroup &row_group, idx_t col_idx) : PrimitiveColumnWriterState(writer, row_group, col_idx), dictionary(BufferAllocator::Get(writer.GetContext()), writer.DictionarySizeLimit(), - 2097152), // TODO: make size configurable + 
writer.StringDictionaryPageSizeLimit()), encoding(duckdb_parquet::Encoding::PLAIN) { } ~StandardColumnWriterState() override = default; diff --git a/extension/parquet/parquet_extension.cpp b/extension/parquet/parquet_extension.cpp index 9545b3cb96c6..9142c0db681e 100644 --- a/extension/parquet/parquet_extension.cpp +++ b/extension/parquet/parquet_extension.cpp @@ -49,6 +49,7 @@ #include "duckdb/planner/operator/logical_get.hpp" #include "duckdb/storage/statistics/base_statistics.hpp" #include "duckdb/storage/table/row_group.hpp" +#include "duckdb/common/primitive_dictionary.hpp" #endif namespace duckdb { @@ -199,6 +200,8 @@ struct ParquetWriteBindData : public TableFunctionData { dictionary_size_limit = row_group_size / 20; } + idx_t string_dictionary_page_size_limit = 2097152; + //! What false positive rate are we willing to accept for bloom filters double bloom_filter_false_positive_ratio = 0.01; @@ -1279,6 +1282,13 @@ unique_ptr ParquetWriteBind(ClientContext &context, CopyFunctionBi } bind_data->dictionary_size_limit = val; dictionary_size_limit_set = true; + } else if (loption == "string_dictionary_page_size_limit") { + auto val = option.second[0].GetValue(); + if (val > PrimitiveDictionary::MAXIMUM_POSSIBLE_SIZE) { + throw BinderException("string_dictionary_page_size_limit must be less than or equal to %llu", + PrimitiveDictionary::MAXIMUM_POSSIBLE_SIZE); + } + bind_data->string_dictionary_page_size_limit = val; } else if (loption == "bloom_filter_false_positive_ratio") { auto val = option.second[0].GetValue(); if (val <= 0) { @@ -1341,8 +1351,9 @@ unique_ptr ParquetWriteInitializeGlobal(ClientContext &conte global_state->writer = make_uniq( context, fs, file_path, parquet_bind.sql_types, parquet_bind.column_names, parquet_bind.codec, parquet_bind.field_ids.Copy(), parquet_bind.kv_metadata, parquet_bind.encryption_config, - parquet_bind.dictionary_size_limit, parquet_bind.bloom_filter_false_positive_ratio, - parquet_bind.compression_level, 
parquet_bind.debug_use_openssl, parquet_bind.parquet_version); + parquet_bind.dictionary_size_limit, parquet_bind.string_dictionary_page_size_limit, + parquet_bind.bloom_filter_false_positive_ratio, parquet_bind.compression_level, parquet_bind.debug_use_openssl, + parquet_bind.parquet_version); return std::move(global_state); } @@ -1520,6 +1531,9 @@ static void ParquetCopySerialize(Serializer &serializer, const FunctionData &bin default_value.bloom_filter_false_positive_ratio); serializer.WritePropertyWithDefault(114, "parquet_version", bind_data.parquet_version, default_value.parquet_version); + serializer.WritePropertyWithDefault(115, "string_dictionary_page_size_limit", + bind_data.string_dictionary_page_size_limit, + default_value.string_dictionary_page_size_limit); } static unique_ptr ParquetCopyDeserialize(Deserializer &deserializer, CopyFunction &function) { @@ -1550,6 +1564,8 @@ static unique_ptr ParquetCopyDeserialize(Deserializer &deserialize 113, "bloom_filter_false_positive_ratio", default_value.bloom_filter_false_positive_ratio); data->parquet_version = deserializer.ReadPropertyWithExplicitDefault(114, "parquet_version", default_value.parquet_version); + data->string_dictionary_page_size_limit = deserializer.ReadPropertyWithExplicitDefault( + 115, "string_dictionary_page_size_limit", default_value.string_dictionary_page_size_limit); return std::move(data); } diff --git a/extension/parquet/parquet_writer.cpp b/extension/parquet/parquet_writer.cpp index 9977f67ecd27..b3af8efe3e81 100644 --- a/extension/parquet/parquet_writer.cpp +++ b/extension/parquet/parquet_writer.cpp @@ -320,11 +320,12 @@ ParquetWriter::ParquetWriter(ClientContext &context, FileSystem &fs, string file vector names_p, CompressionCodec::type codec, ChildFieldIDs field_ids_p, const vector> &kv_metadata, shared_ptr encryption_config_p, idx_t dictionary_size_limit_p, - double bloom_filter_false_positive_ratio_p, int64_t compression_level_p, - bool debug_use_openssl_p, ParquetVersion 
parquet_version) + idx_t string_dictionary_page_size_limit_p, double bloom_filter_false_positive_ratio_p, + int64_t compression_level_p, bool debug_use_openssl_p, ParquetVersion parquet_version) : context(context), file_name(std::move(file_name_p)), sql_types(std::move(types_p)), column_names(std::move(names_p)), codec(codec), field_ids(std::move(field_ids_p)), encryption_config(std::move(encryption_config_p)), dictionary_size_limit(dictionary_size_limit_p), + string_dictionary_page_size_limit(string_dictionary_page_size_limit_p), bloom_filter_false_positive_ratio(bloom_filter_false_positive_ratio_p), compression_level(compression_level_p), debug_use_openssl(debug_use_openssl_p), parquet_version(parquet_version) { diff --git a/extension/parquet/writer/primitive_column_writer.cpp b/extension/parquet/writer/primitive_column_writer.cpp index 589f94ab316f..357c2138e14d 100644 --- a/extension/parquet/writer/primitive_column_writer.cpp +++ b/extension/parquet/writer/primitive_column_writer.cpp @@ -130,7 +130,7 @@ void PrimitiveColumnWriter::WriteLevels(WriteStream &temp_writer, const unsafe_v MemoryStream intermediate_stream(Allocator::DefaultAllocator()); rle_encoder.BeginWrite(); - if (null_count.IsValid() && null_count.GetIndex() == 0 || null_count.GetIndex() == count) { + if (null_count.IsValid() && (null_count.GetIndex() == 0 || null_count.GetIndex() == count)) { // All are NULL or none are NULL rle_encoder.WriteMany(intermediate_stream, levels[0], count); } else { diff --git a/src/common/types/hash.cpp b/src/common/types/hash.cpp index f54295af4656..f9fe42ffcbd5 100644 --- a/src/common/types/hash.cpp +++ b/src/common/types/hash.cpp @@ -100,7 +100,7 @@ hash_t HashBytes(const_data_ptr_t ptr, const idx_t len) noexcept { // XOR with remaining (<8) bytes hash_t hr = 0; - FastMemcpy(&hr, ptr, len & 7U); + memcpy(&hr, ptr, len & 7U); h ^= hr; // Finalize diff --git a/src/include/duckdb/common/primitive_dictionary.hpp b/src/include/duckdb/common/primitive_dictionary.hpp 
index 0976882a2c8d..a4369776d37d 100644 --- a/src/include/duckdb/common/primitive_dictionary.hpp +++ b/src/include/duckdb/common/primitive_dictionary.hpp @@ -29,6 +29,8 @@ class PrimitiveDictionary { }; public: + static constexpr uint32_t MAXIMUM_POSSIBLE_SIZE = INVALID_INDEX - 1; + //! PrimitiveDictionary is a fixed-size linear probing hash table for primitive types //! It is used to dictionary-encode data in, e.g., Parquet files PrimitiveDictionary(Allocator &allocator, idx_t maximum_size_p, idx_t plain_capacity_p) @@ -47,13 +49,12 @@ class PrimitiveDictionary { public: //! Insert value into dictionary (if not full) void Insert(T value) { - if (full | (size == capacity)) { - full = true; + if (full) { return; } auto &entry = Lookup(value); if (entry.IsEmpty()) { - if (!AddToPlain(value)) { + if (size + 1 > maximum_size || !AddToPlain(value)) { full = true; return; } diff --git a/test/sql/copy/parquet/bloom_filters.test b/test/sql/copy/parquet/bloom_filters.test index 8ca061e9dbb7..05f166c23598 100644 --- a/test/sql/copy/parquet/bloom_filters.test +++ b/test/sql/copy/parquet/bloom_filters.test @@ -226,4 +226,33 @@ statement error copy (select (r1.range*10)::BIGINT r, from range(100) r1, range(100) order by r) to '__TEST_DIR__/bloom8.parquet' (format parquet, ROW_GROUP_SIZE 10000, dictionary_size_limit 1000, bloom_filter_false_positive_ratio 0); ---- -bloom_filter_false_positive_ratio must be greater than 0 \ No newline at end of file +bloom_filter_false_positive_ratio must be greater than 0 + +# some tests for string_dictionary_page_size_limit + +# no bloom filter, limit too low +statement ok +copy (select (r1.range*10)::VARCHAR r, +from range(100) r1, range(100) order by r) to '__TEST_DIR__/bloom9.parquet' (format parquet, ROW_GROUP_SIZE 10000, string_dictionary_page_size_limit 10); + +query III +select row_group_id, bloom_filter_offset IS NOT NULL, bloom_filter_length IS NOT NULL from parquet_metadata('__TEST_DIR__/bloom9.parquet') order by row_group_id; +---- 
+0 false false + +# big enough +statement ok +copy (select (r1.range*10)::VARCHAR r, +from range(100) r1, range(100) order by r) to '__TEST_DIR__/bloom9.parquet' (format parquet, ROW_GROUP_SIZE 10000, string_dictionary_page_size_limit 100000); + +query III +select row_group_id, bloom_filter_offset IS NOT NULL, bloom_filter_length IS NOT NULL from parquet_metadata('__TEST_DIR__/bloom9.parquet') order by row_group_id; +---- +0 true true + +# too big +statement error +copy (select (r1.range*10)::VARCHAR r, +from range(100) r1, range(100) order by r) to '__TEST_DIR__/bloom9.parquet' (format parquet, ROW_GROUP_SIZE 10000, string_dictionary_page_size_limit 4294967295); +---- +Binder Error From 5d79f5df60c49e40f62569ebc069923d4504d375 Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Thu, 13 Feb 2025 15:01:40 +0100 Subject: [PATCH 069/142] some more fast paths/optimizations for parquet writer --- .../writer/templated_column_writer.hpp | 21 +++++++--- .../writer/primitive_column_writer.cpp | 42 ++++++++++++------- 2 files changed, 41 insertions(+), 22 deletions(-) diff --git a/extension/parquet/include/writer/templated_column_writer.hpp b/extension/parquet/include/writer/templated_column_writer.hpp index 544ced9c04df..81aeb659652c 100644 --- a/extension/parquet/include/writer/templated_column_writer.hpp +++ b/extension/parquet/include/writer/templated_column_writer.hpp @@ -229,15 +229,24 @@ class StandardColumnWriter : public PrimitiveColumnWriter { switch (page_state.encoding) { case duckdb_parquet::Encoding::RLE_DICTIONARY: { - for (idx_t r = chunk_start; r < chunk_end; r++) { - if (!mask.RowIsValid(r)) { - continue; - } - if (!page_state.dict_written_value) { - // first value: write the bit-width as a one-byte entry and initialize writer + idx_t r = chunk_start; + if (!page_state.dict_written_value) { + // find first non-null value + for (; r < chunk_end; r++) { + if (!mask.RowIsValid(r)) { + continue; + } + // write the bit-width as a one-byte entry and initialize 
writer temp_writer.Write(page_state.dict_bit_width); page_state.dict_encoder.BeginWrite(); page_state.dict_written_value = true; + break; + } + } + + for (; r < chunk_end; r++) { + if (!mask.RowIsValid(r)) { + continue; } const auto &src_value = data_ptr[r]; const auto value_index = page_state.dictionary.GetIndex(src_value); diff --git a/extension/parquet/writer/primitive_column_writer.cpp b/extension/parquet/writer/primitive_column_writer.cpp index 357c2138e14d..0bd85d0894a9 100644 --- a/extension/parquet/writer/primitive_column_writer.cpp +++ b/extension/parquet/writer/primitive_column_writer.cpp @@ -40,7 +40,6 @@ void PrimitiveColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterStat auto &state = state_p.Cast(); auto &col_chunk = state.row_group.columns[state.col_idx]; - idx_t start = 0; idx_t vcount = parent ? parent->definition_levels.size() - state.definition_levels.size() : count; idx_t parent_index = state.definition_levels.size(); auto &validity = FlatVector::Validity(vector); @@ -49,24 +48,35 @@ void PrimitiveColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterStat idx_t vector_index = 0; reference page_info_ref = state.page_info.back(); - for (idx_t i = start; i < vcount; i++) { + col_chunk.meta_data.num_values += vcount; + + const bool check_parent_empty = parent && !parent->is_empty.empty(); + if (!check_parent_empty && validity.AllValid() && TypeIsConstantSize(vector.GetType().InternalType()) && + page_info_ref.get().estimated_page_size + GetRowSize(vector, vector_index, state) * vcount < + MAX_UNCOMPRESSED_PAGE_SIZE) { + // Fast path auto &page_info = page_info_ref.get(); - page_info.row_count++; - col_chunk.meta_data.num_values++; - if (parent && !parent->is_empty.empty() && parent->is_empty[parent_index + i]) { - page_info.empty_count++; - continue; - } - if (validity.RowIsValid(vector_index)) { - page_info.estimated_page_size += GetRowSize(vector, vector_index, state); - if (page_info.estimated_page_size >= 
MAX_UNCOMPRESSED_PAGE_SIZE) { - PageInformation new_info; - new_info.offset = page_info.offset + page_info.row_count; - state.page_info.push_back(new_info); - page_info_ref = state.page_info.back(); + page_info.row_count += vcount; + page_info.estimated_page_size += GetRowSize(vector, vector_index, state) * vcount; + } else { + for (idx_t i = 0; i < vcount; i++) { + auto &page_info = page_info_ref.get(); + page_info.row_count++; + if (check_parent_empty && parent->is_empty[parent_index + i]) { + page_info.empty_count++; + continue; + } + if (validity.RowIsValid(vector_index)) { + page_info.estimated_page_size += GetRowSize(vector, vector_index, state); + if (page_info.estimated_page_size >= MAX_UNCOMPRESSED_PAGE_SIZE) { + PageInformation new_info; + new_info.offset = page_info.offset + page_info.row_count; + state.page_info.push_back(new_info); + page_info_ref = state.page_info.back(); + } } + vector_index++; } - vector_index++; } } From 503d0c0f6fdb479bffb58847363f6ddeef7d8d54 Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Thu, 13 Feb 2025 15:39:06 +0100 Subject: [PATCH 070/142] add another fast path --- .../parquet/include/parquet_dlba_encoder.hpp | 2 +- .../writer/templated_column_writer.hpp | 22 ++++++++++++++----- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/extension/parquet/include/parquet_dlba_encoder.hpp b/extension/parquet/include/parquet_dlba_encoder.hpp index 5e39c5e1fea2..ef7d19f0cfcb 100644 --- a/extension/parquet/include/parquet_dlba_encoder.hpp +++ b/extension/parquet/include/parquet_dlba_encoder.hpp @@ -69,7 +69,7 @@ void WriteValue(DlbaEncoder &encoder, WriteStream &writer, const string_t &value // helpers to get size from strings template -static idx_t GetDlbaStringSize(const SRC &src_value) { +static idx_t GetDlbaStringSize(const SRC &) { return 0; } diff --git a/extension/parquet/include/writer/templated_column_writer.hpp b/extension/parquet/include/writer/templated_column_writer.hpp index 81aeb659652c..6e20c6dd6e23 
100644 --- a/extension/parquet/include/writer/templated_column_writer.hpp +++ b/extension/parquet/include/writer/templated_column_writer.hpp @@ -161,17 +161,27 @@ class StandardColumnWriter : public PrimitiveColumnWriter { const auto &validity = FlatVector::Validity(vector); - for (idx_t i = 0; i < vcount; i++) { - if (check_parent_empty && parent->is_empty[parent_index + i]) { - continue; - } - if (validity.RowIsValid(vector_index)) { + if (!check_parent_empty && validity.AllValid()) { + // Fast path + for (; vector_index < vcount; vector_index++) { const auto &src_value = data_ptr[vector_index]; state.dictionary.Insert(src_value); state.total_value_count++; state.total_string_size += dlba_encoder::GetDlbaStringSize(src_value); } - vector_index++; + } else { + for (idx_t i = 0; i < vcount; i++) { + if (check_parent_empty && parent->is_empty[parent_index + i]) { + continue; + } + if (validity.RowIsValid(vector_index)) { + const auto &src_value = data_ptr[vector_index]; + state.dictionary.Insert(src_value); + state.total_value_count++; + state.total_string_size += dlba_encoder::GetDlbaStringSize(src_value); + } + vector_index++; + } } } From 301cc5526d3628296bb448800b25270c689a5f20 Mon Sep 17 00:00:00 2001 From: Mytherin Date: Thu, 13 Feb 2025 17:12:06 +0100 Subject: [PATCH 071/142] WIP generalize rowid column to "virtual columns", and make "filename" one of these virtual columns --- extension/parquet/parquet_extension.cpp | 16 ++-- .../catalog_entry/table_catalog_entry.cpp | 6 ++ src/common/constants.cpp | 5 ++ src/common/multi_file_reader.cpp | 10 ++- src/execution/physical_plan/plan_get.cpp | 6 +- .../catalog_entry/table_catalog_entry.hpp | 7 +- src/include/duckdb/common/column_index.hpp | 3 + src/include/duckdb/common/constants.hpp | 2 + .../duckdb/common/multi_file_reader.hpp | 12 ++- src/include/duckdb/common/table_column.hpp | 26 ++++++ .../duckdb/function/table_function.hpp | 12 ++- .../duckdb/optimizer/late_materialization.hpp | 2 + 
src/include/duckdb/planner/bind_context.hpp | 2 +- .../duckdb/planner/operator/logical_get.hpp | 14 ++-- src/include/duckdb/planner/table_binding.hpp | 10 +-- src/optimizer/late_materialization.cpp | 25 +++--- src/planner/bind_context.cpp | 16 +++- src/planner/binder/statement/bind_delete.cpp | 9 ++- src/planner/binder/statement/bind_update.cpp | 9 ++- .../binder/tableref/bind_basetableref.cpp | 2 +- .../binder/tableref/bind_table_function.cpp | 9 ++- src/planner/operator/logical_get.cpp | 79 +++++++++++++------ src/planner/table_binding.cpp | 30 ++++--- .../copy/parquet/parquet_virtual_columns.test | 21 +++++ 24 files changed, 235 insertions(+), 98 deletions(-) create mode 100644 src/include/duckdb/common/table_column.hpp create mode 100644 test/sql/copy/parquet/parquet_virtual_columns.test diff --git a/extension/parquet/parquet_extension.cpp b/extension/parquet/parquet_extension.cpp index f93b3f04acbe..b7100cf7b363 100644 --- a/extension/parquet/parquet_extension.cpp +++ b/extension/parquet/parquet_extension.cpp @@ -450,11 +450,11 @@ class ParquetScanFunction { return nullptr; } - static unique_ptr ParquetScanBindInternal(ClientContext &context, - unique_ptr multi_file_reader, - shared_ptr file_list, - vector &return_types, vector &names, - ParquetOptions parquet_options) { + static unique_ptr + ParquetScanBindInternal(ClientContext &context, unique_ptr multi_file_reader, + shared_ptr file_list, vector &return_types, + vector &names, ParquetOptions parquet_options, + optional_ptr virtual_columns = nullptr) { auto result = make_uniq(); result->multi_file_reader = std::move(multi_file_reader); result->file_list = std::move(file_list); @@ -463,7 +463,7 @@ class ParquetScanFunction { if (result->multi_file_reader->Bind(parquet_options.file_options, *result->file_list, result->types, result->names, result->reader_bind)) { result->multi_file_reader->BindOptions(parquet_options.file_options, *result->file_list, result->types, - result->names, result->reader_bind); + 
result->names, result->reader_bind, virtual_columns); // Enable the parquet file_row_number on the parquet options if the file_row_number_idx was set if (result->reader_bind.file_row_number_idx != DConstants::INVALID_INDEX) { parquet_options.file_row_number = true; @@ -476,7 +476,7 @@ class ParquetScanFunction { parquet_options.file_options.AutoDetectHivePartitioning(*result->file_list, context); // Default bind result->reader_bind = result->multi_file_reader->BindReader( - context, result->types, result->names, *result->file_list, *result, parquet_options); + context, result->types, result->names, *result->file_list, *result, parquet_options, virtual_columns); } // Set the explicit cardinality if requested @@ -617,7 +617,7 @@ class ParquetScanFunction { auto file_list = multi_file_reader->CreateFileList(context, input.inputs[0]); return ParquetScanBindInternal(context, std::move(multi_file_reader), std::move(file_list), return_types, names, - parquet_options); + parquet_options, &input.virtual_columns); } static double ParquetProgress(ClientContext &context, const FunctionData *bind_data_p, diff --git a/src/catalog/catalog_entry/table_catalog_entry.cpp b/src/catalog/catalog_entry/table_catalog_entry.cpp index 3070b2e30d48..a7ca2fab69a3 100644 --- a/src/catalog/catalog_entry/table_catalog_entry.cpp +++ b/src/catalog/catalog_entry/table_catalog_entry.cpp @@ -332,4 +332,10 @@ bool TableCatalogEntry::HasPrimaryKey() const { return GetPrimaryKey() != nullptr; } +virtual_column_map_t TableCatalogEntry::GetVirtualColumns() const { + virtual_column_map_t virtual_columns; + virtual_columns.insert(make_pair(COLUMN_IDENTIFIER_ROW_ID, TableColumn("rowid", LogicalType::ROW_TYPE))); + return virtual_columns; +} + } // namespace duckdb diff --git a/src/common/constants.cpp b/src/common/constants.cpp index edafe6b67650..4db7245e235f 100644 --- a/src/common/constants.cpp +++ b/src/common/constants.cpp @@ -10,6 +10,7 @@ constexpr const idx_t DConstants::INVALID_INDEX; const row_t 
MAX_ROW_ID = 36028797018960000ULL; // 2^55 const row_t MAX_ROW_ID_LOCAL = 72057594037920000ULL; // 2^56 const column_t COLUMN_IDENTIFIER_ROW_ID = (column_t)-1; +const column_t VIRTUAL_COLUMN_START = UINT64_C(9223372036854775808); // 2^63 const double PI = 3.141592653589793; const transaction_t TRANSACTION_ID_START = 4611686018427388000ULL; // 2^62 @@ -56,4 +57,8 @@ bool IsRowIdColumnId(column_t column_id) { return column_id == COLUMN_IDENTIFIER_ROW_ID; } +bool IsVirtualColumn(column_t column_id) { + return column_id >= VIRTUAL_COLUMN_START; +} + } // namespace duckdb diff --git a/src/common/multi_file_reader.cpp b/src/common/multi_file_reader.cpp index be17aaf9516b..9016780e422b 100644 --- a/src/common/multi_file_reader.cpp +++ b/src/common/multi_file_reader.cpp @@ -15,6 +15,8 @@ namespace duckdb { +constexpr column_t MultiFileReader::COLUMN_IDENTIFIER_FILENAME; + MultiFileReaderGlobalState::~MultiFileReaderGlobalState() { } @@ -169,7 +171,8 @@ bool MultiFileReader::Bind(MultiFileReaderOptions &options, MultiFileList &files void MultiFileReader::BindOptions(MultiFileReaderOptions &options, MultiFileList &files, vector &return_types, vector &names, - MultiFileReaderBindData &bind_data) { + MultiFileReaderBindData &bind_data, + optional_ptr virtual_columns) { // Add generated constant column for filename if (options.filename) { if (std::find(names.begin(), names.end(), options.filename_column) != names.end()) { @@ -180,6 +183,10 @@ void MultiFileReader::BindOptions(MultiFileReaderOptions &options, MultiFileList bind_data.filename_idx = names.size(); return_types.emplace_back(LogicalType::VARCHAR); names.emplace_back(options.filename_column); + } else if (virtual_columns) { + // filename is not specified - add it to the virtual columns list + virtual_columns->insert(make_pair(COLUMN_IDENTIFIER_FILENAME, TableColumn("filename", LogicalType::VARCHAR))); + bind_data.filename_idx = COLUMN_IDENTIFIER_FILENAME; } // Add generated constant columns from hive partitioning 
scheme @@ -253,6 +260,7 @@ void MultiFileReader::FinalizeBind(const MultiFileReaderOptions &file_options, c auto &col_idx = global_column_ids[i]; if (col_idx.IsRowIdColumn()) { // row-id + // FIXME: this should probably be removed reader_data.constant_map.emplace_back(i, Value::BIGINT(42)); continue; } diff --git a/src/execution/physical_plan/plan_get.cpp b/src/execution/physical_plan/plan_get.cpp index 3b5d940eb924..843ad1537172 100644 --- a/src/execution/physical_plan/plan_get.cpp +++ b/src/execution/physical_plan/plan_get.cpp @@ -159,10 +159,8 @@ unique_ptr PhysicalPlanGenerator::CreatePlan(LogicalGet &op) { vector types; vector> expressions; for (auto &column_id : column_ids) { - if (column_id.IsRowIdColumn()) { - types.emplace_back(op.GetRowIdType()); - // Now how to make that a constant expression. - expressions.push_back(make_uniq(Value(op.GetRowIdType()))); + if (column_id.IsVirtualColumn()) { + throw NotImplementedException("Virtual columns require projection pushdown"); } else { auto col_id = column_id.GetPrimaryIndex(); auto type = op.returned_types[col_id]; diff --git a/src/include/duckdb/catalog/catalog_entry/table_catalog_entry.hpp b/src/include/duckdb/catalog/catalog_entry/table_catalog_entry.hpp index 398e49974b88..0adb7857b24d 100644 --- a/src/include/duckdb/catalog/catalog_entry/table_catalog_entry.hpp +++ b/src/include/duckdb/catalog/catalog_entry/table_catalog_entry.hpp @@ -18,6 +18,7 @@ #include "duckdb/common/case_insensitive_map.hpp" #include "duckdb/catalog/catalog_entry/table_column_type.hpp" #include "duckdb/catalog/catalog_entry/column_dependency_manager.hpp" +#include "duckdb/common/table_column.hpp" namespace duckdb { @@ -117,10 +118,8 @@ class TableCatalogEntry : public StandardEntry { //! Returns true, if the table has a primary key, else false. bool HasPrimaryKey() const; - //! Returns the rowid type of this table - virtual LogicalType GetRowIdType() const { - return LogicalType::ROW_TYPE; - } + //! 
Returns the virtual columns for this table + virtual virtual_column_map_t GetVirtualColumns() const; protected: //! A list of columns that are part of this table diff --git a/src/include/duckdb/common/column_index.hpp b/src/include/duckdb/common/column_index.hpp index 32d2e1828462..a563005f43f3 100644 --- a/src/include/duckdb/common/column_index.hpp +++ b/src/include/duckdb/common/column_index.hpp @@ -61,6 +61,9 @@ struct ColumnIndex { bool IsRowIdColumn() const { return index == DConstants::INVALID_INDEX; } + bool IsVirtualColumn() const { + return index >= VIRTUAL_COLUMN_START; + } void Serialize(Serializer &serializer) const; static ColumnIndex Deserialize(Deserializer &deserializer); diff --git a/src/include/duckdb/common/constants.hpp b/src/include/duckdb/common/constants.hpp index d4a0d7cda1c3..387dd4127579 100644 --- a/src/include/duckdb/common/constants.hpp +++ b/src/include/duckdb/common/constants.hpp @@ -40,7 +40,9 @@ DUCKDB_API bool IsInvalidCatalog(const string &str); //! Special value used to signify the ROW ID of a table DUCKDB_API extern const column_t COLUMN_IDENTIFIER_ROW_ID; +DUCKDB_API extern const column_t VIRTUAL_COLUMN_START; DUCKDB_API bool IsRowIdColumnId(column_t column_id); +DUCKDB_API bool IsVirtualColumn(column_t column_id); //! The maximum row identifier used in tables extern const row_t MAX_ROW_ID; diff --git a/src/include/duckdb/common/multi_file_reader.hpp b/src/include/duckdb/common/multi_file_reader.hpp index 942f72c1e277..c1a262fa875f 100644 --- a/src/include/duckdb/common/multi_file_reader.hpp +++ b/src/include/duckdb/common/multi_file_reader.hpp @@ -200,6 +200,10 @@ struct MultiFileReaderData { //! The MultiFileReader class provides a set of helper methods to handle scanning from multiple files struct MultiFileReader { +public: + static constexpr column_t COLUMN_IDENTIFIER_FILENAME = UINT64_C(9223372036854775808); + +public: virtual ~MultiFileReader(); //! 
Create a MultiFileReader for a specific TableFunction, using its function name for errors @@ -246,7 +250,8 @@ struct MultiFileReader { //! Bind the options of the multi-file reader, potentially emitting any extra columns that are required DUCKDB_API virtual void BindOptions(MultiFileReaderOptions &options, MultiFileList &files, vector &return_types, vector &names, - MultiFileReaderBindData &bind_data); + MultiFileReaderBindData &bind_data, + optional_ptr virtual_columns = nullptr); //! Initialize global state used by the MultiFileReader DUCKDB_API virtual unique_ptr @@ -317,7 +322,8 @@ struct MultiFileReader { template MultiFileReaderBindData BindReader(ClientContext &context, vector &return_types, vector &names, - MultiFileList &files, RESULT_CLASS &result, OPTIONS_CLASS &options) { + MultiFileList &files, RESULT_CLASS &result, OPTIONS_CLASS &options, + optional_ptr virtual_columns = nullptr) { if (options.file_options.union_by_name) { return BindUnionReader(context, return_types, names, files, result, options); } else { @@ -330,7 +336,7 @@ struct MultiFileReader { } result.Initialize(std::move(reader)); MultiFileReaderBindData bind_data; - BindOptions(options.file_options, files, return_types, names, bind_data); + BindOptions(options.file_options, files, return_types, names, bind_data, virtual_columns); return bind_data; } } diff --git a/src/include/duckdb/common/table_column.hpp b/src/include/duckdb/common/table_column.hpp new file mode 100644 index 000000000000..6cc4b8fe7e60 --- /dev/null +++ b/src/include/duckdb/common/table_column.hpp @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// duckdb/common/table_column.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "duckdb/common/types.hpp" +#include "duckdb/common/unordered_map.hpp" + +namespace duckdb { + +struct TableColumn { + TableColumn(string name_p, LogicalType type_p) 
: name(std::move(name_p)), type(std::move(type_p)) { + } + + string name; + LogicalType type; +}; + +using virtual_column_map_t = unordered_map; + +} // namespace duckdb diff --git a/src/include/duckdb/function/table_function.hpp b/src/include/duckdb/function/table_function.hpp index 6293100ef1d1..15625ff61e3a 100644 --- a/src/include/duckdb/function/table_function.hpp +++ b/src/include/duckdb/function/table_function.hpp @@ -12,11 +12,12 @@ #include "duckdb/common/optional_ptr.hpp" #include "duckdb/execution/execution_context.hpp" #include "duckdb/function/function.hpp" -#include "duckdb/planner/bind_context.hpp" #include "duckdb/planner/logical_operator.hpp" #include "duckdb/storage/statistics/node_statistics.hpp" #include "duckdb/common/column_index.hpp" +#include "duckdb/common/table_column.hpp" #include "duckdb/function/partition_stats.hpp" +#include "duckdb/common/exception/binder_exception.hpp" #include @@ -27,7 +28,9 @@ class LogicalDependencyList; class LogicalGet; class TableFunction; class TableFilterSet; +class TableFunctionRef; class TableCatalogEntry; +class SampleOptions; struct MultiFileReader; struct OperatorPartitionData; struct OperatorPartitionInfo; @@ -90,9 +93,11 @@ struct TableFunctionBindInput { TableFunctionBindInput(vector &inputs, named_parameter_map_t &named_parameters, vector &input_table_types, vector &input_table_names, optional_ptr info, optional_ptr binder, - TableFunction &table_function, const TableFunctionRef &ref) + TableFunction &table_function, const TableFunctionRef &ref, + virtual_column_map_t &virtual_columns) : inputs(inputs), named_parameters(named_parameters), input_table_types(input_table_types), - input_table_names(input_table_names), info(info), binder(binder), table_function(table_function), ref(ref) { + input_table_names(input_table_names), info(info), binder(binder), table_function(table_function), ref(ref), + virtual_columns(virtual_columns) { } vector &inputs; @@ -103,6 +108,7 @@ struct TableFunctionBindInput { 
optional_ptr binder; TableFunction &table_function; const TableFunctionRef &ref; + virtual_column_map_t &virtual_columns; }; struct TableFunctionInitInput { diff --git a/src/include/duckdb/optimizer/late_materialization.hpp b/src/include/duckdb/optimizer/late_materialization.hpp index 76f4f05e86cc..16350601a807 100644 --- a/src/include/duckdb/optimizer/late_materialization.hpp +++ b/src/include/duckdb/optimizer/late_materialization.hpp @@ -40,6 +40,8 @@ class LateMaterialization : public BaseColumnPruner { Optimizer &optimizer; //! The max row count for which we will consider late materialization idx_t max_row_count; + //! The type of the row id column + LogicalType row_id_type; }; } // namespace duckdb diff --git a/src/include/duckdb/planner/bind_context.hpp b/src/include/duckdb/planner/bind_context.hpp index 8234805e7ba9..ff070ceb49d6 100644 --- a/src/include/duckdb/planner/bind_context.hpp +++ b/src/include/duckdb/planner/bind_context.hpp @@ -100,7 +100,7 @@ class BindContext { //! Adds a call to a table function with the given alias to the BindContext. void AddTableFunction(idx_t index, const string &alias, const vector &names, const vector &types, vector &bound_column_ids, - optional_ptr entry); + optional_ptr entry, virtual_column_map_t virtual_columns); //! Adds a table view with a given alias to the BindContext. void AddView(idx_t index, const string &alias, SubqueryRef &ref, BoundQueryNode &subquery, ViewCatalogEntry &view); //! Adds a subquery with a given alias to the BindContext. 
diff --git a/src/include/duckdb/planner/operator/logical_get.hpp b/src/include/duckdb/planner/operator/logical_get.hpp index 81b93accb48b..65f90fe85ae2 100644 --- a/src/include/duckdb/planner/operator/logical_get.hpp +++ b/src/include/duckdb/planner/operator/logical_get.hpp @@ -24,7 +24,7 @@ class LogicalGet : public LogicalOperator { public: LogicalGet(idx_t table_index, TableFunction function, unique_ptr bind_data, vector returned_types, vector returned_names, - LogicalType rowid_type = LogicalType(LogicalType::ROW_TYPE)); + virtual_column_map_t virtual_columns = virtual_column_map_t()); //! The table index in the current bind context idx_t table_index; @@ -36,6 +36,8 @@ class LogicalGet : public LogicalOperator { vector returned_types; //! The names of ALL columns that can be returned by the table function vector names; + //! A mapping of column index -> type/name for all virtual columns + virtual_column_map_t virtual_columns; //! Columns that are used outside the scan vector projection_ids; //! Filters pushed down for table scan @@ -62,6 +64,9 @@ class LogicalGet : public LogicalOperator { //! Returns the underlying table that is being scanned, or nullptr if there is none optional_ptr GetTable() const; + const LogicalType &GetColumnType(const ColumnIndex &column_index) const; + const string &GetColumnName(const ColumnIndex &column_index) const; + public: void SetColumnIds(vector &&column_ids); void AddColumnId(column_t column_id); @@ -80,10 +85,6 @@ class LogicalGet : public LogicalOperator { void Serialize(Serializer &serializer) const override; static unique_ptr Deserialize(Deserializer &deserializer); - const LogicalType &GetRowIdType() const { - return rowid_type; - } - protected: void ResolveTypes() override; @@ -93,8 +94,5 @@ class LogicalGet : public LogicalOperator { private: //! Bound column IDs vector column_ids; - - //! 
The type of the rowid column - LogicalType rowid_type = LogicalType(LogicalType::ROW_TYPE); }; } // namespace duckdb diff --git a/src/include/duckdb/planner/table_binding.hpp b/src/include/duckdb/planner/table_binding.hpp index 50631f57ac45..9aedc7e70058 100644 --- a/src/include/duckdb/planner/table_binding.hpp +++ b/src/include/duckdb/planner/table_binding.hpp @@ -16,6 +16,7 @@ #include "duckdb/catalog/catalog_entry/table_column_type.hpp" #include "duckdb/planner/binding_alias.hpp" #include "duckdb/common/column_index.hpp" +#include "duckdb/common/table_column.hpp" namespace duckdb { class BindContext; @@ -33,8 +34,7 @@ enum class BindingType { BASE, TABLE, DUMMY, CATALOG_ENTRY }; //! A Binding represents a binding to a table, table-producing function or subquery with a specified table index. struct Binding { - Binding(BindingType binding_type, BindingAlias alias, vector types, vector names, idx_t index, - LogicalType rowid_type = LogicalType(LogicalType::ROW_TYPE)); + Binding(BindingType binding_type, BindingAlias alias, vector types, vector names, idx_t index); virtual ~Binding() = default; //! The type of Binding @@ -50,8 +50,6 @@ struct Binding { //! Name -> index for the names case_insensitive_map_t name_map; - LogicalType rowid_type; - public: bool TryGetBindingIndex(const string &column_name, column_t &column_index); column_t GetBindingIndex(const string &column_name); @@ -104,12 +102,14 @@ struct TableBinding : public Binding { public: TableBinding(const string &alias, vector types, vector names, vector &bound_column_ids, optional_ptr entry, idx_t index, - bool add_row_id = false); + virtual_column_map_t virtual_columns); //! A reference to the set of bound column ids vector &bound_column_ids; //! The underlying catalog entry (if any) optional_ptr entry; + //! 
Virtual columns + virtual_column_map_t virtual_columns; public: unique_ptr ExpandGeneratedColumn(const string &column_name); diff --git a/src/optimizer/late_materialization.cpp b/src/optimizer/late_materialization.cpp index aa4e5c4c1fe4..01f8118f5257 100644 --- a/src/optimizer/late_materialization.cpp +++ b/src/optimizer/late_materialization.cpp @@ -35,7 +35,7 @@ idx_t LateMaterialization::GetOrInsertRowId(LogicalGet &get) { get.projection_ids.push_back(column_ids.size() - 1); } if (!get.types.empty()) { - get.types.push_back(get.GetRowIdType()); + get.types.push_back(row_id_type); } return column_ids.size() - 1; } @@ -44,7 +44,7 @@ unique_ptr LateMaterialization::ConstructLHS(LogicalGet &get) { // we need to construct a new scan of the same table auto table_index = optimizer.binder.GenerateTableIndex(); auto new_get = make_uniq(table_index, get.function, get.bind_data->Copy(), get.returned_types, - get.names, get.GetRowIdType()); + get.names, get.virtual_columns); new_get->GetMutableColumnIds() = get.GetColumnIds(); new_get->projection_ids = get.projection_ids; return new_get; @@ -73,8 +73,7 @@ ColumnBinding LateMaterialization::ConstructRHS(unique_ptr &op) case LogicalOperatorType::LOGICAL_PROJECTION: { auto &proj = op.Cast(); // push a projection of the row-id column - proj.expressions.push_back( - make_uniq("rowid", get.GetRowIdType(), row_id_binding)); + proj.expressions.push_back(make_uniq("rowid", row_id_type, row_id_binding)); // modify the row-id-binding to push to the new projection row_id_binding = ColumnBinding(proj.table_index, proj.expressions.size() - 1); column_count = proj.expressions.size(); @@ -153,9 +152,8 @@ unique_ptr LateMaterialization::GetExpression(LogicalOperator &op, i case LogicalOperatorType::LOGICAL_GET: { auto &get = op.Cast(); auto &column_id = get.GetColumnIds()[column_index]; - auto is_row_id = column_id.IsRowIdColumn(); - auto column_name = is_row_id ? 
"rowid" : get.names[column_id.GetPrimaryIndex()]; - auto &column_type = is_row_id ? get.GetRowIdType() : get.returned_types[column_id.GetPrimaryIndex()]; + auto column_name = get.GetColumnName(column_id); + auto &column_type = get.GetColumnType(column_id); auto expr = make_uniq(column_name, column_type, ColumnBinding(get.table_index, column_index)); return std::move(expr); @@ -235,16 +233,17 @@ bool LateMaterialization::TryLateMaterialization(unique_ptr &op } } auto &get = child.get().Cast(); - auto table = get.GetTable(); - if (!table || !table->IsDuckTable()) { - // we can only do the late-materialization optimization for DuckDB tables currently - return false; - } if (column_references.size() >= get.GetColumnIds().size()) { // we do not benefit from late materialization // we need all of the columns to compute the root node anyway (Top-N/Limit/etc) return false; } + auto entry = get.virtual_columns.find(COLUMN_IDENTIFIER_ROW_ID); + if (entry == get.virtual_columns.end()) { + // we can only do the late-materialization optimization for tables that support the rowid column + return false; + } + row_id_type = entry->second.type; // we benefit from late materialization // we need to transform this plan into a semi-join with the row-id // we need to ensure the operator returns exactly the same column bindings as before @@ -258,8 +257,6 @@ bool LateMaterialization::TryLateMaterialization(unique_ptr &op auto lhs_row_idx = GetOrInsertRowId(lhs_get); ColumnBinding lhs_binding(lhs_index, lhs_row_idx); - auto &row_id_type = get.GetRowIdType(); - // after constructing the LHS but before constructing the RHS we construct the final projections/orders // - we do this before constructing the RHS because that alter the original plan vector> final_proj_list; diff --git a/src/planner/bind_context.cpp b/src/planner/bind_context.cpp index d135ed2ae604..d22fe4799b7b 100644 --- a/src/planner/bind_context.cpp +++ b/src/planner/bind_context.cpp @@ -607,20 +607,28 @@ void 
BindContext::AddBinding(unique_ptr binding) { void BindContext::AddBaseTable(idx_t index, const string &alias, const vector &names, const vector &types, vector &bound_column_ids, StandardEntry &entry, bool add_row_id) { - AddBinding(make_uniq(alias, types, names, bound_column_ids, &entry, index, add_row_id)); + virtual_column_map_t virtual_columns; + if (add_row_id) { + virtual_columns.insert(make_pair(COLUMN_IDENTIFIER_ROW_ID, TableColumn("rowid", LogicalType::ROW_TYPE))); + } + AddBinding( + make_uniq(alias, types, names, bound_column_ids, &entry, index, std::move(virtual_columns))); } void BindContext::AddBaseTable(idx_t index, const string &alias, const vector &names, const vector &types, vector &bound_column_ids, const string &table_name) { + virtual_column_map_t virtual_columns; + virtual_columns.insert(make_pair(COLUMN_IDENTIFIER_ROW_ID, TableColumn("rowid", LogicalType::ROW_TYPE))); AddBinding(make_uniq(alias.empty() ? table_name : alias, types, names, bound_column_ids, nullptr, - index, true)); + index, std::move(virtual_columns))); } void BindContext::AddTableFunction(idx_t index, const string &alias, const vector &names, const vector &types, vector &bound_column_ids, - optional_ptr entry) { - AddBinding(make_uniq(alias, types, names, bound_column_ids, entry, index)); + optional_ptr entry, virtual_column_map_t virtual_columns) { + AddBinding( + make_uniq(alias, types, names, bound_column_ids, entry, index, std::move(virtual_columns))); } static string AddColumnNameToBinding(const string &base_name, case_insensitive_set_t ¤t_names) { diff --git a/src/planner/binder/statement/bind_delete.cpp b/src/planner/binder/statement/bind_delete.cpp index 822f487edcd6..a27d5b9d5209 100644 --- a/src/planner/binder/statement/bind_delete.cpp +++ b/src/planner/binder/statement/bind_delete.cpp @@ -73,10 +73,15 @@ BoundStatement Binder::Bind(DeleteStatement &stmt) { del->bound_constraints = BindConstraints(table); del->AddChild(std::move(root)); + auto virtual_columns = 
table.GetVirtualColumns(); + auto row_id_entry = virtual_columns.find(COLUMN_IDENTIFIER_ROW_ID); + if (row_id_entry == virtual_columns.end()) { + throw InternalException("BindDelete could not find the row id column in the virtual columns list of the table"); + } // set up the delete expression auto &column_ids = get.GetColumnIds(); - del->expressions.push_back( - make_uniq(table.GetRowIdType(), ColumnBinding(get.table_index, column_ids.size()))); + del->expressions.push_back(make_uniq(row_id_entry->second.type, + ColumnBinding(get.table_index, column_ids.size()))); get.AddColumnId(COLUMN_IDENTIFIER_ROW_ID); if (!stmt.returning_list.empty()) { diff --git a/src/planner/binder/statement/bind_update.cpp b/src/planner/binder/statement/bind_update.cpp index 75dd39074e0a..eb0f0f7c5339 100644 --- a/src/planner/binder/statement/bind_update.cpp +++ b/src/planner/binder/statement/bind_update.cpp @@ -133,9 +133,14 @@ BoundStatement Binder::Bind(UpdateStatement &stmt) { table.BindUpdateConstraints(*this, *get, *proj, *update, context); // finally add the row id column to the projection list + auto virtual_columns = table.GetVirtualColumns(); + auto row_id_entry = virtual_columns.find(COLUMN_IDENTIFIER_ROW_ID); + if (row_id_entry == virtual_columns.end()) { + throw InternalException("BindDelete could not find the row id column in the virtual columns list of the table"); + } auto &column_ids = get->GetColumnIds(); - proj->expressions.push_back( - make_uniq(table.GetRowIdType(), ColumnBinding(get->table_index, column_ids.size()))); + proj->expressions.push_back(make_uniq( + row_id_entry->second.type, ColumnBinding(get->table_index, column_ids.size()))); get->AddColumnId(COLUMN_IDENTIFIER_ROW_ID); // set the projection as child of the update node and finalize the result diff --git a/src/planner/binder/tableref/bind_basetableref.cpp b/src/planner/binder/tableref/bind_basetableref.cpp index d0302a1086e5..8209b7ef85af 100644 --- a/src/planner/binder/tableref/bind_basetableref.cpp +++ 
b/src/planner/binder/tableref/bind_basetableref.cpp @@ -262,7 +262,7 @@ unique_ptr Binder::Bind(BaseTableRef &ref) { auto logical_get = make_uniq(table_index, scan_function, std::move(bind_data), std::move(return_types), - std::move(return_names), table.GetRowIdType()); + std::move(return_names), table.GetVirtualColumns()); auto table_entry = logical_get->GetTable(); auto &col_ids = logical_get->GetMutableColumnIds(); if (!table_entry) { diff --git a/src/planner/binder/tableref/bind_table_function.cpp b/src/planner/binder/tableref/bind_table_function.cpp index ace96207bf98..b17884c76532 100644 --- a/src/planner/binder/tableref/bind_table_function.cpp +++ b/src/planner/binder/tableref/bind_table_function.cpp @@ -197,9 +197,11 @@ unique_ptr Binder::BindTableFunctionInternal(TableFunction &tab unique_ptr bind_data; vector return_types; vector return_names; + unordered_map virtual_columns; if (table_function.bind || table_function.bind_replace) { TableFunctionBindInput bind_input(parameters, named_parameters, input_table_types, input_table_names, - table_function.function_info.get(), this, table_function, ref); + table_function.function_info.get(), this, table_function, ref, + virtual_columns); if (table_function.bind_replace) { auto new_plan = table_function.bind_replace(context, bind_input); if (new_plan) { @@ -237,7 +239,8 @@ unique_ptr Binder::BindTableFunctionInternal(TableFunction &tab } } - auto get = make_uniq(bind_index, table_function, std::move(bind_data), return_types, return_names); + auto get = make_uniq(bind_index, table_function, std::move(bind_data), return_types, return_names, + virtual_columns); get->parameters = parameters; get->named_parameters = named_parameters; get->input_table_types = input_table_types; @@ -249,7 +252,7 @@ unique_ptr Binder::BindTableFunctionInternal(TableFunction &tab } // now add the table function to the bind context so its columns can be bound bind_context.AddTableFunction(bind_index, function_name, return_names, 
return_types, get->GetMutableColumnIds(), - get->GetTable().get()); + get->GetTable().get(), std::move(virtual_columns)); return std::move(get); } diff --git a/src/planner/operator/logical_get.cpp b/src/planner/operator/logical_get.cpp index be7b5aa5d796..37e475e66a89 100644 --- a/src/planner/operator/logical_get.cpp +++ b/src/planner/operator/logical_get.cpp @@ -17,10 +17,11 @@ LogicalGet::LogicalGet() : LogicalOperator(LogicalOperatorType::LOGICAL_GET) { } LogicalGet::LogicalGet(idx_t table_index, TableFunction function, unique_ptr bind_data, - vector returned_types, vector returned_names, LogicalType rowid_type) + vector returned_types, vector returned_names, + virtual_column_map_t virtual_columns_p) : LogicalOperator(LogicalOperatorType::LOGICAL_GET), table_index(table_index), function(std::move(function)), bind_data(std::move(bind_data)), returned_types(std::move(returned_types)), names(std::move(returned_names)), - extra_info(), rowid_type(std::move(rowid_type)) { + virtual_columns(std::move(virtual_columns_p)), extra_info() { } optional_ptr LogicalGet::GetTable() const { @@ -118,6 +119,28 @@ vector LogicalGet::GetColumnBindings() { return result; } +const LogicalType &LogicalGet::GetColumnType(const ColumnIndex &index) const { + if (index.IsVirtualColumn()) { + auto entry = virtual_columns.find(index.GetPrimaryIndex()); + if (entry == virtual_columns.end()) { + throw InternalException("Failed to find referenced virtual column %d", index.GetPrimaryIndex()); + } + return entry->second.type; + } + return returned_types[index.GetPrimaryIndex()]; +} + +const string &LogicalGet::GetColumnName(const ColumnIndex &index) const { + if (index.IsVirtualColumn()) { + auto entry = virtual_columns.find(index.GetPrimaryIndex()); + if (entry == virtual_columns.end()) { + throw InternalException("Failed to find referenced virtual column %d", index.GetPrimaryIndex()); + } + return entry->second.name; + } + return names[index.GetPrimaryIndex()]; +} + void 
LogicalGet::ResolveTypes() { if (column_ids.empty()) { column_ids.emplace_back(COLUMN_IDENTIFIER_ROW_ID); @@ -125,20 +148,12 @@ void LogicalGet::ResolveTypes() { types.clear(); if (projection_ids.empty()) { for (auto &index : column_ids) { - if (index.IsRowIdColumn()) { - types.emplace_back(LogicalType(rowid_type)); - } else { - types.push_back(returned_types[index.GetPrimaryIndex()]); - } + types.push_back(GetColumnType(index)); } } else { for (auto &proj_index : projection_ids) { auto &index = column_ids[proj_index]; - if (index.IsRowIdColumn()) { - types.emplace_back(LogicalType(rowid_type)); - } else { - types.push_back(returned_types[index.GetPrimaryIndex()]); - } + types.push_back(GetColumnType(index)); } } if (!projected_input.empty()) { @@ -227,9 +242,10 @@ unique_ptr LogicalGet::Deserialize(Deserializer &deserializer) } if (!has_serialize) { TableFunctionRef empty_ref; + unordered_map virtual_columns; TableFunctionBindInput input(result->parameters, result->named_parameters, result->input_table_types, result->input_table_names, function.function_info.get(), nullptr, result->function, - empty_ref); + empty_ref, virtual_columns); vector bind_return_types; vector bind_names; @@ -239,20 +255,35 @@ unique_ptr LogicalGet::Deserialize(Deserializer &deserializer) bind_data = function.bind(deserializer.Get(), input, bind_return_types, bind_names); for (auto &col_id : result->column_ids) { - if (col_id.IsRowIdColumn()) { - // rowid - continue; - } - auto idx = col_id.GetPrimaryIndex(); - auto &ret_type = result->returned_types[idx]; - auto &col_name = result->names[idx]; - if (bind_return_types[idx] != ret_type) { - throw SerializationException("Table function deserialization failure in function \"%s\" - column with " - "name %s was serialized with type %s, but now has type %s", - function.name, col_name, ret_type, bind_return_types[idx]); + if (col_id.IsVirtualColumn()) { + auto idx = col_id.GetPrimaryIndex(); + auto ventry = virtual_columns.find(idx); + if (ventry 
== virtual_columns.end()) { + throw SerializationException( + "Table function deserialization failure - could not find virtual column with id %d", idx); + } + auto &ret_type = ventry->second.type; + auto &col_name = ventry->second.name; + if (bind_return_types[idx] != ret_type) { + throw SerializationException( + "Table function deserialization failure in function \"%s\" - virtual column with " + "name %s was serialized with type %s, but now has type %s", + function.name, col_name, ret_type, bind_return_types[idx]); + } + } else { + auto idx = col_id.GetPrimaryIndex(); + auto &ret_type = result->returned_types[idx]; + auto &col_name = result->names[idx]; + if (bind_return_types[idx] != ret_type) { + throw SerializationException( + "Table function deserialization failure in function \"%s\" - column with " + "name %s was serialized with type %s, but now has type %s", + function.name, col_name, ret_type, bind_return_types[idx]); + } } } result->returned_types = std::move(bind_return_types); + result->virtual_columns = std::move(virtual_columns); } result->bind_data = std::move(bind_data); return std::move(result); diff --git a/src/planner/table_binding.cpp b/src/planner/table_binding.cpp index 455834a4bb0d..934814ec7516 100644 --- a/src/planner/table_binding.cpp +++ b/src/planner/table_binding.cpp @@ -16,9 +16,9 @@ namespace duckdb { Binding::Binding(BindingType binding_type, BindingAlias alias_p, vector coltypes, vector colnames, - idx_t index, LogicalType rowid_type) + idx_t index) : binding_type(binding_type), alias(std::move(alias_p)), index(index), types(std::move(coltypes)), - names(std::move(colnames)), rowid_type(std::move(rowid_type)) { + names(std::move(colnames)) { D_ASSERT(types.size() == names.size()); for (idx_t i = 0; i < names.size(); i++) { auto &name = names[i]; @@ -114,13 +114,19 @@ optional_ptr EntryBinding::GetStandardEntry() { TableBinding::TableBinding(const string &alias, vector types_p, vector names_p, vector &bound_column_ids, optional_ptr 
entry, idx_t index, - bool add_row_id) - : Binding(BindingType::TABLE, GetAlias(alias, entry), std::move(types_p), std::move(names_p), index, - (add_row_id && entry) ? entry->Cast().GetRowIdType() : LogicalType::ROW_TYPE), - bound_column_ids(bound_column_ids), entry(entry) { - if (add_row_id) { - if (name_map.find("rowid") == name_map.end()) { - name_map["rowid"] = COLUMN_IDENTIFIER_ROW_ID; + virtual_column_map_t virtual_columns_p) + : Binding(BindingType::TABLE, GetAlias(alias, entry), std::move(types_p), std::move(names_p), index), + bound_column_ids(bound_column_ids), entry(entry), virtual_columns(std::move(virtual_columns_p)) { + for (auto &ventry : virtual_columns) { + auto idx = ventry.first; + auto &name = ventry.second.name; + if (idx < VIRTUAL_COLUMN_START) { + throw BinderException( + "Virtual column index must be larger than VIRTUAL_COLUMN_START - found %d for column \"%s\"", idx, + name); + } + if (name_map.find(name) == name_map.end()) { + name_map[name] = idx; } } } @@ -238,8 +244,10 @@ BindResult TableBinding::Bind(ColumnRefExpression &colref, idx_t depth) { } // fetch the type of the column LogicalType col_type; - if (column_index == COLUMN_IDENTIFIER_ROW_ID) { - col_type = LogicalType(rowid_type); + auto ventry = virtual_columns.find(column_index); + if (ventry != virtual_columns.end()) { + // virtual column - fetch type from there + col_type = ventry->second.type; } else { // normal column: fetch type from base column col_type = types[column_index]; diff --git a/test/sql/copy/parquet/parquet_virtual_columns.test b/test/sql/copy/parquet/parquet_virtual_columns.test new file mode 100644 index 000000000000..1abddbe36928 --- /dev/null +++ b/test/sql/copy/parquet/parquet_virtual_columns.test @@ -0,0 +1,21 @@ +# name: test/sql/copy/parquet/parquet_virtual_columns.test +# description: Test virtual columns +# group: [parquet] + +require parquet + +# Filename without the filename option +query III +select i, j, replace(filename, '\', '/') from 
'data/parquet-testing/glob*/t?.parquet' order by i; +---- +1 a data/parquet-testing/glob/t1.parquet +2 b data/parquet-testing/glob/t2.parquet +3 c data/parquet-testing/glob2/t1.parquet + +# not projected in * +query II +select * from 'data/parquet-testing/glob*/t?.parquet' order by i; +---- +1 a +2 b +3 c From b3a67cd3aeb7b8b5dce45eb5f922963097314c5d Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 11 Feb 2025 10:33:21 +0100 Subject: [PATCH 072/142] Backport #16115 --- .../duckdb/optimizer/optimizer_extension.hpp | 16 +++++++++++----- src/optimizer/optimizer.cpp | 13 ++++++++++++- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/src/include/duckdb/optimizer/optimizer_extension.hpp b/src/include/duckdb/optimizer/optimizer_extension.hpp index 6ccc6277c1a8..d2202fcb11a1 100644 --- a/src/include/duckdb/optimizer/optimizer_extension.hpp +++ b/src/include/duckdb/optimizer/optimizer_extension.hpp @@ -29,14 +29,20 @@ struct OptimizerExtensionInput { }; typedef void (*optimize_function_t)(OptimizerExtensionInput &input, unique_ptr &plan); +typedef void (*pre_optimize_function_t)(OptimizerExtensionInput &input, unique_ptr &plan); class OptimizerExtension { public: - //! The parse function of the parser extension. - //! Takes a query string as input and returns ParserExtensionParseData (on success) or an error - optimize_function_t optimize_function; - - //! Additional parser info passed to the parse function + //! The optimize function of the optimizer extension. + //! Takes a logical query plan as an input, which it can modify in place + //! This runs, after the DuckDB optimizers have run + optimize_function_t optimize_function = nullptr; + //! The pre-optimize function of the optimizer extension. + //! Takes a logical query plan as an input, which it can modify in place + //! This runs, before the DuckDB optimizers have run + pre_optimize_function_t pre_optimize_function = nullptr; + + //! 
Additional optimizer info passed to the optimize functions shared_ptr optimizer_info; }; diff --git a/src/optimizer/optimizer.cpp b/src/optimizer/optimizer.cpp index 8ac4cdd87da8..dc1ddfa59224 100644 --- a/src/optimizer/optimizer.cpp +++ b/src/optimizer/optimizer.cpp @@ -272,12 +272,23 @@ unique_ptr Optimizer::Optimize(unique_ptr plan this->plan = std::move(plan_p); + for (auto &pre_optimizer_extension : DBConfig::GetConfig(context).optimizer_extensions) { + RunOptimizer(OptimizerType::EXTENSION, [&]() { + OptimizerExtensionInput input {GetContext(), *this, pre_optimizer_extension.optimizer_info.get()}; + if (pre_optimizer_extension.pre_optimize_function) { + pre_optimizer_extension.pre_optimize_function(input, plan); + } + }); + } + RunBuiltInOptimizers(); for (auto &optimizer_extension : DBConfig::GetConfig(context).optimizer_extensions) { RunOptimizer(OptimizerType::EXTENSION, [&]() { OptimizerExtensionInput input {GetContext(), *this, optimizer_extension.optimizer_info.get()}; - optimizer_extension.optimize_function(input, plan); + if (optimizer_extension.optimize_function) { + optimizer_extension.optimize_function(input, plan); + } }); } From 32a0f490f146b367dca05cd9d0db4c59e6a36d77 Mon Sep 17 00:00:00 2001 From: Mytherin Date: Thu, 13 Feb 2025 19:36:05 +0100 Subject: [PATCH 073/142] Move get_virtual_columns to a separate table function instead of trying to move it into the bind --- extension/parquet/parquet_extension.cpp | 24 ++++++++++++------- src/common/multi_file_reader.cpp | 21 +++++++--------- src/function/table_function.cpp | 11 +++++---- .../duckdb/common/multi_file_reader.hpp | 11 +++++---- .../duckdb/function/table_function.hpp | 12 ++++++---- .../binder/tableref/bind_table_function.cpp | 8 ++++--- src/planner/operator/logical_get.cpp | 24 +++++++++++++++---- 7 files changed, 68 insertions(+), 43 deletions(-) diff --git a/extension/parquet/parquet_extension.cpp b/extension/parquet/parquet_extension.cpp index b7100cf7b363..294e35c021a2 100644 --- 
a/extension/parquet/parquet_extension.cpp +++ b/extension/parquet/parquet_extension.cpp @@ -359,6 +359,13 @@ TablePartitionInfo ParquetGetPartitionInfo(ClientContext &context, TableFunction return parquet_bind.multi_file_reader->GetPartitionInfo(context, parquet_bind.reader_bind, input); } +virtual_column_map_t ParquetGetVirtualColumns(ClientContext &context, optional_ptr bind_data) { + auto &parquet_bind = bind_data->Cast(); + virtual_column_map_t result; + parquet_bind.multi_file_reader->GetVirtualColumns(context, parquet_bind.reader_bind, result); + return result; +} + class ParquetScanFunction { public: static TableFunctionSet GetFunctionSet() { @@ -384,6 +391,7 @@ class ParquetScanFunction { table_function.filter_prune = true; table_function.pushdown_complex_filter = ParquetComplexFilterPushdown; table_function.get_partition_info = ParquetGetPartitionInfo; + table_function.get_virtual_columns = ParquetGetVirtualColumns; MultiFileReader::AddParameters(table_function); @@ -450,11 +458,11 @@ class ParquetScanFunction { return nullptr; } - static unique_ptr - ParquetScanBindInternal(ClientContext &context, unique_ptr multi_file_reader, - shared_ptr file_list, vector &return_types, - vector &names, ParquetOptions parquet_options, - optional_ptr virtual_columns = nullptr) { + static unique_ptr ParquetScanBindInternal(ClientContext &context, + unique_ptr multi_file_reader, + shared_ptr file_list, + vector &return_types, vector &names, + ParquetOptions parquet_options) { auto result = make_uniq(); result->multi_file_reader = std::move(multi_file_reader); result->file_list = std::move(file_list); @@ -463,7 +471,7 @@ class ParquetScanFunction { if (result->multi_file_reader->Bind(parquet_options.file_options, *result->file_list, result->types, result->names, result->reader_bind)) { result->multi_file_reader->BindOptions(parquet_options.file_options, *result->file_list, result->types, - result->names, result->reader_bind, virtual_columns); + result->names, 
result->reader_bind); // Enable the parquet file_row_number on the parquet options if the file_row_number_idx was set if (result->reader_bind.file_row_number_idx != DConstants::INVALID_INDEX) { parquet_options.file_row_number = true; @@ -476,7 +484,7 @@ class ParquetScanFunction { parquet_options.file_options.AutoDetectHivePartitioning(*result->file_list, context); // Default bind result->reader_bind = result->multi_file_reader->BindReader( - context, result->types, result->names, *result->file_list, *result, parquet_options, virtual_columns); + context, result->types, result->names, *result->file_list, *result, parquet_options); } // Set the explicit cardinality if requested @@ -617,7 +625,7 @@ class ParquetScanFunction { auto file_list = multi_file_reader->CreateFileList(context, input.inputs[0]); return ParquetScanBindInternal(context, std::move(multi_file_reader), std::move(file_list), return_types, names, - parquet_options, &input.virtual_columns); + parquet_options); } static double ParquetProgress(ClientContext &context, const FunctionData *bind_data_p, diff --git a/src/common/multi_file_reader.cpp b/src/common/multi_file_reader.cpp index 9016780e422b..8a819505d86e 100644 --- a/src/common/multi_file_reader.cpp +++ b/src/common/multi_file_reader.cpp @@ -171,8 +171,7 @@ bool MultiFileReader::Bind(MultiFileReaderOptions &options, MultiFileList &files void MultiFileReader::BindOptions(MultiFileReaderOptions &options, MultiFileList &files, vector &return_types, vector &names, - MultiFileReaderBindData &bind_data, - optional_ptr virtual_columns) { + MultiFileReaderBindData &bind_data) { // Add generated constant column for filename if (options.filename) { if (std::find(names.begin(), names.end(), options.filename_column) != names.end()) { @@ -183,10 +182,6 @@ void MultiFileReader::BindOptions(MultiFileReaderOptions &options, MultiFileList bind_data.filename_idx = names.size(); return_types.emplace_back(LogicalType::VARCHAR); 
names.emplace_back(options.filename_column); - } else if (virtual_columns) { - // filename is not specified - add it to the virtual columns list - virtual_columns->insert(make_pair(COLUMN_IDENTIFIER_FILENAME, TableColumn("filename", LogicalType::VARCHAR))); - bind_data.filename_idx = COLUMN_IDENTIFIER_FILENAME; } // Add generated constant columns from hive partitioning scheme @@ -242,6 +237,14 @@ void MultiFileReader::BindOptions(MultiFileReaderOptions &options, MultiFileList } } +void MultiFileReader::GetVirtualColumns(ClientContext &context, MultiFileReaderBindData &bind_data, + virtual_column_map_t &result) { + if (bind_data.filename_idx == DConstants::INVALID_INDEX) { + bind_data.filename_idx = COLUMN_IDENTIFIER_FILENAME; + result.insert(make_pair(COLUMN_IDENTIFIER_FILENAME, TableColumn("filename", LogicalType::VARCHAR))); + } +} + void MultiFileReader::FinalizeBind(const MultiFileReaderOptions &file_options, const MultiFileReaderBindData &options, const string &filename, const vector &local_columns, const vector &global_columns, @@ -258,12 +261,6 @@ void MultiFileReader::FinalizeBind(const MultiFileReaderOptions &file_options, c } for (idx_t i = 0; i < global_column_ids.size(); i++) { auto &col_idx = global_column_ids[i]; - if (col_idx.IsRowIdColumn()) { - // row-id - // FIXME: this should probably be removed - reader_data.constant_map.emplace_back(i, Value::BIGINT(42)); - continue; - } auto column_id = col_idx.GetPrimaryIndex(); if (column_id == options.filename_idx) { // filename diff --git a/src/function/table_function.cpp b/src/function/table_function.cpp index 6e9df81943ae..cb8af192017c 100644 --- a/src/function/table_function.cpp +++ b/src/function/table_function.cpp @@ -22,8 +22,9 @@ TableFunction::TableFunction(string name, vector arguments, table_f in_out_function_final(nullptr), statistics(nullptr), dependency(nullptr), cardinality(nullptr), pushdown_complex_filter(nullptr), to_string(nullptr), table_scan_progress(nullptr), 
get_partition_data(nullptr), get_bind_info(nullptr), type_pushdown(nullptr), get_multi_file_reader(nullptr), supports_pushdown_type(nullptr), - get_partition_info(nullptr), get_partition_stats(nullptr), serialize(nullptr), deserialize(nullptr), - projection_pushdown(false), filter_pushdown(false), filter_prune(false), sampling_pushdown(false) { + get_partition_info(nullptr), get_partition_stats(nullptr), get_virtual_columns(nullptr), serialize(nullptr), + deserialize(nullptr), projection_pushdown(false), filter_pushdown(false), filter_prune(false), + sampling_pushdown(false) { } TableFunction::TableFunction(const vector &arguments, table_function_t function, @@ -36,9 +37,9 @@ TableFunction::TableFunction() init_local(nullptr), function(nullptr), in_out_function(nullptr), statistics(nullptr), dependency(nullptr), cardinality(nullptr), pushdown_complex_filter(nullptr), to_string(nullptr), table_scan_progress(nullptr), get_partition_data(nullptr), get_bind_info(nullptr), type_pushdown(nullptr), get_multi_file_reader(nullptr), - supports_pushdown_type(nullptr), get_partition_info(nullptr), get_partition_stats(nullptr), serialize(nullptr), - deserialize(nullptr), projection_pushdown(false), filter_pushdown(false), filter_prune(false), - sampling_pushdown(false) { + supports_pushdown_type(nullptr), get_partition_info(nullptr), get_partition_stats(nullptr), + get_virtual_columns(nullptr), serialize(nullptr), deserialize(nullptr), projection_pushdown(false), + filter_pushdown(false), filter_prune(false), sampling_pushdown(false) { } bool TableFunction::Equal(const TableFunction &rhs) const { diff --git a/src/include/duckdb/common/multi_file_reader.hpp b/src/include/duckdb/common/multi_file_reader.hpp index c1a262fa875f..b6716856dda0 100644 --- a/src/include/duckdb/common/multi_file_reader.hpp +++ b/src/include/duckdb/common/multi_file_reader.hpp @@ -250,8 +250,7 @@ struct MultiFileReader { //! 
Bind the options of the multi-file reader, potentially emitting any extra columns that are required DUCKDB_API virtual void BindOptions(MultiFileReaderOptions &options, MultiFileList &files, vector &return_types, vector &names, - MultiFileReaderBindData &bind_data, - optional_ptr virtual_columns = nullptr); + MultiFileReaderBindData &bind_data); //! Initialize global state used by the MultiFileReader DUCKDB_API virtual unique_ptr @@ -294,6 +293,9 @@ struct MultiFileReader { const OperatorPartitionInfo &partition_info, OperatorPartitionData &partition_data); + DUCKDB_API virtual void GetVirtualColumns(ClientContext &context, MultiFileReaderBindData &bind_data, + virtual_column_map_t &result); + template MultiFileReaderBindData BindUnionReader(ClientContext &context, vector &return_types, vector &names, MultiFileList &files, RESULT_CLASS &result, @@ -322,8 +324,7 @@ struct MultiFileReader { template MultiFileReaderBindData BindReader(ClientContext &context, vector &return_types, vector &names, - MultiFileList &files, RESULT_CLASS &result, OPTIONS_CLASS &options, - optional_ptr virtual_columns = nullptr) { + MultiFileList &files, RESULT_CLASS &result, OPTIONS_CLASS &options) { if (options.file_options.union_by_name) { return BindUnionReader(context, return_types, names, files, result, options); } else { @@ -336,7 +337,7 @@ struct MultiFileReader { } result.Initialize(std::move(reader)); MultiFileReaderBindData bind_data; - BindOptions(options.file_options, files, return_types, names, bind_data, virtual_columns); + BindOptions(options.file_options, files, return_types, names, bind_data); return bind_data; } } diff --git a/src/include/duckdb/function/table_function.hpp b/src/include/duckdb/function/table_function.hpp index 15625ff61e3a..0ac43fe73812 100644 --- a/src/include/duckdb/function/table_function.hpp +++ b/src/include/duckdb/function/table_function.hpp @@ -93,11 +93,9 @@ struct TableFunctionBindInput { TableFunctionBindInput(vector &inputs, named_parameter_map_t 
&named_parameters, vector &input_table_types, vector &input_table_names, optional_ptr info, optional_ptr binder, - TableFunction &table_function, const TableFunctionRef &ref, - virtual_column_map_t &virtual_columns) + TableFunction &table_function, const TableFunctionRef &ref) : inputs(inputs), named_parameters(named_parameters), input_table_types(input_table_types), - input_table_names(input_table_names), info(info), binder(binder), table_function(table_function), ref(ref), - virtual_columns(virtual_columns) { + input_table_names(input_table_names), info(info), binder(binder), table_function(table_function), ref(ref) { } vector &inputs; @@ -108,7 +106,6 @@ struct TableFunctionBindInput { optional_ptr binder; TableFunction &table_function; const TableFunctionRef &ref; - virtual_column_map_t &virtual_columns; }; struct TableFunctionInitInput { @@ -298,6 +295,9 @@ typedef TablePartitionInfo (*table_function_get_partition_info_t)(ClientContext typedef vector (*table_function_get_partition_stats_t)(ClientContext &context, GetPartitionStatsInput &input); +typedef virtual_column_map_t (*table_function_get_virtual_columns_t)(ClientContext &context, + optional_ptr bind_data); + //! When to call init_global to initialize the table function enum class TableFunctionInitialization { INITIALIZE_ON_EXECUTE, INITIALIZE_ON_SCHEDULE }; @@ -366,6 +366,8 @@ class TableFunction : public SimpleNamedParameterFunction { // NOLINT: work-arou table_function_get_partition_info_t get_partition_info; //! (Optional) get a list of all the partition stats of the table table_function_get_partition_stats_t get_partition_stats; + //! 
(Optional) returns a list of virtual columns emitted by the table function + table_function_get_virtual_columns_t get_virtual_columns; table_function_serialize_t serialize; table_function_deserialize_t deserialize; diff --git a/src/planner/binder/tableref/bind_table_function.cpp b/src/planner/binder/tableref/bind_table_function.cpp index b17884c76532..dbb162495088 100644 --- a/src/planner/binder/tableref/bind_table_function.cpp +++ b/src/planner/binder/tableref/bind_table_function.cpp @@ -197,11 +197,9 @@ unique_ptr Binder::BindTableFunctionInternal(TableFunction &tab unique_ptr bind_data; vector return_types; vector return_names; - unordered_map virtual_columns; if (table_function.bind || table_function.bind_replace) { TableFunctionBindInput bind_input(parameters, named_parameters, input_table_types, input_table_names, - table_function.function_info.get(), this, table_function, ref, - virtual_columns); + table_function.function_info.get(), this, table_function, ref); if (table_function.bind_replace) { auto new_plan = table_function.bind_replace(context, bind_input); if (new_plan) { @@ -238,6 +236,10 @@ unique_ptr Binder::BindTableFunctionInternal(TableFunction &tab return_names[i] = "C" + to_string(i); } } + virtual_column_map_t virtual_columns; + if (table_function.get_virtual_columns) { + virtual_columns = table_function.get_virtual_columns(context, bind_data.get()); + } auto get = make_uniq(bind_index, table_function, std::move(bind_data), return_types, return_names, virtual_columns); diff --git a/src/planner/operator/logical_get.cpp b/src/planner/operator/logical_get.cpp index 37e475e66a89..c15fde6b923a 100644 --- a/src/planner/operator/logical_get.cpp +++ b/src/planner/operator/logical_get.cpp @@ -143,7 +143,15 @@ const string &LogicalGet::GetColumnName(const ColumnIndex &index) const { void LogicalGet::ResolveTypes() { if (column_ids.empty()) { - column_ids.emplace_back(COLUMN_IDENTIFIER_ROW_ID); + // no projection - we need to push a column + auto entry = 
virtual_columns.find(COLUMN_IDENTIFIER_ROW_ID); + if (entry != virtual_columns.end()) { + // push the rowid column if the projection supports it + column_ids.emplace_back(COLUMN_IDENTIFIER_ROW_ID); + } else { + // otherwise push the first column + column_ids.emplace_back(0); + } } types.clear(); if (projection_ids.empty()) { @@ -240,19 +248,23 @@ unique_ptr LogicalGet::Deserialize(Deserializer &deserializer) result->column_ids.emplace_back(col_id); } } + auto &context = deserializer.Get(); + virtual_column_map_t virtual_columns; if (!has_serialize) { TableFunctionRef empty_ref; - unordered_map virtual_columns; TableFunctionBindInput input(result->parameters, result->named_parameters, result->input_table_types, result->input_table_names, function.function_info.get(), nullptr, result->function, - empty_ref, virtual_columns); + empty_ref); vector bind_return_types; vector bind_names; if (!function.bind) { throw InternalException("Table function \"%s\" has neither bind nor (de)serialize", function.name); } - bind_data = function.bind(deserializer.Get(), input, bind_return_types, bind_names); + bind_data = function.bind(context, input, bind_return_types, bind_names); + if (function.get_virtual_columns) { + virtual_columns = function.get_virtual_columns(context, bind_data.get()); + } for (auto &col_id : result->column_ids) { if (col_id.IsVirtualColumn()) { @@ -283,8 +295,10 @@ unique_ptr LogicalGet::Deserialize(Deserializer &deserializer) } } result->returned_types = std::move(bind_return_types); - result->virtual_columns = std::move(virtual_columns); + } else if (function.get_virtual_columns) { + virtual_columns = function.get_virtual_columns(context, bind_data.get()); } + result->virtual_columns = std::move(virtual_columns); result->bind_data = std::move(bind_data); return std::move(result); } From cd514e9c7e3dd0e8702ed490958f7bc1e0d32fa3 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 13 Feb 2025 20:03:05 +0100 Subject: [PATCH 074/142] format-fix --- 
src/execution/operator/helper/physical_streaming_sample.cpp | 4 ++-- src/execution/physical_plan/plan_sample.cpp | 3 +-- .../execution/operator/helper/physical_streaming_sample.hpp | 3 +-- .../filter_pushdown/prepared_statement_in_pushdown.test | 0 test/sql/sample/bernoulli_sampling.test | 1 - 5 files changed, 4 insertions(+), 7 deletions(-) delete mode 100644 test/optimizer/filter_pushdown/prepared_statement_in_pushdown.test diff --git a/src/execution/operator/helper/physical_streaming_sample.cpp b/src/execution/operator/helper/physical_streaming_sample.cpp index 1062deb27ed7..721717989f88 100644 --- a/src/execution/operator/helper/physical_streaming_sample.cpp +++ b/src/execution/operator/helper/physical_streaming_sample.cpp @@ -6,9 +6,9 @@ namespace duckdb { PhysicalStreamingSample::PhysicalStreamingSample(vector types, unique_ptr options, - idx_t estimated_cardinality) + idx_t estimated_cardinality) : PhysicalOperator(PhysicalOperatorType::STREAMING_SAMPLE, std::move(types), estimated_cardinality), - sample_options(std::move(options)) { + sample_options(std::move(options)) { percentage = sample_options->sample_size.GetValue() / 100; } diff --git a/src/execution/physical_plan/plan_sample.cpp b/src/execution/physical_plan/plan_sample.cpp index 2ccfacb8ac8c..883c7055d46f 100644 --- a/src/execution/physical_plan/plan_sample.cpp +++ b/src/execution/physical_plan/plan_sample.cpp @@ -28,8 +28,7 @@ unique_ptr PhysicalPlanGenerator::CreatePlan(LogicalSample &op "reservoir sampling or use a sample_size", EnumUtil::ToString(op.sample_options->method)); } - sample = make_uniq( - op.types, std::move(op.sample_options), op.estimated_cardinality); + sample = make_uniq(op.types, std::move(op.sample_options), op.estimated_cardinality); break; default: throw InternalException("Unimplemented sample method"); diff --git a/src/include/duckdb/execution/operator/helper/physical_streaming_sample.hpp b/src/include/duckdb/execution/operator/helper/physical_streaming_sample.hpp index 
68df848fec9f..6f75b2cf1964 100644 --- a/src/include/duckdb/execution/operator/helper/physical_streaming_sample.hpp +++ b/src/include/duckdb/execution/operator/helper/physical_streaming_sample.hpp @@ -19,8 +19,7 @@ class PhysicalStreamingSample : public PhysicalOperator { static constexpr const PhysicalOperatorType TYPE = PhysicalOperatorType::STREAMING_SAMPLE; public: - PhysicalStreamingSample(vector types, unique_ptr options, - idx_t estimated_cardinality); + PhysicalStreamingSample(vector types, unique_ptr options, idx_t estimated_cardinality); unique_ptr sample_options; double percentage; diff --git a/test/optimizer/filter_pushdown/prepared_statement_in_pushdown.test b/test/optimizer/filter_pushdown/prepared_statement_in_pushdown.test deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/test/sql/sample/bernoulli_sampling.test b/test/sql/sample/bernoulli_sampling.test index a00ff7311325..e8953bb38e40 100644 --- a/test/sql/sample/bernoulli_sampling.test +++ b/test/sql/sample/bernoulli_sampling.test @@ -2,7 +2,6 @@ # description: Test reservoir sample crash on large data sets # group: [sample] - statement ok create table output (num_rows INT); From 6549a0ea76cc4e3cc29c65ea37b9485e1f18e4cb Mon Sep 17 00:00:00 2001 From: Mytherin Date: Thu, 13 Feb 2025 21:57:32 +0100 Subject: [PATCH 075/142] Correctly deal with and propagate virtual columns --- extension/parquet/parquet_extension.cpp | 15 ++++++++++---- src/common/multi_file_list.cpp | 8 ++++++-- .../operator/scan/physical_table_scan.cpp | 13 ++++++++---- src/execution/physical_plan/plan_get.cpp | 10 ++++++---- src/function/table/table_scan.cpp | 6 ++++++ .../operator/scan/physical_table_scan.hpp | 4 +++- .../duckdb/planner/operator/logical_get.hpp | 3 +++ src/optimizer/remove_unused_columns.cpp | 5 ++--- src/planner/operator/logical_get.cpp | 20 +++++++++++-------- .../copy/parquet/parquet_filename_filter.test | 12 +++++++++-- .../copy/parquet/parquet_virtual_columns.test | 8 ++++++++ 11 files 
changed, 76 insertions(+), 28 deletions(-) diff --git a/extension/parquet/parquet_extension.cpp b/extension/parquet/parquet_extension.cpp index 294e35c021a2..e88f111564b1 100644 --- a/extension/parquet/parquet_extension.cpp +++ b/extension/parquet/parquet_extension.cpp @@ -61,6 +61,7 @@ struct ParquetReadBindData : public TableFunctionData { atomic chunk_count; vector names; vector types; + virtual_column_map_t virtual_columns; vector columns; //! Table column names - set when using COPY tbl FROM file.parquet vector table_columns; @@ -363,6 +364,7 @@ virtual_column_map_t ParquetGetVirtualColumns(ClientContext &context, optional_p auto &parquet_bind = bind_data->Cast(); virtual_column_map_t result; parquet_bind.multi_file_reader->GetVirtualColumns(context, parquet_bind.reader_bind, result); + parquet_bind.virtual_columns = result; return result; } @@ -754,12 +756,17 @@ class ParquetScanFunction { iota(begin(result->projection_ids), end(result->projection_ids), 0); } - const auto table_types = bind_data.types; + const auto &table_types = bind_data.types; for (const auto &col_idx : input.column_indexes) { - if (col_idx.IsRowIdColumn()) { - result->scanned_types.emplace_back(LogicalType::ROW_TYPE); + auto column_id = col_idx.GetPrimaryIndex(); + if (col_idx.IsVirtualColumn()) { + auto entry = bind_data.virtual_columns.find(column_id); + if (entry == bind_data.virtual_columns.end()) { + throw InternalException("Parquet - virtual column definition not found"); + } + result->scanned_types.emplace_back(entry->second.type); } else { - result->scanned_types.push_back(table_types[col_idx.GetPrimaryIndex()]); + result->scanned_types.push_back(table_types[column_id]); } } } diff --git a/src/common/multi_file_list.cpp b/src/common/multi_file_list.cpp index 668a5b36399e..fd5d84a856ec 100644 --- a/src/common/multi_file_list.cpp +++ b/src/common/multi_file_list.cpp @@ -31,9 +31,10 @@ bool PushdownInternal(ClientContext &context, const MultiFileReaderOptions &opti vector> &filters, 
vector &expanded_files) { HivePartitioningFilterInfo filter_info; for (idx_t i = 0; i < info.column_ids.size(); i++) { - if (!IsRowIdColumnId(info.column_ids[i])) { - filter_info.column_map.insert({info.column_names[info.column_ids[i]], i}); + if (IsVirtualColumn(info.column_ids[i])) { + continue; } + filter_info.column_map.insert({info.column_names[info.column_ids[i]], i}); } filter_info.hive_enabled = options.hive_partitioning; filter_info.filename_enabled = options.filename; @@ -61,6 +62,9 @@ bool PushdownInternal(ClientContext &context, const MultiFileReaderOptions &opti vector> filter_expressions; for (auto &entry : filters.filters) { auto column_idx = column_ids[entry.first]; + if (IsVirtualColumn(column_idx)) { + continue; + } auto column_ref = make_uniq(types[column_idx], ColumnBinding(table_index, entry.first)); auto filter_expr = entry.second->ToExpression(*column_ref); diff --git a/src/execution/operator/scan/physical_table_scan.cpp b/src/execution/operator/scan/physical_table_scan.cpp index 0ea996d341c8..2c821526cee6 100644 --- a/src/execution/operator/scan/physical_table_scan.cpp +++ b/src/execution/operator/scan/physical_table_scan.cpp @@ -14,11 +14,12 @@ PhysicalTableScan::PhysicalTableScan(vector types, TableFunction fu vector column_ids_p, vector projection_ids_p, vector names_p, unique_ptr table_filters_p, idx_t estimated_cardinality, ExtraOperatorInfo extra_info, - vector parameters_p) + vector parameters_p, virtual_column_map_t virtual_columns_p) : PhysicalOperator(PhysicalOperatorType::TABLE_SCAN, std::move(types), estimated_cardinality), function(std::move(function_p)), bind_data(std::move(bind_data_p)), returned_types(std::move(returned_types_p)), column_ids(std::move(column_ids_p)), projection_ids(std::move(projection_ids_p)), names(std::move(names_p)), - table_filters(std::move(table_filters_p)), extra_info(extra_info), parameters(std::move(parameters_p)) { + table_filters(std::move(table_filters_p)), extra_info(extra_info), 
parameters(std::move(parameters_p)), + virtual_columns(std::move(virtual_columns_p)) { } class TableScanGlobalSourceState : public GlobalSourceState { @@ -214,8 +215,12 @@ InsertionOrderPreservingMap PhysicalTableScan::ParamsToString() const { first_item = false; const auto col_id = column_ids[column_index].GetPrimaryIndex(); - if (col_id == COLUMN_IDENTIFIER_ROW_ID) { - filters_info += filter->ToString("rowid"); + if (IsVirtualColumn(col_id)) { + auto entry = virtual_columns.find(col_id); + if (entry == virtual_columns.end()) { + throw InternalException("Virtual column not found"); + } + filters_info += filter->ToString(entry->second.name); } else { filters_info += filter->ToString(names[col_id]); } diff --git a/src/execution/physical_plan/plan_get.cpp b/src/execution/physical_plan/plan_get.cpp index 843ad1537172..978bf4b70220 100644 --- a/src/execution/physical_plan/plan_get.cpp +++ b/src/execution/physical_plan/plan_get.cpp @@ -133,9 +133,10 @@ unique_ptr PhysicalPlanGenerator::CreatePlan(LogicalGet &op) { // create the table scan node if (!op.function.projection_pushdown) { // function does not support projection pushdown - auto node = make_uniq( - op.returned_types, op.function, std::move(op.bind_data), op.returned_types, column_ids, vector(), - op.names, std::move(table_filters), op.estimated_cardinality, op.extra_info, std::move(op.parameters)); + auto node = make_uniq(op.returned_types, op.function, std::move(op.bind_data), + op.returned_types, column_ids, vector(), op.names, + std::move(table_filters), op.estimated_cardinality, op.extra_info, + std::move(op.parameters), std::move(op.virtual_columns)); // first check if an additional projection is necessary if (column_ids.size() == op.returned_types.size()) { bool projection_necessary = false; @@ -180,7 +181,8 @@ unique_ptr PhysicalPlanGenerator::CreatePlan(LogicalGet &op) { } else { auto node = make_uniq(op.types, op.function, std::move(op.bind_data), op.returned_types, column_ids, op.projection_ids, 
op.names, std::move(table_filters), - op.estimated_cardinality, op.extra_info, std::move(op.parameters)); + op.estimated_cardinality, op.extra_info, std::move(op.parameters), + std::move(op.virtual_columns)); node->dynamic_filters = op.dynamic_filters; if (filter) { filter->children.push_back(std::move(node)); diff --git a/src/function/table/table_scan.cpp b/src/function/table/table_scan.cpp index 0cf85d64a2b9..41b95f1269eb 100644 --- a/src/function/table/table_scan.cpp +++ b/src/function/table/table_scan.cpp @@ -703,6 +703,11 @@ static unique_ptr TableScanDeserialize(Deserializer &deserializer, return std::move(result); } +virtual_column_map_t TableScanGetVirtualColumns(ClientContext &context, optional_ptr bind_data_p) { + auto &bind_data = bind_data_p->Cast(); + return bind_data.table.GetVirtualColumns(); +} + TableFunction TableScanFunction::GetFunction() { TableFunction scan_function("seq_scan", {}, TableScanFunc); scan_function.init_local = TableScanInitLocal; @@ -722,6 +727,7 @@ TableFunction TableScanFunction::GetFunction() { scan_function.sampling_pushdown = true; scan_function.serialize = TableScanSerialize; scan_function.deserialize = TableScanDeserialize; + scan_function.get_virtual_columns = TableScanGetVirtualColumns; return scan_function; } diff --git a/src/include/duckdb/execution/operator/scan/physical_table_scan.hpp b/src/include/duckdb/execution/operator/scan/physical_table_scan.hpp index 45ac1e34c5b7..ca7cd4a0db34 100644 --- a/src/include/duckdb/execution/operator/scan/physical_table_scan.hpp +++ b/src/include/duckdb/execution/operator/scan/physical_table_scan.hpp @@ -27,7 +27,7 @@ class PhysicalTableScan : public PhysicalOperator { PhysicalTableScan(vector types, TableFunction function, unique_ptr bind_data, vector returned_types, vector column_ids, vector projection_ids, vector names, unique_ptr table_filters, idx_t estimated_cardinality, - ExtraOperatorInfo extra_info, vector parameters); + ExtraOperatorInfo extra_info, vector parameters, 
virtual_column_map_t virtual_columns); //! The table function TableFunction function; @@ -50,6 +50,8 @@ class PhysicalTableScan : public PhysicalOperator { vector parameters; //! Contains a reference to dynamically generated table filters (through e.g. a join up in the tree) shared_ptr dynamic_filters; + //! Virtual columns + virtual_column_map_t virtual_columns; public: string GetName() const override; diff --git a/src/include/duckdb/planner/operator/logical_get.hpp b/src/include/duckdb/planner/operator/logical_get.hpp index 65f90fe85ae2..395224bc89dd 100644 --- a/src/include/duckdb/planner/operator/logical_get.hpp +++ b/src/include/duckdb/planner/operator/logical_get.hpp @@ -63,6 +63,9 @@ class LogicalGet : public LogicalOperator { InsertionOrderPreservingMap ParamsToString() const override; //! Returns the underlying table that is being scanned, or nullptr if there is none optional_ptr GetTable() const; + //! Returns any column to query - preferably the cheapest column + //! This is used when we are running e.g. a COUNT(*) and don't care about the contents of any columns in the table + column_t GetAnyColumn() const; const LogicalType &GetColumnType(const ColumnIndex &column_index) const; const string &GetColumnName(const ColumnIndex &column_index) const; diff --git a/src/optimizer/remove_unused_columns.cpp b/src/optimizer/remove_unused_columns.cpp index 48ea7cbffe83..eb93856e9e4a 100644 --- a/src/optimizer/remove_unused_columns.cpp +++ b/src/optimizer/remove_unused_columns.cpp @@ -240,8 +240,7 @@ void RemoveUnusedColumns::VisitOperator(LogicalOperator &op) { throw InternalException("Could not find column index for table filter"); } - auto column_type = - filter.first == COLUMN_IDENTIFIER_ROW_ID ? 
LogicalType::ROW_TYPE : get.returned_types[filter.first]; + auto column_type = get.GetColumnType(ColumnIndex(filter.first)); ColumnBinding filter_binding(get.table_index, index.GetIndex()); auto column_ref = make_uniq(std::move(column_type), filter_binding); @@ -268,7 +267,7 @@ void RemoveUnusedColumns::VisitOperator(LogicalOperator &op) { // this generally means we are only interested in whether or not anything exists in the table (e.g. // EXISTS(SELECT * FROM tbl)) in this case, we just scan the row identifier column as it means we do not // need to read any of the columns - column_ids.emplace_back(COLUMN_IDENTIFIER_ROW_ID); + column_ids.emplace_back(get.GetAnyColumn()); } get.SetColumnIds(std::move(column_ids)); diff --git a/src/planner/operator/logical_get.cpp b/src/planner/operator/logical_get.cpp index c15fde6b923a..c4b5a2f47ffc 100644 --- a/src/planner/operator/logical_get.cpp +++ b/src/planner/operator/logical_get.cpp @@ -141,17 +141,21 @@ const string &LogicalGet::GetColumnName(const ColumnIndex &index) const { return names[index.GetPrimaryIndex()]; } +column_t LogicalGet::GetAnyColumn() const { + auto entry = virtual_columns.find(COLUMN_IDENTIFIER_ROW_ID); + if (entry != virtual_columns.end()) { + // return the rowid column if the projection supports it + return COLUMN_IDENTIFIER_ROW_ID; + } else { + // otherwise return the first column + return 0; + } +} + void LogicalGet::ResolveTypes() { if (column_ids.empty()) { // no projection - we need to push a column - auto entry = virtual_columns.find(COLUMN_IDENTIFIER_ROW_ID); - if (entry != virtual_columns.end()) { - // push the rowid column if the projection supports it - column_ids.emplace_back(COLUMN_IDENTIFIER_ROW_ID); - } else { - // otherwise push the first column - column_ids.emplace_back(0); - } + column_ids.emplace_back(GetAnyColumn()); } types.clear(); if (projection_ids.empty()) { diff --git a/test/sql/copy/parquet/parquet_filename_filter.test b/test/sql/copy/parquet/parquet_filename_filter.test 
index f236ed81de2e..a10611ef267b 100644 --- a/test/sql/copy/parquet/parquet_filename_filter.test +++ b/test/sql/copy/parquet/parquet_filename_filter.test @@ -4,6 +4,10 @@ require parquet +query III +select id, value as f, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where filename='value1'; +---- + # requires notwindows for windows-style path backslash reasons require notwindows @@ -49,15 +53,19 @@ select id, value, date, filename from parquet_scan('data/parquet-testing/hive-pa # Ensure we don't somehow endup mixing things up query III -select id, value as filename, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where filename='value2'; +select id, value as f, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where f='value2'; ---- 2 value2 2013-01-01 query III -select id, value as filename, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where filename='value1'; +select id, value as f, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where f='value1'; ---- 1 value1 2012-01-01 +query III +select id, value as f, date from parquet_scan('data/parquet-testing/hive-partitioning/different_order/*/*/test.parquet', HIVE_PARTITIONING=1) where filename='value1'; +---- + # These tests confirm that the ParquetScanStats will properly handle the pruned files list statement ok diff --git a/test/sql/copy/parquet/parquet_virtual_columns.test b/test/sql/copy/parquet/parquet_virtual_columns.test index 1abddbe36928..f1f84155036b 100644 --- a/test/sql/copy/parquet/parquet_virtual_columns.test +++ b/test/sql/copy/parquet/parquet_virtual_columns.test @@ -19,3 +19,11 @@ select * from 'data/parquet-testing/glob*/t?.parquet' order by i; 1 a 
2 b 3 c + +require notwindows + +# filename in filter +query III +select i, j, replace(filename, '\', '/') from 'data/parquet-testing/glob*/t?.parquet' where filename='data/parquet-testing/glob/t1.parquet' +---- +1 a data/parquet-testing/glob/t1.parquet From 2a75b22e52fcea3fd175774e0271d96532e7d5cc Mon Sep 17 00:00:00 2001 From: Richard Wesley <13156216+hawkfish@users.noreply.github.com> Date: Thu, 13 Feb 2025 13:35:41 -0800 Subject: [PATCH 076/142] Issue #8265: AsOf Nested Loop * Fix unique pointer cast --- src/execution/physical_plan/plan_asof_join.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/execution/physical_plan/plan_asof_join.cpp b/src/execution/physical_plan/plan_asof_join.cpp index 10fc0d47d059..32fa52280e62 100644 --- a/src/execution/physical_plan/plan_asof_join.cpp +++ b/src/execution/physical_plan/plan_asof_join.cpp @@ -230,7 +230,7 @@ static unique_ptr PlanAsOfLoopJoin(LogicalComparisonJoin &op, auto proj = make_uniq(op.types, std::move(project_list), probe_cardinality); proj->children.emplace_back(std::move(aggr)); - return proj; + return std::move(proj); } unique_ptr PhysicalPlanGenerator::PlanAsOfJoin(LogicalComparisonJoin &op) { From b6e8b3339f64dd386f911bc42de33bb991692d11 Mon Sep 17 00:00:00 2001 From: cfis Date: Fri, 14 Feb 2025 00:05:06 -0800 Subject: [PATCH 077/142] Fix building Duckdb on Windows with MSVC 2022. 
_win32 is the correct define for MSVC (and I believe mingw64 these days) - see https://learn.microsoft.com/en-us/cpp/preprocessor/predefined-macros?view=msvc-170 --- extension/tpcds/dsdgen/include/dsdgen-c/porting.h | 4 ++-- extension/tpch/dbgen/text.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/extension/tpcds/dsdgen/include/dsdgen-c/porting.h b/extension/tpcds/dsdgen/include/dsdgen-c/porting.h index 6923a0f35286..cf27a036f813 100644 --- a/extension/tpcds/dsdgen/include/dsdgen-c/porting.h +++ b/extension/tpcds/dsdgen/include/dsdgen-c/porting.h @@ -57,7 +57,7 @@ #include -#ifdef WIN32 +#ifdef _WIN32 #include #define timeb _timeb #define ftime _ftime @@ -76,7 +76,7 @@ typedef HUGE_TYPE ds_key_t; char *strdup(const char *); #endif -#ifdef WIN32 +#ifdef _WIN32 #include #include #include diff --git a/extension/tpch/dbgen/text.cpp b/extension/tpch/dbgen/text.cpp index df67048b54ea..cfe9f8ef370b 100644 --- a/extension/tpch/dbgen/text.cpp +++ b/extension/tpch/dbgen/text.cpp @@ -22,10 +22,10 @@ #include "dbgen/config.h" #include -#ifndef WIN32 +#ifndef _WIN32 /* Change for Windows NT */ #include -#endif /* WIN32 */ +#endif /* _WIN32 */ #include #include #include From 012b6933194989e54719b504927417843af092c7 Mon Sep 17 00:00:00 2001 From: cfis Date: Fri, 14 Feb 2025 00:08:07 -0800 Subject: [PATCH 078/142] Building the Python bindings on Windows fails with MSVC and having Python installed in program files by the python installer. CMake finds python just fine, but the python executable is python.exe not python3.exe. 
--- CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e4a73176cb9..c61c7f9def65 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1065,7 +1065,7 @@ endfunction() macro(register_external_extension NAME URL COMMIT DONT_LINK DONT_BUILD LOAD_TESTS PATH INCLUDE_PATH TEST_PATH APPLY_PATCHES LINKED_LIBS SUBMODULES EXTENSION_VERSION) include(FetchContent) if (${APPLY_PATCHES}) - set(PATCH_COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/apply_extension_patches.py ${CMAKE_SOURCE_DIR}/.github/patches/extensions/${NAME}/) + set(PATCH_COMMAND ${Python3_EXECUTABLE} ${CMAKE_SOURCE_DIR}/scripts/apply_extension_patches.py ${CMAKE_SOURCE_DIR}/.github/patches/extensions/${NAME}/) endif() FETCHCONTENT_DECLARE( ${NAME}_extension_fc @@ -1389,7 +1389,7 @@ if(${EXTENSION_CONFIG_BUILD}) add_custom_target( duckdb_merge_vcpkg_manifests ALL - COMMAND python3 scripts/merge_vcpkg_deps.py ${VCPKG_PATHS} ${EXT_NAMES} + COMMAND ${Python3_EXECUTABLE} scripts/merge_vcpkg_deps.py ${VCPKG_PATHS} ${EXT_NAMES} WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} COMMENT Generates a shared vcpkg manifest from the individual extensions) string(REPLACE ";" ", " VCPKG_NAMES_COMMAS "${VCPKG_NAMES}") @@ -1432,9 +1432,9 @@ if(BUILD_PYTHON) ) if(PYTHON_EDITABLE_BUILD) - set(PIP_COMMAND ${PIP_COMMAND} python3 -m pip install --editable .) + set(PIP_COMMAND ${PIP_COMMAND} ${Python3_EXECUTABLE} -m pip install --editable .) else() - set(PIP_COMMAND ${PIP_COMMAND} python3 -m pip install .) + set(PIP_COMMAND ${PIP_COMMAND} ${Python3_EXECUTABLE} -m pip install .) endif() if(USER_SPACE) From 535fe5a77c01d038384ddd5cd5cc217397faa5a9 Mon Sep 17 00:00:00 2001 From: cfis Date: Fri, 14 Feb 2025 00:14:17 -0800 Subject: [PATCH 079/142] -std=c++11 is invalid with MSVC. 
It it is set correctly here - https://github.com/duckdb/duckdb/blob/main/tools/pythonpkg/setup.py#L162 and L165, but then reset again at https://github.com/duckdb/duckdb/blob/main/tools/pythonpkg/setup.py#L176 but incorrectly for all compilers. --- tools/pythonpkg/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/pythonpkg/setup.py b/tools/pythonpkg/setup.py index 0152d0d6b06b..6273472da676 100644 --- a/tools/pythonpkg/setup.py +++ b/tools/pythonpkg/setup.py @@ -173,7 +173,7 @@ def open_utf8(fpath, flags): if 'DUCKDB_BINARY_DIR' in os.environ: existing_duckdb_dir = os.environ['DUCKDB_BINARY_DIR'] if 'DUCKDB_COMPILE_FLAGS' in os.environ: - toolchain_args = ['-std=c++11'] + os.environ['DUCKDB_COMPILE_FLAGS'].split() + toolchain_args = os.environ['DUCKDB_COMPILE_FLAGS'].split() if 'DUCKDB_LIBS' in os.environ: libraries = os.environ['DUCKDB_LIBS'].split(' ') From 057c2d4a9d7737da6d0196832d17589b0cc7c449 Mon Sep 17 00:00:00 2001 From: Mytherin Date: Fri, 14 Feb 2025 09:24:35 +0100 Subject: [PATCH 080/142] Various fixes for virtual columns <> MultiFileReader interaction --- extension/parquet/parquet_extension.cpp | 2 +- src/common/constants.cpp | 2 +- src/common/multi_file_reader.cpp | 7 +++++++ src/function/table/read_csv.cpp | 12 +++++++++++- src/function/table/table_scan.cpp | 1 + src/function/table_function.cpp | 4 ++-- src/include/duckdb/common/multi_file_reader.hpp | 2 +- src/include/duckdb/function/table_function.hpp | 2 ++ src/optimizer/late_materialization.cpp | 7 +++++-- 9 files changed, 31 insertions(+), 8 deletions(-) diff --git a/extension/parquet/parquet_extension.cpp b/extension/parquet/parquet_extension.cpp index e88f111564b1..1a05891eb314 100644 --- a/extension/parquet/parquet_extension.cpp +++ b/extension/parquet/parquet_extension.cpp @@ -363,7 +363,7 @@ TablePartitionInfo ParquetGetPartitionInfo(ClientContext &context, TableFunction virtual_column_map_t ParquetGetVirtualColumns(ClientContext &context, optional_ptr 
bind_data) { auto &parquet_bind = bind_data->Cast(); virtual_column_map_t result; - parquet_bind.multi_file_reader->GetVirtualColumns(context, parquet_bind.reader_bind, result); + MultiFileReader::GetVirtualColumns(context, parquet_bind.reader_bind, result); parquet_bind.virtual_columns = result; return result; } diff --git a/src/common/constants.cpp b/src/common/constants.cpp index 4db7245e235f..ee51460033ff 100644 --- a/src/common/constants.cpp +++ b/src/common/constants.cpp @@ -9,7 +9,7 @@ namespace duckdb { constexpr const idx_t DConstants::INVALID_INDEX; const row_t MAX_ROW_ID = 36028797018960000ULL; // 2^55 const row_t MAX_ROW_ID_LOCAL = 72057594037920000ULL; // 2^56 -const column_t COLUMN_IDENTIFIER_ROW_ID = (column_t)-1; +const column_t COLUMN_IDENTIFIER_ROW_ID = UINT64_C(18446744073709551615); const column_t VIRTUAL_COLUMN_START = UINT64_C(9223372036854775808); // 2^63 const double PI = 3.141592653589793; diff --git a/src/common/multi_file_reader.cpp b/src/common/multi_file_reader.cpp index 8a819505d86e..e9e8f40ca3f4 100644 --- a/src/common/multi_file_reader.cpp +++ b/src/common/multi_file_reader.cpp @@ -267,6 +267,9 @@ void MultiFileReader::FinalizeBind(const MultiFileReaderOptions &file_options, c reader_data.constant_map.emplace_back(i, Value(filename)); continue; } + if (IsVirtualColumn(column_id)) { + continue; + } if (!options.hive_partitioning_indexes.empty()) { // hive partition constants auto partitions = HivePartitioning::Parse(filename); @@ -340,6 +343,10 @@ void MultiFileReader::CreateColumnMappingByName(const string &file_name, // not constant - look up the column in the name map auto &global_idx = global_column_ids[i]; auto global_id = global_idx.GetPrimaryIndex(); + if (IsVirtualColumn(global_id)) { + // virtual column - these are emitted for every file + continue; + } if (global_id >= global_columns.size()) { throw InternalException( "MultiFileReader::CreateColumnMappingByName - global_id is out of range in global_types for this file"); 
diff --git a/src/function/table/read_csv.cpp b/src/function/table/read_csv.cpp index e5bce2264db1..d9603abfe81b 100644 --- a/src/function/table/read_csv.cpp +++ b/src/function/table/read_csv.cpp @@ -140,7 +140,7 @@ static unique_ptr ReadCSVBind(ClientContext &context, TableFunctio auto result = make_uniq(); auto &options = result->options; - const auto multi_file_reader = MultiFileReader::Create(input.table_function); + auto multi_file_reader = MultiFileReader::Create(input.table_function); const auto multi_file_list = multi_file_reader->CreateFileList(context, input.inputs[0]); if (multi_file_list->GetTotalFileCount() > 1) { options.multi_file_reader = true; @@ -415,6 +415,15 @@ void PushdownTypeToCSVScanner(ClientContext &context, optional_ptr } } +virtual_column_map_t ReadCSVGetVirtualColumns(ClientContext &context, optional_ptr bind_data) { + auto &csv_bind = bind_data->Cast(); + virtual_column_map_t result; + MultiFileReader::GetVirtualColumns(context, csv_bind.reader_bind, result); + result.insert(make_pair(COLUMN_IDENTIFIER_ROW_ID, TableColumn("rowid", LogicalType::ROW_TYPE))); + return result; +} + + TableFunction ReadCSVTableFunction::GetFunction() { TableFunction read_csv("read_csv", {LogicalType::VARCHAR}, ReadCSVFunction, ReadCSVBind, ReadCSVInitGlobal, ReadCSVInitLocal); @@ -426,6 +435,7 @@ TableFunction ReadCSVTableFunction::GetFunction() { read_csv.cardinality = CSVReaderCardinality; read_csv.projection_pushdown = true; read_csv.type_pushdown = PushdownTypeToCSVScanner; + read_csv.get_virtual_columns = ReadCSVGetVirtualColumns; ReadCSVAddNamedParameters(read_csv); return read_csv; } diff --git a/src/function/table/table_scan.cpp b/src/function/table/table_scan.cpp index 41b95f1269eb..178a2a6a8bb0 100644 --- a/src/function/table/table_scan.cpp +++ b/src/function/table/table_scan.cpp @@ -725,6 +725,7 @@ TableFunction TableScanFunction::GetFunction() { scan_function.filter_pushdown = true; scan_function.filter_prune = true; 
scan_function.sampling_pushdown = true; + scan_function.late_materialization = true; scan_function.serialize = TableScanSerialize; scan_function.deserialize = TableScanDeserialize; scan_function.get_virtual_columns = TableScanGetVirtualColumns; diff --git a/src/function/table_function.cpp b/src/function/table_function.cpp index cb8af192017c..e2172cef2b79 100644 --- a/src/function/table_function.cpp +++ b/src/function/table_function.cpp @@ -24,7 +24,7 @@ TableFunction::TableFunction(string name, vector arguments, table_f get_bind_info(nullptr), type_pushdown(nullptr), get_multi_file_reader(nullptr), supports_pushdown_type(nullptr), get_partition_info(nullptr), get_partition_stats(nullptr), get_virtual_columns(nullptr), serialize(nullptr), deserialize(nullptr), projection_pushdown(false), filter_pushdown(false), filter_prune(false), - sampling_pushdown(false) { + sampling_pushdown(false), late_materialization(false) { } TableFunction::TableFunction(const vector &arguments, table_function_t function, @@ -39,7 +39,7 @@ TableFunction::TableFunction() get_partition_data(nullptr), get_bind_info(nullptr), type_pushdown(nullptr), get_multi_file_reader(nullptr), supports_pushdown_type(nullptr), get_partition_info(nullptr), get_partition_stats(nullptr), get_virtual_columns(nullptr), serialize(nullptr), deserialize(nullptr), projection_pushdown(false), - filter_pushdown(false), filter_prune(false), sampling_pushdown(false) { + filter_pushdown(false), filter_prune(false), sampling_pushdown(false), late_materialization(false) { } bool TableFunction::Equal(const TableFunction &rhs) const { diff --git a/src/include/duckdb/common/multi_file_reader.hpp b/src/include/duckdb/common/multi_file_reader.hpp index b6716856dda0..39fd78654d28 100644 --- a/src/include/duckdb/common/multi_file_reader.hpp +++ b/src/include/duckdb/common/multi_file_reader.hpp @@ -293,7 +293,7 @@ struct MultiFileReader { const OperatorPartitionInfo &partition_info, OperatorPartitionData &partition_data); - 
DUCKDB_API virtual void GetVirtualColumns(ClientContext &context, MultiFileReaderBindData &bind_data, + DUCKDB_API static void GetVirtualColumns(ClientContext &context, MultiFileReaderBindData &bind_data, virtual_column_map_t &result); template diff --git a/src/include/duckdb/function/table_function.hpp b/src/include/duckdb/function/table_function.hpp index 0ac43fe73812..e432e0f9568c 100644 --- a/src/include/duckdb/function/table_function.hpp +++ b/src/include/duckdb/function/table_function.hpp @@ -385,6 +385,8 @@ class TableFunction : public SimpleNamedParameterFunction { // NOLINT: work-arou //! Whether or not the table function supports sampling pushdown. If not supported a sample will be taken after the //! table function. bool sampling_pushdown; + //! Whether or not the table function supports late materialization + bool late_materialization; //! Additional function info, passed to the bind shared_ptr function_info; diff --git a/src/optimizer/late_materialization.cpp b/src/optimizer/late_materialization.cpp index 01f8118f5257..67884f483070 100644 --- a/src/optimizer/late_materialization.cpp +++ b/src/optimizer/late_materialization.cpp @@ -238,10 +238,13 @@ bool LateMaterialization::TryLateMaterialization(unique_ptr &op // we need all of the columns to compute the root node anyway (Top-N/Limit/etc) return false; } + if (!get.function.late_materialization) { + // this function does not support late materialization + return false; + } auto entry = get.virtual_columns.find(COLUMN_IDENTIFIER_ROW_ID); if (entry == get.virtual_columns.end()) { - // we can only do the late-materialization optimization for tables that support the rowid column - return false; + throw InternalException("Table function supports late materialization but does not expose a rowid column"); } row_id_type = entry->second.type; // we benefit from late materialization From 94e50026b862cf19997e240df12796460b83c641 Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Fri, 14 Feb 2025 09:36:22 +0100 
Subject: [PATCH 081/142] use cast operator for src/target types in primitive dictionary, and add fast path to TemplatedWritePlain --- extension/parquet/column_writer.cpp | 2 +- .../writer/templated_column_writer.hpp | 86 ++++++++++----- .../duckdb/common/primitive_dictionary.hpp | 104 +++++++++++------- 3 files changed, 122 insertions(+), 70 deletions(-) diff --git a/extension/parquet/column_writer.cpp b/extension/parquet/column_writer.cpp index 4841aca02355..6d3bd63618bf 100644 --- a/extension/parquet/column_writer.cpp +++ b/extension/parquet/column_writer.cpp @@ -211,7 +211,7 @@ void ColumnWriter::HandleDefineLevels(ColumnWriterState &state, ColumnWriterStat //===--------------------------------------------------------------------===// // Used to store the metadata for a WKB-encoded geometry column when writing // GeoParquet files. -class WKBColumnWriterState final : public StandardColumnWriterState { +class WKBColumnWriterState final : public StandardColumnWriterState { public: WKBColumnWriterState(ParquetWriter &writer, duckdb_parquet::RowGroup &row_group, idx_t col_idx) : StandardColumnWriterState(writer, row_group, col_idx), geo_data(), geo_data_writer(writer.GetContext()) { diff --git a/extension/parquet/include/writer/templated_column_writer.hpp b/extension/parquet/include/writer/templated_column_writer.hpp index 6e20c6dd6e23..bd2a4e220342 100644 --- a/extension/parquet/include/writer/templated_column_writer.hpp +++ b/extension/parquet/include/writer/templated_column_writer.hpp @@ -17,21 +17,47 @@ namespace duckdb { -template +template static void TemplatedWritePlain(Vector &col, ColumnWriterStatistics *stats, const idx_t chunk_start, const idx_t chunk_end, const ValidityMask &mask, WriteStream &ser) { - const auto *ptr = FlatVector::GetData(col); + static constexpr bool COPY_DIRECTLY_FROM_VECTOR = + ALL_VALID && std::is_same::value && std::is_arithmetic::value; + + const auto *const ptr = FlatVector::GetData(col); + TGT local_write[STANDARD_VECTOR_SIZE]; + 
idx_t local_write_count = 0; + for (idx_t r = chunk_start; r < chunk_end; r++) { - if (!mask.RowIsValid(r)) { + if (!ALL_VALID && !mask.RowIsValid(r)) { continue; } + TGT target_value = OP::template Operation(ptr[r]); OP::template HandleStats(stats, target_value); - OP::template WriteToStream(target_value, ser); + + if (COPY_DIRECTLY_FROM_VECTOR) { + continue; + } + + if (std::is_arithmetic::value) { + local_write[local_write_count++] = target_value; + } else { + OP::template WriteToStream(target_value, ser); + } + } + + if (COPY_DIRECTLY_FROM_VECTOR) { + ser.WriteData(const_data_ptr_cast(&ptr[chunk_start]), (chunk_end - chunk_start) * sizeof(TGT)); + return; } + + if (std::is_arithmetic::value) { + ser.WriteData(data_ptr_cast(local_write), local_write_count * sizeof(TGT)); + } + // Else we already wrote to stream } -template +template class StandardColumnWriterState : public PrimitiveColumnWriterState { public: StandardColumnWriterState(ParquetWriter &writer, duckdb_parquet::RowGroup &row_group, idx_t col_idx) @@ -47,16 +73,16 @@ class StandardColumnWriterState : public PrimitiveColumnWriterState { idx_t total_string_size = 0; uint32_t key_bit_width = 0; - PrimitiveDictionary dictionary; + PrimitiveDictionary dictionary; duckdb_parquet::Encoding::type encoding; }; -template +template class StandardWriterPageState : public ColumnWriterPageState { public: explicit StandardWriterPageState(const idx_t total_value_count, const idx_t total_string_size, duckdb_parquet::Encoding::type encoding_p, - const PrimitiveDictionary &dictionary_p) + const PrimitiveDictionary &dictionary_p) : encoding(encoding_p), dbp_initialized(false), dbp_encoder(total_value_count), dlba_initialized(false), dlba_encoder(total_value_count, total_string_size), bss_encoder(total_value_count, sizeof(TGT)), dictionary(dictionary_p), dict_written_value(false), @@ -72,7 +98,7 @@ class StandardWriterPageState : public ColumnWriterPageState { BssEncoder bss_encoder; - const PrimitiveDictionary 
&dictionary; + const PrimitiveDictionary &dictionary; bool dict_written_value; uint32_t dict_bit_width; RleBpEncoder dict_encoder; @@ -89,22 +115,22 @@ class StandardColumnWriter : public PrimitiveColumnWriter { public: unique_ptr InitializeWriteState(duckdb_parquet::RowGroup &row_group) override { - auto result = make_uniq>(writer, row_group, row_group.columns.size()); + auto result = make_uniq>(writer, row_group, row_group.columns.size()); result->encoding = duckdb_parquet::Encoding::RLE_DICTIONARY; RegisterToRowGroup(row_group); return std::move(result); } unique_ptr InitializePageState(PrimitiveColumnWriterState &state_p) override { - auto &state = state_p.Cast>(); + auto &state = state_p.Cast>(); - auto result = make_uniq>(state.total_value_count, state.total_string_size, - state.encoding, state.dictionary); + auto result = make_uniq>(state.total_value_count, state.total_string_size, + state.encoding, state.dictionary); return std::move(result); } void FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state_p) override { - auto &page_state = state_p->Cast>(); + auto &page_state = state_p->Cast>(); switch (page_state.encoding) { case duckdb_parquet::Encoding::DELTA_BINARY_PACKED: if (!page_state.dbp_initialized) { @@ -139,7 +165,7 @@ class StandardColumnWriter : public PrimitiveColumnWriter { } duckdb_parquet::Encoding::type GetEncoding(PrimitiveColumnWriterState &state_p) override { - auto &state = state_p.Cast>(); + auto &state = state_p.Cast>(); return state.encoding; } @@ -148,7 +174,7 @@ class StandardColumnWriter : public PrimitiveColumnWriter { } void Analyze(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count) override { - auto &state = state_p.Cast>(); + auto &state = state_p.Cast>(); auto data_ptr = FlatVector::GetData(vector); idx_t vector_index = 0; @@ -188,7 +214,7 @@ class StandardColumnWriter : public PrimitiveColumnWriter { void FinalizeAnalyze(ColumnWriterState &state_p) override { const auto type = 
writer.GetType(schema_idx); - auto &state = state_p.Cast>(); + auto &state = state_p.Cast>(); if (state.dictionary.GetSize() == 0 || state.dictionary.IsFull()) { if (writer.GetParquetVersion() == ParquetVersion::V1) { // Can't do the cool stuff for V1 @@ -221,18 +247,18 @@ class StandardColumnWriter : public PrimitiveColumnWriter { } bool HasDictionary(PrimitiveColumnWriterState &state_p) override { - auto &state = state_p.Cast>(); + auto &state = state_p.Cast>(); return state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY; } idx_t DictionarySize(PrimitiveColumnWriterState &state_p) override { - auto &state = state_p.Cast>(); + auto &state = state_p.Cast>(); return state.dictionary.GetSize(); } void WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats, ColumnWriterPageState *page_state_p, Vector &input_column, idx_t chunk_start, idx_t chunk_end) override { - auto &page_state = page_state_p->Cast>(); + auto &page_state = page_state_p->Cast>(); const auto &mask = FlatVector::Validity(input_column); const auto *data_ptr = FlatVector::GetData(input_column); @@ -331,7 +357,12 @@ class StandardColumnWriter : public PrimitiveColumnWriter { } case duckdb_parquet::Encoding::PLAIN: { D_ASSERT(page_state.encoding == duckdb_parquet::Encoding::PLAIN); - TemplatedWritePlain(input_column, stats, chunk_start, chunk_end, mask, temp_writer); + if (mask.AllValid()) { + TemplatedWritePlain(input_column, stats, chunk_start, chunk_end, mask, temp_writer); + } else { + TemplatedWritePlain(input_column, stats, chunk_start, chunk_end, mask, + temp_writer); + } break; } default: @@ -340,29 +371,28 @@ class StandardColumnWriter : public PrimitiveColumnWriter { } void FlushDictionary(PrimitiveColumnWriterState &state_p, ColumnWriterStatistics *stats) override { - auto &state = state_p.Cast>(); + auto &state = state_p.Cast>(); D_ASSERT(state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY); state.bloom_filter = make_uniq(state.dictionary.GetSize(), 
writer.BloomFilterFalsePositiveRatio()); - state.dictionary.IterateValues([&](const SRC &value) { - const TGT target_value = OP::template Operation(value); + state.dictionary.IterateValues([&](const SRC &src_value, const TGT &tgt_value) { // update the statistics - OP::template HandleStats(stats, target_value); + OP::template HandleStats(stats, tgt_value); // update the bloom filter - auto hash = OP::template XXHash64(target_value); + auto hash = OP::template XXHash64(tgt_value); state.bloom_filter->FilterInsert(hash); }); // flush the dictionary page and add it to the to-be-written pages - WriteDictionary(state, state.dictionary.GetPlainMemoryStream(), state.dictionary.GetSize()); + WriteDictionary(state, state.dictionary.GetTargetMemoryStream(), state.dictionary.GetSize()); // bloom filter will be queued for writing in ParquetWriter::BufferBloomFilter one level up } idx_t GetRowSize(const Vector &vector, const idx_t index, const PrimitiveColumnWriterState &state_p) const override { - auto &state = state_p.Cast>(); + auto &state = state_p.Cast>(); if (state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY) { return (state.key_bit_width + 7) / 8; } else { diff --git a/src/include/duckdb/common/primitive_dictionary.hpp b/src/include/duckdb/common/primitive_dictionary.hpp index a4369776d37d..4b48b7ab3dd3 100644 --- a/src/include/duckdb/common/primitive_dictionary.hpp +++ b/src/include/duckdb/common/primitive_dictionary.hpp @@ -14,14 +14,25 @@ namespace duckdb { -template +struct PrimitiveCastOperator { + template + static TGT Operation(SRC input) { + return TGT(input); + } +}; + +template class PrimitiveDictionary { private: + static_assert(!std::is_same::value || + (std::is_same::value && std::is_same::value), + "If SRC is string_t, TGT must also be string_t"); + static constexpr idx_t LOAD_FACTOR = 2; static constexpr uint32_t INVALID_INDEX = static_cast(-1); struct primitive_dictionary_entry_t { - T value; + SRC value; uint32_t index; bool IsEmpty() const { 
return index == INVALID_INDEX; @@ -33,13 +44,15 @@ class PrimitiveDictionary { //! PrimitiveDictionary is a fixed-size linear probing hash table for primitive types //! It is used to dictionary-encode data in, e.g., Parquet files - PrimitiveDictionary(Allocator &allocator, idx_t maximum_size_p, idx_t plain_capacity_p) + PrimitiveDictionary(Allocator &allocator, idx_t maximum_size_p, idx_t target_capacity_p) : maximum_size(maximum_size_p), size(0), capacity(NextPowerOfTwo(maximum_size * LOAD_FACTOR)), - capacity_mask(capacity - 1), plain_capacity(plain_capacity_p), plain_offset(0), + capacity_mask(capacity - 1), target_capacity(target_capacity_p), target_offset(0), allocated_dictionary(allocator.Allocate(capacity * sizeof(primitive_dictionary_entry_t))), - allocated_plain(allocator.Allocate(std::is_same::value ? plain_capacity : capacity * sizeof(T))), + allocated_target( + allocator.Allocate(std::is_same::value ? target_capacity : capacity * sizeof(TGT))), dictionary(reinterpret_cast(allocated_dictionary.get())), - plain(reinterpret_cast(allocated_plain.get())), plain_raw(allocated_plain.get()), full(false) { + target_values(reinterpret_cast(allocated_target.get())), target_raw(allocated_target.get()), + full(false) { // Initialize empty for (idx_t i = 0; i < capacity; i++) { dictionary[i].index = INVALID_INDEX; @@ -48,13 +61,13 @@ class PrimitiveDictionary { public: //! Insert value into dictionary (if not full) - void Insert(T value) { + void Insert(SRC value) { if (full) { return; } auto &entry = Lookup(value); if (entry.IsEmpty()) { - if (size + 1 > maximum_size || !AddToPlain(value)) { + if (size + 1 > maximum_size || !AddToTarget(value)) { full = true; return; } @@ -64,29 +77,33 @@ class PrimitiveDictionary { } //! Get dictionary index of an already inserted value - uint32_t GetIndex(const T &value) const { + uint32_t GetIndex(const SRC &value) const { const auto &entry = Lookup(value); D_ASSERT(!entry.IsEmpty()); return entry.index; } //! 
Iterates over inserted values - template ::value, int>::type = 0> - void IterateValues(const std::function &fun) const { - for (idx_t i = 0; i < size; i++) { - fun(plain[i]); + template ::value, int>::type = 0> + void IterateValues(const std::function &fun) const { + for (idx_t i = 0; i < capacity; i++) { + auto &entry = dictionary[i]; + if (entry.IsEmpty()) { + continue; + } + fun(entry.value, target_values[entry.index]); } } //! Specialized template to iterate over string_t values - template ::value, int>::type = 0> - void IterateValues(const std::function &fun) const { + template ::value, int>::type = 0> + void IterateValues(const std::function &fun) const { for (idx_t i = 0; i < capacity; i++) { auto &entry = dictionary[i]; if (entry.IsEmpty()) { continue; } - fun(entry.value); + fun(entry.value, entry.value); } } @@ -100,16 +117,16 @@ class PrimitiveDictionary { return full; } - //! Get the plain written values as a memory stream (zero-copy) - unique_ptr GetPlainMemoryStream() const { - auto result = make_uniq(plain_raw, plain_capacity); - result->SetPosition(plain_offset); + //! Get the target written values as a memory stream (zero-copy) + unique_ptr GetTargetMemoryStream() const { + auto result = make_uniq(target_raw, target_capacity); + result->SetPosition(target_offset); return result; } private: //! Looks up a value in the dictionary using linear probing - primitive_dictionary_entry_t &Lookup(const T &value) const { + primitive_dictionary_entry_t &Lookup(const SRC &value) const { auto offset = Hash(value) & capacity_mask; while (!dictionary[offset].IsEmpty() && dictionary[offset].value != value) { ++offset &= capacity_mask; @@ -117,27 +134,32 @@ class PrimitiveDictionary { return dictionary[offset]; } - //! Writes a value to the plain data - bool AddToPlain(const T &value) { - plain[size] = value; - plain_offset += sizeof(T); + //! 
Writes a value to the target data + template ::value, int>::type = 0> + bool AddToTarget(const SRC &src_value) { + const auto tgt_value = CAST_OP::template Operation(src_value); + target_values[size] = tgt_value; + target_offset += sizeof(TGT); return true; } - //! Specialized template to add a string_t value to the plain data - bool AddToPlain(string_t &value) { - if (plain_offset + sizeof(uint32_t) + value.GetSize() > plain_capacity) { + //! Specialized template to add a string_t value to the target data + template ::value, int>::type = 0> + bool AddToTarget(SRC &src_value) { + if (target_offset + sizeof(uint32_t) + src_value.GetSize() > target_capacity) { return false; // Out of capacity } // Store string length and increment offset - Store(UnsafeNumericCast(value.GetSize()), plain_raw + plain_offset); - plain_offset += sizeof(uint32_t); + Store(UnsafeNumericCast(src_value.GetSize()), target_raw + target_offset); + target_offset += sizeof(uint32_t); - // Copy over string data to plain, update "value" to point to it, and increment offset - memcpy(plain_raw + plain_offset, value.GetData(), value.GetSize()); - value = string_t(char_ptr_cast(plain_raw + plain_offset), value.GetSize()); - plain_offset += value.GetSize(); + // Copy over string data to target, update "value" to point to it, and increment offset + memcpy(target_raw + target_offset, src_value.GetData(), src_value.GetSize()); + if (!src_value.IsInlined()) { + src_value.SetPointer(char_ptr_cast(target_raw + target_offset)); + } + target_offset += src_value.GetSize(); return true; } @@ -151,18 +173,18 @@ class PrimitiveDictionary { const idx_t capacity; const idx_t capacity_mask; - //! Capacity/offset of plain encoded data - const idx_t plain_capacity; - idx_t plain_offset; + //! Capacity/offset of target encoded data + const idx_t target_capacity; + idx_t target_offset; - //! Allocated regions for dictionary/plain + //! 
Allocated regions for dictionary/target AllocatedData allocated_dictionary; - AllocatedData allocated_plain; + AllocatedData allocated_target; //! Pointers to allocated regions for convenience primitive_dictionary_entry_t *const dictionary; - T *const plain; - data_ptr_t const plain_raw; + TGT *const target_values; + data_ptr_t const target_raw; //! More values inserted than possible bool full; From 19ca17f1270bb848e187729a3353585f2fdffd8b Mon Sep 17 00:00:00 2001 From: Mytherin Date: Fri, 14 Feb 2025 09:41:31 +0100 Subject: [PATCH 082/142] Add EMPTY column that can be used for COUNT(*) - but not queried - and add union by name test --- src/common/constants.cpp | 1 + src/common/multi_file_reader.cpp | 2 +- src/function/table/read_csv.cpp | 2 +- src/include/duckdb/common/constants.hpp | 2 ++ src/planner/operator/logical_get.cpp | 12 ++++++++---- src/planner/table_binding.cpp | 4 ++++ test/sql/copy/csv/test_union_by_name.test | 19 +++++++++++++++++++ 7 files changed, 36 insertions(+), 6 deletions(-) diff --git a/src/common/constants.cpp b/src/common/constants.cpp index ee51460033ff..2bddd619532f 100644 --- a/src/common/constants.cpp +++ b/src/common/constants.cpp @@ -10,6 +10,7 @@ constexpr const idx_t DConstants::INVALID_INDEX; const row_t MAX_ROW_ID = 36028797018960000ULL; // 2^55 const row_t MAX_ROW_ID_LOCAL = 72057594037920000ULL; // 2^56 const column_t COLUMN_IDENTIFIER_ROW_ID = UINT64_C(18446744073709551615); +const column_t COLUMN_IDENTIFIER_EMPTY = UINT64_C(18446744073709551614); const column_t VIRTUAL_COLUMN_START = UINT64_C(9223372036854775808); // 2^63 const double PI = 3.141592653589793; diff --git a/src/common/multi_file_reader.cpp b/src/common/multi_file_reader.cpp index e9e8f40ca3f4..3aacf065dca3 100644 --- a/src/common/multi_file_reader.cpp +++ b/src/common/multi_file_reader.cpp @@ -239,7 +239,7 @@ void MultiFileReader::BindOptions(MultiFileReaderOptions &options, MultiFileList void MultiFileReader::GetVirtualColumns(ClientContext &context, 
MultiFileReaderBindData &bind_data, virtual_column_map_t &result) { - if (bind_data.filename_idx == DConstants::INVALID_INDEX) { + if (bind_data.filename_idx == DConstants::INVALID_INDEX || bind_data.filename_idx == COLUMN_IDENTIFIER_FILENAME) { bind_data.filename_idx = COLUMN_IDENTIFIER_FILENAME; result.insert(make_pair(COLUMN_IDENTIFIER_FILENAME, TableColumn("filename", LogicalType::VARCHAR))); } diff --git a/src/function/table/read_csv.cpp b/src/function/table/read_csv.cpp index d9603abfe81b..1e442873b296 100644 --- a/src/function/table/read_csv.cpp +++ b/src/function/table/read_csv.cpp @@ -419,7 +419,7 @@ virtual_column_map_t ReadCSVGetVirtualColumns(ClientContext &context, optional_p auto &csv_bind = bind_data->Cast(); virtual_column_map_t result; MultiFileReader::GetVirtualColumns(context, csv_bind.reader_bind, result); - result.insert(make_pair(COLUMN_IDENTIFIER_ROW_ID, TableColumn("rowid", LogicalType::ROW_TYPE))); + result.insert(make_pair(COLUMN_IDENTIFIER_EMPTY, TableColumn("", LogicalType::BOOLEAN))); return result; } diff --git a/src/include/duckdb/common/constants.hpp b/src/include/duckdb/common/constants.hpp index 387dd4127579..e9f816a17619 100644 --- a/src/include/duckdb/common/constants.hpp +++ b/src/include/duckdb/common/constants.hpp @@ -40,6 +40,8 @@ DUCKDB_API bool IsInvalidCatalog(const string &str); //! Special value used to signify the ROW ID of a table DUCKDB_API extern const column_t COLUMN_IDENTIFIER_ROW_ID; +//! Special value used to signify an empty column (used for e.g. 
COUNT(*)) +DUCKDB_API extern const column_t COLUMN_IDENTIFIER_EMPTY; DUCKDB_API extern const column_t VIRTUAL_COLUMN_START; DUCKDB_API bool IsRowIdColumnId(column_t column_id); DUCKDB_API bool IsVirtualColumn(column_t column_id); diff --git a/src/planner/operator/logical_get.cpp b/src/planner/operator/logical_get.cpp index c4b5a2f47ffc..e894cd8fab9a 100644 --- a/src/planner/operator/logical_get.cpp +++ b/src/planner/operator/logical_get.cpp @@ -142,14 +142,18 @@ const string &LogicalGet::GetColumnName(const ColumnIndex &index) const { } column_t LogicalGet::GetAnyColumn() const { - auto entry = virtual_columns.find(COLUMN_IDENTIFIER_ROW_ID); + auto entry = virtual_columns.find(COLUMN_IDENTIFIER_EMPTY); + if (entry != virtual_columns.end()) { + // return the empty column if the projection supports it + return COLUMN_IDENTIFIER_EMPTY; + } + entry = virtual_columns.find(COLUMN_IDENTIFIER_ROW_ID); if (entry != virtual_columns.end()) { // return the rowid column if the projection supports it return COLUMN_IDENTIFIER_ROW_ID; - } else { - // otherwise return the first column - return 0; } + // otherwise return the first column + return 0; } void LogicalGet::ResolveTypes() { diff --git a/src/planner/table_binding.cpp b/src/planner/table_binding.cpp index 934814ec7516..e18d899d60a8 100644 --- a/src/planner/table_binding.cpp +++ b/src/planner/table_binding.cpp @@ -125,6 +125,10 @@ TableBinding::TableBinding(const string &alias, vector types_p, vec "Virtual column index must be larger than VIRTUAL_COLUMN_START - found %d for column \"%s\"", idx, name); } + if (idx == COLUMN_IDENTIFIER_EMPTY) { + // the empty column cannot be queried by the user + continue; + } if (name_map.find(name) == name_map.end()) { name_map[name] = idx; } diff --git a/test/sql/copy/csv/test_union_by_name.test b/test/sql/copy/csv/test_union_by_name.test index 6d9032516759..ebe98d332378 100644 --- a/test/sql/copy/csv/test_union_by_name.test +++ b/test/sql/copy/csv/test_union_by_name.test @@ -67,6 +67,25 
@@ ORDER BY a; 102 NULL 103 9223372036854775807 NULL NULL +query IIII +SELECT a, b, c, replace(replace(filename, '\', '/'), '__TEST_DIR__/', '') +FROM read_csv_auto(['__TEST_DIR__/ubn1.csv', '__TEST_DIR__/ubn2.csv', '__TEST_DIR__/ubn3.csv'], UNION_BY_NAME=TRUE) +ORDER BY a; +---- +1 NULL NULL ubn1.csv +2 NULL NULL ubn1.csv +3 4 NULL ubn2.csv +5 6 NULL ubn2.csv +100 NULL 101 ubn3.csv +102 NULL 103 ubn3.csv +9223372036854775807 NULL NULL ubn1.csv + +query IIII +SELECT COUNT(a), COUNT(b), COUNT(c), COUNT(filename) +FROM read_csv_auto(['__TEST_DIR__/ubn1.csv', '__TEST_DIR__/ubn2.csv', '__TEST_DIR__/ubn3.csv'], UNION_BY_NAME=TRUE) +---- +7 2 2 7 + query TTT SELECT typeof(a), typeof(b), typeof(c) FROM read_csv_auto(['__TEST_DIR__/ubn1.csv', '__TEST_DIR__/ubn2.csv', '__TEST_DIR__/ubn3.csv'], UNION_BY_NAME=TRUE) From fad85466924a65b0b9ad92b5ae1940f921c1f780 Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Fri, 14 Feb 2025 09:55:57 +0100 Subject: [PATCH 083/142] take vectors larger than standard into account --- extension/parquet/include/writer/templated_column_writer.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/extension/parquet/include/writer/templated_column_writer.hpp b/extension/parquet/include/writer/templated_column_writer.hpp index bd2a4e220342..027af57fe6c5 100644 --- a/extension/parquet/include/writer/templated_column_writer.hpp +++ b/extension/parquet/include/writer/templated_column_writer.hpp @@ -24,6 +24,7 @@ static void TemplatedWritePlain(Vector &col, ColumnWriterStatistics *stats, cons ALL_VALID && std::is_same::value && std::is_arithmetic::value; const auto *const ptr = FlatVector::GetData(col); + TGT local_write[STANDARD_VECTOR_SIZE]; idx_t local_write_count = 0; @@ -41,6 +42,10 @@ static void TemplatedWritePlain(Vector &col, ColumnWriterStatistics *stats, cons if (std::is_arithmetic::value) { local_write[local_write_count++] = target_value; + if (local_write_count == STANDARD_VECTOR_SIZE) { + ser.WriteData(data_ptr_cast(local_write), 
local_write_count * sizeof(TGT)); + local_write_count = 0; + } } else { OP::template WriteToStream(target_value, ser); } From 8cb3c56d815128189420a9f9474a7c0d9e853be9 Mon Sep 17 00:00:00 2001 From: Mytherin Date: Fri, 14 Feb 2025 10:29:47 +0100 Subject: [PATCH 084/142] Support virtual columns in the CSV reader --- extension/json/include/json_scan.hpp | 2 ++ extension/json/json_scan.cpp | 11 +++++++++- .../json/table/json_multi_file_reader.test | 20 +++++++++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/extension/json/include/json_scan.hpp b/extension/json/include/json_scan.hpp index 4fd7bc0a0768..0504c21b8542 100644 --- a/extension/json/include/json_scan.hpp +++ b/extension/json/include/json_scan.hpp @@ -360,6 +360,8 @@ struct JSONScan { const TableFunction &function); static unique_ptr Deserialize(Deserializer &deserializer, TableFunction &function); + static virtual_column_map_t GetVirtualColumns(ClientContext &context, optional_ptr bind_data); + static void TableFunctionDefaults(TableFunction &table_function); }; diff --git a/extension/json/json_scan.cpp b/extension/json/json_scan.cpp index 6fd732a0b102..11777fc3eea2 100644 --- a/extension/json/json_scan.cpp +++ b/extension/json/json_scan.cpp @@ -171,7 +171,7 @@ unique_ptr JSONGlobalTableFunctionState::Init(ClientCo const auto &col_id = input.column_ids[col_idx]; // Skip any multi-file reader / row id stuff - if (col_id == bind_data.reader_bind.filename_idx || IsRowIdColumnId(col_id)) { + if (col_id == bind_data.reader_bind.filename_idx || IsVirtualColumn(col_id)) { continue; } bool skip = false; @@ -1025,6 +1025,14 @@ unique_ptr JSONScan::Deserialize(Deserializer &deserializer, Table return std::move(result); } +virtual_column_map_t JSONScan::GetVirtualColumns(ClientContext &context, optional_ptr bind_data) { + auto &csv_bind = bind_data->Cast(); + virtual_column_map_t result; + MultiFileReader::GetVirtualColumns(context, csv_bind.reader_bind, result); + 
result.insert(make_pair(COLUMN_IDENTIFIER_EMPTY, TableColumn("", LogicalType::BOOLEAN))); + return result; +} + void JSONScan::TableFunctionDefaults(TableFunction &table_function) { MultiFileReader().AddParameters(table_function); @@ -1039,6 +1047,7 @@ void JSONScan::TableFunctionDefaults(TableFunction &table_function) { table_function.serialize = Serialize; table_function.deserialize = Deserialize; + table_function.get_virtual_columns = GetVirtualColumns; table_function.projection_pushdown = true; table_function.filter_pushdown = false; diff --git a/test/sql/json/table/json_multi_file_reader.test b/test/sql/json/table/json_multi_file_reader.test index a56f7aee1ca3..e7b041b84ac5 100644 --- a/test/sql/json/table/json_multi_file_reader.test +++ b/test/sql/json/table/json_multi_file_reader.test @@ -32,6 +32,26 @@ select * exclude (filename), replace(filename, '\', '/') as filename from read_j 5 Raising Arizona data/json/example_r.ndjson 5 Raising Arizona data/json/example_rn.ndjson +# virtual column +query III +select *, replace(filename, '\', '/') from read_json_auto('data/json/example_*.ndjson') order by all +---- +1 O Brother, Where Art Thou? data/json/example_n.ndjson +1 O Brother, Where Art Thou? data/json/example_r.ndjson +1 O Brother, Where Art Thou? 
data/json/example_rn.ndjson +2 Home for the Holidays data/json/example_n.ndjson +2 Home for the Holidays data/json/example_r.ndjson +2 Home for the Holidays data/json/example_rn.ndjson +3 The Firm data/json/example_n.ndjson +3 The Firm data/json/example_r.ndjson +3 The Firm data/json/example_rn.ndjson +4 Broadcast News data/json/example_n.ndjson +4 Broadcast News data/json/example_r.ndjson +4 Broadcast News data/json/example_rn.ndjson +5 Raising Arizona data/json/example_n.ndjson +5 Raising Arizona data/json/example_r.ndjson +5 Raising Arizona data/json/example_rn.ndjson + query III select * from read_json_auto(['data/json/example_n.ndjson', 'data/json/top_level_array.json'], union_by_name=true) order by all ---- From 4b827f09c515c621231e5a4607ad4c15f5fa2be2 Mon Sep 17 00:00:00 2001 From: Mytherin Date: Fri, 14 Feb 2025 10:30:01 +0100 Subject: [PATCH 085/142] Format fix --- src/function/table/read_csv.cpp | 1 - src/include/duckdb/common/multi_file_reader.hpp | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/function/table/read_csv.cpp b/src/function/table/read_csv.cpp index 1e442873b296..1bbe01ac8476 100644 --- a/src/function/table/read_csv.cpp +++ b/src/function/table/read_csv.cpp @@ -423,7 +423,6 @@ virtual_column_map_t ReadCSVGetVirtualColumns(ClientContext &context, optional_p return result; } - TableFunction ReadCSVTableFunction::GetFunction() { TableFunction read_csv("read_csv", {LogicalType::VARCHAR}, ReadCSVFunction, ReadCSVBind, ReadCSVInitGlobal, ReadCSVInitLocal); diff --git a/src/include/duckdb/common/multi_file_reader.hpp b/src/include/duckdb/common/multi_file_reader.hpp index 39fd78654d28..0d5d36484e24 100644 --- a/src/include/duckdb/common/multi_file_reader.hpp +++ b/src/include/duckdb/common/multi_file_reader.hpp @@ -294,7 +294,7 @@ struct MultiFileReader { OperatorPartitionData &partition_data); DUCKDB_API static void GetVirtualColumns(ClientContext &context, MultiFileReaderBindData &bind_data, - virtual_column_map_t &result); 
+ virtual_column_map_t &result); template MultiFileReaderBindData BindUnionReader(ClientContext &context, vector &return_types, From 1990d37d304bba53af6e1ae4d7e07f205507e357 Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Fri, 14 Feb 2025 10:39:20 +0100 Subject: [PATCH 086/142] take parent nulls into account in fast path writing define levels --- extension/parquet/column_writer.cpp | 1 + extension/parquet/include/column_writer.hpp | 1 + extension/parquet/parquet_extension.cpp | 2 +- extension/parquet/writer/primitive_column_writer.cpp | 8 ++++---- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/extension/parquet/column_writer.cpp b/extension/parquet/column_writer.cpp index 6d3bd63618bf..893081658a07 100644 --- a/extension/parquet/column_writer.cpp +++ b/extension/parquet/column_writer.cpp @@ -176,6 +176,7 @@ void ColumnWriter::HandleDefineLevels(ColumnWriterState &state, ColumnWriterStat idx_t current_index = state.definition_levels.size(); if (parent->definition_levels[current_index] != PARQUET_DEFINE_VALID) { state.definition_levels.push_back(parent->definition_levels[current_index]); + state.parent_null_count++; } else if (validity.RowIsValid(vector_index)) { state.definition_levels.push_back(define_value); } else { diff --git a/extension/parquet/include/column_writer.hpp b/extension/parquet/include/column_writer.hpp index dba6788e7cec..26f01d8d74b0 100644 --- a/extension/parquet/include/column_writer.hpp +++ b/extension/parquet/include/column_writer.hpp @@ -27,6 +27,7 @@ class ColumnWriterState { unsafe_vector definition_levels; unsafe_vector repetition_levels; vector is_empty; + idx_t parent_null_count = 0; idx_t null_count = 0; public: diff --git a/extension/parquet/parquet_extension.cpp b/extension/parquet/parquet_extension.cpp index 9142c0db681e..adc5ee37cd06 100644 --- a/extension/parquet/parquet_extension.cpp +++ b/extension/parquet/parquet_extension.cpp @@ -200,7 +200,7 @@ struct ParquetWriteBindData : public TableFunctionData { 
dictionary_size_limit = row_group_size / 20; } - idx_t string_dictionary_page_size_limit = 2097152; + idx_t string_dictionary_page_size_limit = 1048576; //! What false positive rate are we willing to accept for bloom filters double bloom_filter_false_positive_ratio = 0.01; diff --git a/extension/parquet/writer/primitive_column_writer.cpp b/extension/parquet/writer/primitive_column_writer.cpp index 0bd85d0894a9..9e3515de9d78 100644 --- a/extension/parquet/writer/primitive_column_writer.cpp +++ b/extension/parquet/writer/primitive_column_writer.cpp @@ -54,7 +54,7 @@ void PrimitiveColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterStat if (!check_parent_empty && validity.AllValid() && TypeIsConstantSize(vector.GetType().InternalType()) && page_info_ref.get().estimated_page_size + GetRowSize(vector, vector_index, state) * vcount < MAX_UNCOMPRESSED_PAGE_SIZE) { - // Fast path + // Fast path: fixed-size type, all valid, and it fits on the current page auto &page_info = page_info_ref.get(); page_info.row_count += vcount; page_info.estimated_page_size += GetRowSize(vector, vector_index, state) * vcount; @@ -140,8 +140,8 @@ void PrimitiveColumnWriter::WriteLevels(WriteStream &temp_writer, const unsafe_v MemoryStream intermediate_stream(Allocator::DefaultAllocator()); rle_encoder.BeginWrite(); - if (null_count.IsValid() && (null_count.GetIndex() == 0 || null_count.GetIndex() == count)) { - // All are NULL or none are NULL + if (null_count.IsValid() && null_count.GetIndex() == 0) { + // Fast path: no nulls rle_encoder.WriteMany(intermediate_stream, levels[0], count); } else { for (idx_t i = offset; i < offset + count; i++) { @@ -176,7 +176,7 @@ void PrimitiveColumnWriter::NextPage(PrimitiveColumnWriterState &state) { // write the definition levels WriteLevels(temp_writer, state.definition_levels, max_define, page_info.offset, page_info.row_count, - state.null_count); + state.null_count + state.parent_null_count); } void 
PrimitiveColumnWriter::FlushPage(PrimitiveColumnWriterState &state) { From 09a5da2794d27e7ea11d97eb0d998bc49169520d Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Fri, 14 Feb 2025 10:47:07 +0100 Subject: [PATCH 087/142] prefer allocator over unique array --- extension/parquet/column_writer.cpp | 15 ++++++--------- extension/parquet/include/column_writer.hpp | 2 +- .../include/writer/primitive_column_writer.hpp | 2 +- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/extension/parquet/column_writer.cpp b/extension/parquet/column_writer.cpp index 893081658a07..3b5c3747e7e9 100644 --- a/extension/parquet/column_writer.cpp +++ b/extension/parquet/column_writer.cpp @@ -91,7 +91,7 @@ ColumnWriterState::~ColumnWriterState() { } void ColumnWriter::CompressPage(MemoryStream &temp_writer, size_t &compressed_size, data_ptr_t &compressed_data, - unique_ptr &compressed_buf) { + AllocatedData &compressed_buf) { switch (writer.GetCodec()) { case CompressionCodec::UNCOMPRESSED: compressed_size = temp_writer.GetPosition(); @@ -100,7 +100,7 @@ void ColumnWriter::CompressPage(MemoryStream &temp_writer, size_t &compressed_si case CompressionCodec::SNAPPY: { compressed_size = duckdb_snappy::MaxCompressedLength(temp_writer.GetPosition()); - compressed_buf = unique_ptr(new data_t[compressed_size]); + compressed_buf = BufferAllocator::Get(writer.GetContext()).Allocate(compressed_size); duckdb_snappy::RawCompress(const_char_ptr_cast(temp_writer.GetData()), temp_writer.GetPosition(), char_ptr_cast(compressed_buf.get()), &compressed_size); compressed_data = compressed_buf.get(); @@ -109,7 +109,7 @@ void ColumnWriter::CompressPage(MemoryStream &temp_writer, size_t &compressed_si } case CompressionCodec::LZ4_RAW: { compressed_size = duckdb_lz4::LZ4_compressBound(UnsafeNumericCast(temp_writer.GetPosition())); - compressed_buf = unique_ptr(new data_t[compressed_size]); + compressed_buf = BufferAllocator::Get(writer.GetContext()).Allocate(compressed_size); compressed_size = 
duckdb_lz4::LZ4_compress_default( const_char_ptr_cast(temp_writer.GetData()), char_ptr_cast(compressed_buf.get()), UnsafeNumericCast(temp_writer.GetPosition()), UnsafeNumericCast(compressed_size)); @@ -119,7 +119,7 @@ void ColumnWriter::CompressPage(MemoryStream &temp_writer, size_t &compressed_si case CompressionCodec::GZIP: { MiniZStream s; compressed_size = s.MaxCompressedLength(temp_writer.GetPosition()); - compressed_buf = unique_ptr(new data_t[compressed_size]); + compressed_buf = BufferAllocator::Get(writer.GetContext()).Allocate(compressed_size); s.Compress(const_char_ptr_cast(temp_writer.GetData()), temp_writer.GetPosition(), char_ptr_cast(compressed_buf.get()), &compressed_size); compressed_data = compressed_buf.get(); @@ -127,7 +127,7 @@ void ColumnWriter::CompressPage(MemoryStream &temp_writer, size_t &compressed_si } case CompressionCodec::ZSTD: { compressed_size = duckdb_zstd::ZSTD_compressBound(temp_writer.GetPosition()); - compressed_buf = unique_ptr(new data_t[compressed_size]); + compressed_buf = BufferAllocator::Get(writer.GetContext()).Allocate(compressed_size); compressed_size = duckdb_zstd::ZSTD_compress((void *)compressed_buf.get(), compressed_size, (const void *)temp_writer.GetData(), temp_writer.GetPosition(), UnsafeNumericCast(writer.CompressionLevel())); @@ -135,15 +135,12 @@ void ColumnWriter::CompressPage(MemoryStream &temp_writer, size_t &compressed_si break; } case CompressionCodec::BROTLI: { - compressed_size = duckdb_brotli::BrotliEncoderMaxCompressedSize(temp_writer.GetPosition()); - compressed_buf = unique_ptr(new data_t[compressed_size]); - + compressed_buf = BufferAllocator::Get(writer.GetContext()).Allocate(compressed_size); duckdb_brotli::BrotliEncoderCompress(BROTLI_DEFAULT_QUALITY, BROTLI_DEFAULT_WINDOW, BROTLI_DEFAULT_MODE, temp_writer.GetPosition(), temp_writer.GetData(), &compressed_size, compressed_buf.get()); compressed_data = compressed_buf.get(); - break; } default: diff --git 
a/extension/parquet/include/column_writer.hpp b/extension/parquet/include/column_writer.hpp index 26f01d8d74b0..09f17c7eeb37 100644 --- a/extension/parquet/include/column_writer.hpp +++ b/extension/parquet/include/column_writer.hpp @@ -113,7 +113,7 @@ class ColumnWriter { void HandleRepeatLevels(ColumnWriterState &state_p, ColumnWriterState *parent, idx_t count, idx_t max_repeat) const; void CompressPage(MemoryStream &temp_writer, size_t &compressed_size, data_ptr_t &compressed_data, - unique_ptr &compressed_buf); + AllocatedData &compressed_buf); }; } // namespace duckdb diff --git a/extension/parquet/include/writer/primitive_column_writer.hpp b/extension/parquet/include/writer/primitive_column_writer.hpp index ccaa02f79503..6315efbd7452 100644 --- a/extension/parquet/include/writer/primitive_column_writer.hpp +++ b/extension/parquet/include/writer/primitive_column_writer.hpp @@ -31,7 +31,7 @@ struct PageWriteInformation { idx_t max_write_count = 0; size_t compressed_size; data_ptr_t compressed_data; - unique_ptr compressed_buf; + AllocatedData compressed_buf; }; class PrimitiveColumnWriterState : public ColumnWriterState { From e14c6ac8dc390067125323bafa80e7a6e7eb97a7 Mon Sep 17 00:00:00 2001 From: Mytherin Date: Fri, 14 Feb 2025 12:48:24 +0100 Subject: [PATCH 088/142] Add missing includes --- extension/core_functions/scalar/generic/least.cpp | 1 + .../duckdb/execution/operator/persistent/physical_export.hpp | 1 + 2 files changed, 2 insertions(+) diff --git a/extension/core_functions/scalar/generic/least.cpp b/extension/core_functions/scalar/generic/least.cpp index 40a943101f94..886e909ca557 100644 --- a/extension/core_functions/scalar/generic/least.cpp +++ b/extension/core_functions/scalar/generic/least.cpp @@ -2,6 +2,7 @@ #include "core_functions/scalar/generic_functions.hpp" #include "duckdb/function/create_sort_key.hpp" #include "duckdb/planner/expression/bound_function_expression.hpp" +#include "duckdb/planner/expression_binder.hpp" namespace duckdb { diff 
--git a/src/include/duckdb/execution/operator/persistent/physical_export.hpp b/src/include/duckdb/execution/operator/persistent/physical_export.hpp index fd7f4981fc6f..214487dac0f8 100644 --- a/src/include/duckdb/execution/operator/persistent/physical_export.hpp +++ b/src/include/duckdb/execution/operator/persistent/physical_export.hpp @@ -14,6 +14,7 @@ #include "duckdb/function/copy_function.hpp" #include "duckdb/parser/parsed_data/copy_info.hpp" #include "duckdb/parser/parsed_data/exported_table_data.hpp" +#include "duckdb/catalog/catalog_entry_map.hpp" namespace duckdb { From 4a7d440ab2c9e3e519cb92384ef6402ebc88a2c5 Mon Sep 17 00:00:00 2001 From: Mytherin Date: Fri, 14 Feb 2025 13:31:28 +0100 Subject: [PATCH 089/142] Fix for statistics propagation in Parquet for virtual columns --- extension/parquet/parquet_extension.cpp | 2 +- test/sql/copy/parquet/parquet_virtual_columns.test | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/extension/parquet/parquet_extension.cpp b/extension/parquet/parquet_extension.cpp index 1a05891eb314..a7fbb1fe3c1c 100644 --- a/extension/parquet/parquet_extension.cpp +++ b/extension/parquet/parquet_extension.cpp @@ -441,7 +441,7 @@ class ParquetScanFunction { column_t column_index) { auto &bind_data = bind_data_p->Cast(); - if (IsRowIdColumnId(column_index)) { + if (IsVirtualColumn(column_index)) { return nullptr; } diff --git a/test/sql/copy/parquet/parquet_virtual_columns.test b/test/sql/copy/parquet/parquet_virtual_columns.test index f1f84155036b..03f5f4677811 100644 --- a/test/sql/copy/parquet/parquet_virtual_columns.test +++ b/test/sql/copy/parquet/parquet_virtual_columns.test @@ -5,6 +5,9 @@ require parquet # Filename without the filename option +statement ok +select filename from 'data/parquet-testing/glob/t1.parquet' + query III select i, j, replace(filename, '\', '/') from 'data/parquet-testing/glob*/t?.parquet' order by i; ---- From 8cb2b66edf747e1fc0553cdfea3057b8facf31b4 Mon Sep 17 00:00:00 2001 From: 
Laurens Kuiper Date: Fri, 14 Feb 2025 13:57:01 +0100 Subject: [PATCH 090/142] cast and directly write to memorystream in primitive dictionary so we can deal with uuid/interval --- extension/parquet/column_writer.cpp | 5 ++ .../writer/parquet_write_operators.hpp | 21 ++++++- .../duckdb/common/primitive_dictionary.hpp | 57 ++++++++----------- 3 files changed, 50 insertions(+), 33 deletions(-) diff --git a/extension/parquet/column_writer.cpp b/extension/parquet/column_writer.cpp index 3b5c3747e7e9..607351b3298a 100644 --- a/extension/parquet/column_writer.cpp +++ b/extension/parquet/column_writer.cpp @@ -274,6 +274,11 @@ struct double_na_equal { } return val == right; } + + bool operator!=(const double &right) const { + return !(*this == right); + } + double val; }; diff --git a/extension/parquet/include/writer/parquet_write_operators.hpp b/extension/parquet/include/writer/parquet_write_operators.hpp index d63acba17e80..8bef3067a05e 100644 --- a/extension/parquet/include/writer/parquet_write_operators.hpp +++ b/extension/parquet/include/writer/parquet_write_operators.hpp @@ -20,6 +20,11 @@ struct BaseParquetOperator { ser.WriteData(const_data_ptr_cast(&input), sizeof(TGT)); } + template + static constexpr idx_t WriteSize(const TGT &input) { + return sizeof(TGT); + } + template static uint64_t XXHash64(const TGT &target_value) { return duckdb_zstd::XXH64(&target_value, sizeof(target_value), 0); @@ -99,6 +104,11 @@ struct ParquetStringOperator : public BaseParquetOperator { ser.WriteData(const_data_ptr_cast(target_value.GetData()), target_value.GetSize()); } + template + static idx_t WriteSize(const TGT &target_value) { + return sizeof(uint32_t) + target_value.GetSize(); + } + template static uint64_t XXHash64(const TGT &target_value) { return duckdb_zstd::XXH64(target_value.GetData(), target_value.GetSize(), 0); @@ -118,7 +128,6 @@ struct ParquetIntervalTargetType { struct ParquetIntervalOperator : public BaseParquetOperator { template static TGT Operation(SRC input) { 
- if (input.days < 0 || input.months < 0 || input.micros < 0) { throw IOException("Parquet files do not support negative intervals"); } @@ -134,6 +143,11 @@ struct ParquetIntervalOperator : public BaseParquetOperator { ser.WriteData(target_value.bytes, ParquetIntervalTargetType::PARQUET_INTERVAL_SIZE); } + template + static constexpr idx_t WriteSize(const TGT &target_value) { + return ParquetIntervalTargetType::PARQUET_INTERVAL_SIZE; + } + template static uint64_t XXHash64(const TGT &target_value) { return duckdb_zstd::XXH64(target_value.bytes, ParquetIntervalTargetType::PARQUET_INTERVAL_SIZE, 0); @@ -167,6 +181,11 @@ struct ParquetUUIDOperator : public BaseParquetOperator { ser.WriteData(target_value.bytes, ParquetUUIDTargetType::PARQUET_UUID_SIZE); } + template + static constexpr idx_t WriteSize(const TGT &target_value) { + return ParquetUUIDTargetType::PARQUET_UUID_SIZE; + } + template static uint64_t XXHash64(const TGT &target_value) { return duckdb_zstd::XXH64(target_value.bytes, ParquetUUIDTargetType::PARQUET_UUID_SIZE, 0); diff --git a/src/include/duckdb/common/primitive_dictionary.hpp b/src/include/duckdb/common/primitive_dictionary.hpp index 4b48b7ab3dd3..8d75389a1a7b 100644 --- a/src/include/duckdb/common/primitive_dictionary.hpp +++ b/src/include/duckdb/common/primitive_dictionary.hpp @@ -21,13 +21,9 @@ struct PrimitiveCastOperator { } }; -template +template class PrimitiveDictionary { private: - static_assert(!std::is_same::value || - (std::is_same::value && std::is_same::value), - "If SRC is string_t, TGT must also be string_t"); - static constexpr idx_t LOAD_FACTOR = 2; static constexpr uint32_t INVALID_INDEX = static_cast(-1); @@ -46,13 +42,12 @@ class PrimitiveDictionary { //! 
It is used to dictionary-encode data in, e.g., Parquet files PrimitiveDictionary(Allocator &allocator, idx_t maximum_size_p, idx_t target_capacity_p) : maximum_size(maximum_size_p), size(0), capacity(NextPowerOfTwo(maximum_size * LOAD_FACTOR)), - capacity_mask(capacity - 1), target_capacity(target_capacity_p), target_offset(0), + capacity_mask(capacity - 1), target_capacity(target_capacity_p), allocated_dictionary(allocator.Allocate(capacity * sizeof(primitive_dictionary_entry_t))), allocated_target( allocator.Allocate(std::is_same::value ? target_capacity : capacity * sizeof(TGT))), - dictionary(reinterpret_cast(allocated_dictionary.get())), - target_values(reinterpret_cast(allocated_target.get())), target_raw(allocated_target.get()), - full(false) { + target_stream(allocated_target.get(), allocated_target.GetSize()), + dictionary(reinterpret_cast(allocated_dictionary.get())), full(false) { // Initialize empty for (idx_t i = 0; i < capacity; i++) { dictionary[i].index = INVALID_INDEX; @@ -86,6 +81,7 @@ class PrimitiveDictionary { //! Iterates over inserted values template ::value, int>::type = 0> void IterateValues(const std::function &fun) const { + const auto target_values = reinterpret_cast(allocated_target.get()); for (idx_t i = 0; i < capacity; i++) { auto &entry = dictionary[i]; if (entry.IsEmpty()) { @@ -119,13 +115,13 @@ class PrimitiveDictionary { //! Get the target written values as a memory stream (zero-copy) unique_ptr GetTargetMemoryStream() const { - auto result = make_uniq(target_raw, target_capacity); - result->SetPosition(target_offset); + auto result = make_uniq(target_stream.GetData(), target_stream.GetCapacity()); + result->SetPosition(target_stream.GetPosition()); return result; } private: - //! Looks up a value in the dictionary using linear probing + //! 
Look up a value in the dictionary using linear probing primitive_dictionary_entry_t &Lookup(const SRC &value) const { auto offset = Hash(value) & capacity_mask; while (!dictionary[offset].IsEmpty() && dictionary[offset].value != value) { @@ -134,32 +130,31 @@ class PrimitiveDictionary { return dictionary[offset]; } - //! Writes a value to the target data - template ::value, int>::type = 0> + //! Write a value to the target data (if source is not string) + template ::value, int>::type = 0> bool AddToTarget(const SRC &src_value) { - const auto tgt_value = CAST_OP::template Operation(src_value); - target_values[size] = tgt_value; - target_offset += sizeof(TGT); + const auto tgt_value = OP::template Operation(src_value); + if (target_stream.GetPosition() + OP::template WriteSize(tgt_value) > target_stream.GetCapacity()) { + return false; // Out of capacity + } + OP::template WriteToStream(tgt_value, target_stream); return true; } - //! Specialized template to add a string_t value to the target data - template ::value, int>::type = 0> + //! 
Write a value to the target data (if source is string) + template ::value, int>::type = 0> bool AddToTarget(SRC &src_value) { - if (target_offset + sizeof(uint32_t) + src_value.GetSize() > target_capacity) { + // If source is string, target must also be string + if (target_stream.GetPosition() + OP::template WriteSize(src_value) > target_stream.GetCapacity()) { return false; // Out of capacity } - // Store string length and increment offset - Store(UnsafeNumericCast(src_value.GetSize()), target_raw + target_offset); - target_offset += sizeof(uint32_t); + const auto ptr = target_stream.GetData() + target_stream.GetPosition() + sizeof(uint32_t); + OP::template WriteToStream(src_value, target_stream); - // Copy over string data to target, update "value" to point to it, and increment offset - memcpy(target_raw + target_offset, src_value.GetData(), src_value.GetSize()); if (!src_value.IsInlined()) { - src_value.SetPointer(char_ptr_cast(target_raw + target_offset)); + src_value.SetPointer(char_ptr_cast(ptr)); } - target_offset += src_value.GetSize(); return true; } @@ -169,22 +164,20 @@ class PrimitiveDictionary { const idx_t maximum_size; idx_t size; - //! Capacity (power of two) and corresponding mask + //! Dictionary capacity (power of two) and corresponding mask const idx_t capacity; const idx_t capacity_mask; - //! Capacity/offset of target encoded data + //! Capacity of target encoded data const idx_t target_capacity; - idx_t target_offset; //! Allocated regions for dictionary/target AllocatedData allocated_dictionary; AllocatedData allocated_target; + MemoryStream target_stream; //! Pointers to allocated regions for convenience primitive_dictionary_entry_t *const dictionary; - TGT *const target_values; - data_ptr_t const target_raw; //! 
More values inserted than possible bool full; From 32e1058dface2640d01b2d8408581a63a4b9aa87 Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Fri, 14 Feb 2025 14:08:53 +0100 Subject: [PATCH 091/142] all parquet tests working again --- extension/parquet/column_writer.cpp | 6 ++++++ .../sql/copy/parquet/writer/parquet_write_memory_usage.test | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/extension/parquet/column_writer.cpp b/extension/parquet/column_writer.cpp index 607351b3298a..f774a8c681eb 100644 --- a/extension/parquet/column_writer.cpp +++ b/extension/parquet/column_writer.cpp @@ -291,12 +291,18 @@ struct float_na_equal { operator float() const { return val; } + bool operator==(const float &right) const { if (std::isnan(val) && std::isnan(right)) { return true; } return val == right; } + + bool operator!=(const float &right) const { + return !(*this == right); + } + float val; }; diff --git a/test/sql/copy/parquet/writer/parquet_write_memory_usage.test b/test/sql/copy/parquet/writer/parquet_write_memory_usage.test index 29a19bfe4609..2e91149a5cea 100644 --- a/test/sql/copy/parquet/writer/parquet_write_memory_usage.test +++ b/test/sql/copy/parquet/writer/parquet_write_memory_usage.test @@ -9,7 +9,7 @@ load __TEST_DIR__/parquet_write_memory_usage.db statement ok set threads=1 -foreach memory_limit,row_group_size 0.3mb,20480 0.6mb,40960 +foreach memory_limit,row_group_size 0.5mb,20480 1.0mb,40960 statement ok set memory_limit='${memory_limit}' From 3148218404701fa471623553e653a38b2f5e05b7 Mon Sep 17 00:00:00 2001 From: Mytherin Date: Fri, 14 Feb 2025 14:21:43 +0100 Subject: [PATCH 092/142] Fix for filename on windows --- test/sql/copy/csv/test_union_by_name.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/sql/copy/csv/test_union_by_name.test b/test/sql/copy/csv/test_union_by_name.test index ebe98d332378..a531f9eddda0 100644 --- a/test/sql/copy/csv/test_union_by_name.test +++ 
b/test/sql/copy/csv/test_union_by_name.test @@ -68,7 +68,7 @@ ORDER BY a; 9223372036854775807 NULL NULL query IIII -SELECT a, b, c, replace(replace(filename, '\', '/'), '__TEST_DIR__/', '') +SELECT a, b, c, replace(replace(filename, '__TEST_DIR__', ''), '\', '/')[2:] FROM read_csv_auto(['__TEST_DIR__/ubn1.csv', '__TEST_DIR__/ubn2.csv', '__TEST_DIR__/ubn3.csv'], UNION_BY_NAME=TRUE) ORDER BY a; ---- From 5f27683ea456f850bff4f91d9c9c540ec2e5c324 Mon Sep 17 00:00:00 2001 From: Niclas Haderer Date: Fri, 14 Feb 2025 14:29:06 +0100 Subject: [PATCH 093/142] Deleted copy constructor of pending query --- src/include/duckdb/main/pending_query_result.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/include/duckdb/main/pending_query_result.hpp b/src/include/duckdb/main/pending_query_result.hpp index 72fe9405fef8..28e87e9d86bb 100644 --- a/src/include/duckdb/main/pending_query_result.hpp +++ b/src/include/duckdb/main/pending_query_result.hpp @@ -29,6 +29,9 @@ class PendingQueryResult : public BaseQueryResult { DUCKDB_API explicit PendingQueryResult(ErrorData error_message); DUCKDB_API ~PendingQueryResult() override; DUCKDB_API bool AllowStreamResult() const; + PendingQueryResult(const PendingQueryResult&) = delete; + PendingQueryResult& operator=(const PendingQueryResult&) = delete; + public: //! Executes a single task within the query, returning whether or not the query is ready. 
From 20a0961fc1e71464070687a8fd1fa16fea8287f4 Mon Sep 17 00:00:00 2001 From: Niclas Haderer Date: Fri, 14 Feb 2025 14:32:45 +0100 Subject: [PATCH 094/142] format fix --- src/include/duckdb/main/pending_query_result.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/include/duckdb/main/pending_query_result.hpp b/src/include/duckdb/main/pending_query_result.hpp index 28e87e9d86bb..cf0268712cba 100644 --- a/src/include/duckdb/main/pending_query_result.hpp +++ b/src/include/duckdb/main/pending_query_result.hpp @@ -29,9 +29,8 @@ class PendingQueryResult : public BaseQueryResult { DUCKDB_API explicit PendingQueryResult(ErrorData error_message); DUCKDB_API ~PendingQueryResult() override; DUCKDB_API bool AllowStreamResult() const; - PendingQueryResult(const PendingQueryResult&) = delete; - PendingQueryResult& operator=(const PendingQueryResult&) = delete; - + PendingQueryResult(const PendingQueryResult &) = delete; + PendingQueryResult &operator=(const PendingQueryResult &) = delete; public: //! Executes a single task within the query, returning whether or not the query is ready. 
From efc541314d99daa60e883a6e04b41e25faf0b275 Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Fri, 14 Feb 2025 14:43:17 +0100 Subject: [PATCH 095/142] slightly tweak hash function --- src/common/types/hash.cpp | 8 ++++- test/sql/function/generic/hash_func.test | 44 ++++++++++++------------ 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/src/common/types/hash.cpp b/src/common/types/hash.cpp index f9fe42ffcbd5..9a9fd5daf9e8 100644 --- a/src/common/types/hash.cpp +++ b/src/common/types/hash.cpp @@ -98,9 +98,13 @@ hash_t HashBytes(const_data_ptr_t ptr, const idx_t len) noexcept { h *= 0xd6e8feb86659fd93U; } - // XOR with remaining (<8) bytes + // Load and process remaining (<8) bytes hash_t hr = 0; memcpy(&hr, ptr, len & 7U); + hr *= 0xd6e8feb86659fd93U; + hr ^= h >> 32; + + // XOR with hash h ^= hr; // Finalize @@ -108,6 +112,8 @@ hash_t HashBytes(const_data_ptr_t ptr, const idx_t len) noexcept { h ^= h >> 32; return h; + + // return Hash(h); } hash_t Hash(const char *val, size_t size) { diff --git a/test/sql/function/generic/hash_func.test b/test/sql/function/generic/hash_func.test index 0b933e660ffc..44ca5113eb87 100644 --- a/test/sql/function/generic/hash_func.test +++ b/test/sql/function/generic/hash_func.test @@ -44,9 +44,9 @@ CREATE TABLE structs AS query II SELECT s, HASH(s) FROM structs ---- -{'i': 5, 's': string} 16279265163003826010 +{'i': 5, 's': string} 5041354121594313779 {'i': -2, 's': NULL} 13311620765177879553 -{'i': NULL, 's': not null} 17906579446707938902 +{'i': NULL, 's': not null} 17669771151474316850 {'i': NULL, 's': NULL} 18212156630472451589 NULL 18212156630472451589 @@ -76,11 +76,11 @@ NULL 13787848793156543929 query II SELECT lg, HASH(lg) FROM lists ---- -[TGTA] 6988469852028562792 -[CGGT] 11509251853341801096 -[CCTC] 7465354080729552024 -[TCTA] 8712127848443266422 -[AGGG] 11482125973879342325 +[TGTA] 17595328716338797054 +[CGGT] 10306172129632853293 +[CCTC] 13297701768986389650 +[TCTA] 12532519228232631318 +[AGGG] 
18327401687889337414 NULL 13787848793156543929 # Maps @@ -98,11 +98,11 @@ CREATE TABLE maps AS query II SELECT m, HASH(m) FROM maps ---- -{1=TGTA} 2794336106852724683 -{1=CGGT, 2=CCTC} 13102305630601287406 +{1=TGTA} 12831981919938534237 +{1=CGGT, 2=CCTC} 13475482557019497469 {} 13787848793156543929 -{1=TCTA, 2=NULL, 3=CGGT} 4782555145300717917 -{1=TGTA, 2=CGGT, 3=CCTC, 4=TCTA, 5=AGGG} 8572659779500367064 +{1=TCTA, 2=NULL, 3=CGGT} 6801514312074335687 +{1=TGTA, 2=CGGT, 3=CCTC, 4=TCTA, 5=AGGG} 1967491966533763128 NULL 13787848793156543929 statement ok @@ -189,17 +189,17 @@ SELECT r, HASH() FROM enums; query II SELECT r, HASH(r, 'capacitor') FROM enums; ---- -black 7369304742611425093 -brown 2341438809461609958 -red 8885610210938720771 -orange 10151273889449338965 -yellow 9455015799163091888 -green 5769395161578968563 -blue 264671877857503589 -violet 13697912152922098530 -grey 6956627843582995222 -white 11070700999111121301 -NULL 2712243419119719673 +black 10215506564763180114 +brown 14699666407584440049 +red 10435339440036763924 +orange 7449326894723801922 +yellow 7545557152300511399 +green 13515514493392674532 +blue 16730185616673645170 +violet 6167961171085770869 +grey 10019148715359395841 +white 8224352891729695362 +NULL 14853453776375799790 query II SELECT r, HASH('2022-02-12'::DATE, r) FROM enums; From 245f034e69680cc7c1c9befc4f80327a188ef4f2 Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Fri, 14 Feb 2025 14:44:54 +0100 Subject: [PATCH 096/142] add clickbench write benchmark --- benchmark/parquet/clickbench_write.benchmark | 23 ++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 benchmark/parquet/clickbench_write.benchmark diff --git a/benchmark/parquet/clickbench_write.benchmark b/benchmark/parquet/clickbench_write.benchmark new file mode 100644 index 000000000000..7f22b7fe706d --- /dev/null +++ b/benchmark/parquet/clickbench_write.benchmark @@ -0,0 +1,23 @@ +# name: benchmark/parquet/clickbench_write.benchmark +# description: Write 
ClickBench data to Parquet +# group: [clickbench] + +require httpfs + +require parquet + +name ClickBench Write Parquet +group Clickbench + +cache clickbench.duckdb + +load benchmark/clickbench/queries/load.sql + +init +set preserve_insertion_order=false; + +run +COPY hits TO '${BENCHMARK_DIR}/hits.parquet'; + +result I +10000000 From d99ceb6de9d36892c605dcc3fcbd3abe8bd2207f Mon Sep 17 00:00:00 2001 From: Tishj Date: Fri, 14 Feb 2025 15:02:17 +0100 Subject: [PATCH 097/142] backslashes only escape double/single quotes outside of quotes, inside quotes they escape everything --- src/function/cast/vector_cast_helpers.cpp | 74 +++++++++---------- test/sql/cast/string_to_list_cast.test | 5 ++ test/sql/cast/string_to_list_escapes.test | 51 ++++++++++++- test/sql/cast/string_to_map_escapes.test | 24 +++--- .../string_to_nested_types_cast.test_slow | 6 +- test/sql/cast/string_to_struct_escapes.test | 72 +++++++++++++++--- test/sql/cast/string_to_unnamed_struct.test | 2 +- 7 files changed, 167 insertions(+), 67 deletions(-) diff --git a/src/function/cast/vector_cast_helpers.cpp b/src/function/cast/vector_cast_helpers.cpp index 1d654da9b735..c0790d56b4ce 100644 --- a/src/function/cast/vector_cast_helpers.cpp +++ b/src/function/cast/vector_cast_helpers.cpp @@ -32,10 +32,6 @@ inline static void SkipWhitespace(StringCastInputState &input_state) { auto &buf = input_state.buf; auto &pos = input_state.pos; auto &len = input_state.len; - if (input_state.escaped) { - //! 
Escaped whitespace should not be skipped - return; - } while (pos < len && StringUtil::CharacterIsSpace(buf[pos])) { pos++; input_state.escaped = false; @@ -72,34 +68,35 @@ static bool SkipToClose(StringCastInputState &input_state) { auto &idx = input_state.pos; auto &buf = input_state.buf; auto &len = input_state.len; - auto &escaped = input_state.escaped; D_ASSERT(buf[idx] == '{' || buf[idx] == '[' || buf[idx] == '('); vector brackets; while (idx < len) { - if (!escaped) { - if (buf[idx] == '"' || buf[idx] == '\'') { + bool set_escaped = false; + if (buf[idx] == '"' || buf[idx] == '\'') { + if (!input_state.escaped) { if (!SkipToCloseQuotes(input_state)) { return false; } - } else if (buf[idx] == '{') { - brackets.push_back('}'); - } else if (buf[idx] == '(') { - brackets.push_back(')'); - } else if (buf[idx] == '[') { - brackets.push_back(']'); - } else if (buf[idx] == brackets.back()) { - brackets.pop_back(); - if (brackets.empty()) { - return true; - } - } else if (buf[idx] == '\\') { - escaped = true; } - } else { - escaped = false; + } else if (buf[idx] == '{') { + brackets.push_back('}'); + } else if (buf[idx] == '(') { + brackets.push_back(')'); + } else if (buf[idx] == '[') { + brackets.push_back(']'); + } else if (buf[idx] == brackets.back()) { + brackets.pop_back(); + if (brackets.empty()) { + return true; + } + } else if (buf[idx] == '\\') { + //! Note that we don't treat `\\` special here, backslashes can't be escaped outside of quotes + //! backslashes within quotes will not be encountered in this function + set_escaped = true; } + input_state.escaped = set_escaped; idx++; } return false; @@ -133,9 +130,11 @@ static string_t HandleString(Vector &vec, const char *buf, idx_t start, idx_t en auto current_char = buf[start + i]; if (!escaped) { if (scopes.empty() && current_char == '\\') { - //! Start of escape - escaped = true; - continue; + if (quoted || (start + i + 1 < end && (buf[start + i + 1] == '\'' || buf[start + i + 1] == '"'))) { + //! 
Start of escape + escaped = true; + continue; + } } if (scopes.empty() && (current_char == '\'' || current_char == '"')) { if (quoted && current_char == quote_char) { @@ -208,17 +207,14 @@ static inline bool ValueStateTransition(StringCastInputState &input_state, optio auto &pos = input_state.pos; bool set_escaped = false; - if (input_state.escaped) { + if (buf[pos] == '"' || buf[pos] == '\'') { if (!start_pos.IsValid()) { start_pos = pos; } - end_pos = pos; - } else if (buf[pos] == '"' || buf[pos] == '\'') { - if (!start_pos.IsValid()) { - start_pos = pos; - } - if (!SkipToCloseQuotes(input_state)) { - return false; + if (!input_state.escaped) { + if (!SkipToCloseQuotes(input_state)) { + return false; + } } end_pos = pos; } else if (buf[pos] == '{') { @@ -285,7 +281,7 @@ static bool SplitStringListInternal(const string_t &input, OP &state) { optional_idx start_pos; idx_t end_pos; - while (pos < len && ((buf[pos] != ',' && buf[pos] != ']') || input_state.escaped)) { + while (pos < len && (buf[pos] != ',' && buf[pos] != ']')) { if (!ValueStateTransition(input_state, start_pos, end_pos)) { return false; } @@ -387,7 +383,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { while (pos < len) { optional_idx start_pos; idx_t end_pos; - while (pos < len && (buf[pos] != '=' || input_state.escaped)) { + while (pos < len && buf[pos] != '=') { if (!ValueStateTransition(input_state, start_pos, end_pos)) { return false; } @@ -407,7 +403,7 @@ static bool SplitStringMapInternal(const string_t &input, OP &state) { start_pos = optional_idx(); pos++; SkipWhitespace(input_state); - while (pos < len && ((buf[pos] != ',' && buf[pos] != '}') || input_state.escaped)) { + while (pos < len && (buf[pos] != ',' && buf[pos] != '}')) { if (!ValueStateTransition(input_state, start_pos, end_pos)) { return false; } @@ -474,7 +470,7 @@ bool VectorStringToStruct::SplitStruct(const string_t &input, vectortimestamp_ns)::VARCHAR AS STRUCT(A TIME)) FROM test_a query I SELECT 
CAST(struct_pack(A=>blob)::VARCHAR AS STRUCT(A BLOB)) FROM test_all_types(); ---- -{'A': thisisalongblobx00withnullbytes} -{'A': x00x00x00a} +{'A': thisisalongblob\x00withnullbytes} +{'A': \x00\x00\x00a} {'A': NULL} query I diff --git a/test/sql/cast/string_to_struct_escapes.test b/test/sql/cast/string_to_struct_escapes.test index 869a48bc35c6..041386e77e74 100644 --- a/test/sql/cast/string_to_struct_escapes.test +++ b/test/sql/cast/string_to_struct_escapes.test @@ -67,20 +67,32 @@ can't be cast to the destination type # Invalid: Name contains a backslash statement error -SELECT $${backslash\name: value}$$::STRUCT("backslash\name" VARCHAR); +SELECT $${"backslash\name": value}$$::STRUCT("backslash\name" VARCHAR); ---- can't be cast to the destination type +# Valid: Name contains a backslash outside of quotes, interpreted as literal +query I +SELECT $${backslash\name: value}$$::STRUCT("backslash\name" VARCHAR); +---- +{'backslash\name': value} + # first `:` is not escaped, won't match the "name:" struct key statement error SELECT $${name: test, value: 30}$$::STRUCT("name:" VARCHAR, value INT); ---- can't be cast to the destination type -# Name can contain escaped `:` -query I +# Invalid: Name can contain escaped `:`, but only in quotes +statement error SELECT $${name\:: test, value: 30}$$::STRUCT("name:" VARCHAR, value INT); ---- +can't be cast to the destination type STRUCT("name:" VARCHAR, "value" INTEGER) + +# Valid: Name can contain escaped `:` in quotes +query I +SELECT $${"name\:": test, value: 30}$$::STRUCT("name:" VARCHAR, value INT); +---- {'name:': test, 'value': 30} # Name consists of `{}`, not a problem, with this syntax we expect a name, which is a plain string @@ -125,9 +137,14 @@ SELECT $${description: "Special characters: \\, \", ;, (, )"}$$::STRUCT(descript ---- {'description': Special characters: \, ", ;, (, )} +statement error +SELECT $${first\ name: "John", age: 30}$$::STRUCT("first name" VARCHAR, age INT); +---- +can't be cast to the destination 
type STRUCT("first name" VARCHAR, age INTEGER) + # Valid: Name with escaped space query I -SELECT $${first\ name: "John", age: 30}$$::STRUCT("first name" VARCHAR, age INT); +SELECT $${"first\ name": "John", age: 30}$$::STRUCT("first name" VARCHAR, age INT); ---- {'first name': John, 'age': 30} @@ -137,22 +154,43 @@ SELECT $${\"quote at start\": "value", age: 30}$$::STRUCT("""quote at start""" V ---- {'"quote at start"': value, 'age': 30} +statement error +SELECT $${backslash\\name: "John Doe", age: 30}$$::STRUCT("backslash\name" VARCHAR, age INT); +---- +can't be cast to the destination type STRUCT("backslash\name" VARCHAR, age INTEGER) + # Valid: Name with escaped backslash query I -SELECT $${backslash\\name: "John Doe", age: 30}$$::STRUCT("backslash\name" VARCHAR, age INT); +SELECT $${"backslash\\name": "John Doe", age: 30}$$::STRUCT("backslash\name" VARCHAR, age INT); ---- {'backslash\name': John Doe, 'age': 30} +statement error +SELECT $${user\,name: "Alice", age: 25}$$::STRUCT("user,name" VARCHAR, age INT); +---- +can't be cast to the destination type STRUCT("user,name" VARCHAR, age INTEGER) + # Valid: Name with escaped comma query I -SELECT $${user\,name: "Alice", age: 25}$$::STRUCT("user,name" VARCHAR, age INT); +SELECT $${"user\,name": "Alice", age: 25}$$::STRUCT("user,name" VARCHAR, age INT); ---- {'user,name': Alice, 'age': 25} -# Valid: Name with escaped parenthesis +# Valid: Name with comma query I +SELECT $${"user,name": "Alice", age: 25}$$::STRUCT("user,name" VARCHAR, age INT); +---- +{'user,name': Alice, 'age': 25} + +statement error SELECT $${user\(name\): "Alice", status: "active"}$$::STRUCT("user(name)" VARCHAR, status VARCHAR); ---- +can't be cast to the destination type STRUCT("user(name)" VARCHAR, status VARCHAR) + +# Valid: Name with escaped parenthesis +query I +SELECT $${"user\(name\)": "Alice", status: "active"}$$::STRUCT("user(name)" VARCHAR, status VARCHAR); +---- {'user(name)': Alice, 'status': active} # Valid: Name with unescaped 
parenthesis @@ -163,21 +201,26 @@ SELECT $${user(name): "Alice", status: "active"}$$::STRUCT("user(name)" VARCHAR, # Valid: Name with escaped space at end query I -SELECT $${user\ name\ : "Alice", age\ : 25}$$::STRUCT("user name " VARCHAR, "age " INT); +SELECT $${"user\ name\ ": "Alice", "age ": 25}$$::STRUCT("user name " VARCHAR, "age " INT); ---- {'user name ': Alice, 'age ': 25} +statement error +SELECT $${user\ name\ : "Alice", age\ : 25}$$::STRUCT("user name " VARCHAR, "age " INT); +---- +can't be cast to the destination type STRUCT("user name " VARCHAR, "age " INTEGER) + # Invalid: Name contains unescaped quote statement error SELECT $${"quote"start": "value", age: 30}$$::STRUCT("quote""start" VARCHAR, age INT); ---- can't be cast to the destination type -# Invalid: Name contains unescaped backslash -statement error +# Valid: Name contains unescaped backslash outside of quotes +query I SELECT $${backslash\name: "John", age: 30}$$::STRUCT("backslash\name" VARCHAR, age INT); ---- -can't be cast to the destination type +{'backslash\name': John, 'age': 30} # Valid: Name contains (unescaped) opening parenthesis query I @@ -191,9 +234,14 @@ SELECT $${\": "value", age: 30}$$::STRUCT("""" VARCHAR, age INTEGER) ---- {'"': value, 'age': 30} +statement error +SELECT $${\\: "escaped", age: 30}$$::STRUCT("\" VARCHAR, age INT); +---- +can't be cast to the destination type STRUCT("\" VARCHAR, age INTEGER) + # Name with only a special character (escaped) query I -SELECT $${\\: "escaped", age: 30}$$::STRUCT("\" VARCHAR, age INT); +SELECT $${"\\": "escaped", age: 30}$$::STRUCT("\" VARCHAR, age INT); ---- {'\': escaped, 'age': 30} diff --git a/test/sql/cast/string_to_unnamed_struct.test b/test/sql/cast/string_to_unnamed_struct.test index 5066463b05da..d51c01dcee58 100644 --- a/test/sql/cast/string_to_unnamed_struct.test +++ b/test/sql/cast/string_to_unnamed_struct.test @@ -75,4 +75,4 @@ select [ $$[((" test ")), {'a': (\\ test \\)}]$$ ] ---- -[[{'a': {'inner': test }}, {'a': 
{'inner': \ test \}}], [{'a': {'inner': test }}, {'a': {'inner': \ test \}}]] +[[{'a': {'inner': test }}, {'a': {'inner': \ test \}}], [{'a': {'inner': test }}, {'a': {'inner': \\ test \\}}]] From d0fbc866de09243766472a92f917021b3714b683 Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Fri, 14 Feb 2025 16:04:27 +0100 Subject: [PATCH 098/142] arena allocator for minmaxn and just skip nulls when creating enum --- .../aggregate/distributive/arg_min_max.cpp | 6 +- .../operator/schema/physical_create_type.cpp | 2 +- .../aggregate/distributive/minmax.cpp | 8 +- .../function/aggregate/minmax_n_helpers.hpp | 97 ++++++++++--------- 4 files changed, 59 insertions(+), 54 deletions(-) diff --git a/extension/core_functions/aggregate/distributive/arg_min_max.cpp b/extension/core_functions/aggregate/distributive/arg_min_max.cpp index 63c112b3ce3c..edb6c77c5371 100644 --- a/extension/core_functions/aggregate/distributive/arg_min_max.cpp +++ b/extension/core_functions/aggregate/distributive/arg_min_max.cpp @@ -545,8 +545,8 @@ class ArgMinMaxNState { BinaryAggregateHeap heap; bool is_initialized = false; - void Initialize(idx_t nval) { - heap.Initialize(nval); + void Initialize(ArenaAllocator &allocator, idx_t nval) { + heap.Initialize(allocator, nval); is_initialized = true; } }; @@ -601,7 +601,7 @@ static void ArgMinMaxNUpdate(Vector inputs[], AggregateInputData &aggr_input, id if (nval >= MAX_N) { throw InvalidInputException("Invalid input for arg_min/arg_max: n value must be < %d", MAX_N); } - state.Initialize(UnsafeNumericCast(nval)); + state.Initialize(aggr_input.allocator, UnsafeNumericCast(nval)); } // Now add the input to the heap diff --git a/src/execution/operator/schema/physical_create_type.cpp b/src/execution/operator/schema/physical_create_type.cpp index 68bc258b36d6..e73ca2662dc8 100644 --- a/src/execution/operator/schema/physical_create_type.cpp +++ b/src/execution/operator/schema/physical_create_type.cpp @@ -51,7 +51,7 @@ SinkResultType 
PhysicalCreateType::Sink(ExecutionContext &context, DataChunk &ch for (idx_t i = 0; i < chunk.size(); i++) { idx_t idx = sdata.sel->get_index(i); if (!sdata.validity.RowIsValid(idx)) { - throw InvalidInputException("Attempted to create ENUM type with NULL value!"); + continue; } auto str = src_ptr[idx]; auto entry = gstate.found_strings.find(src_ptr[idx]); diff --git a/src/function/aggregate/distributive/minmax.cpp b/src/function/aggregate/distributive/minmax.cpp index b862bf6d9623..32a9518e1889 100644 --- a/src/function/aggregate/distributive/minmax.cpp +++ b/src/function/aggregate/distributive/minmax.cpp @@ -412,8 +412,8 @@ class MinMaxNState { UnaryAggregateHeap heap; bool is_initialized = false; - void Initialize(idx_t nval) { - heap.Initialize(nval); + void Initialize(ArenaAllocator &allocator, idx_t nval) { + heap.Initialize(allocator, nval); is_initialized = true; } @@ -432,7 +432,7 @@ static void MinMaxNUpdate(Vector inputs[], AggregateInputData &aggr_input, idx_t UnifiedVectorFormat val_format; UnifiedVectorFormat n_format; UnifiedVectorFormat state_format; - ; + auto val_extra_state = STATE::VAL_TYPE::CreateExtraState(val_vector, count); STATE::VAL_TYPE::PrepareData(val_vector, count, val_extra_state, val_format); @@ -464,7 +464,7 @@ static void MinMaxNUpdate(Vector inputs[], AggregateInputData &aggr_input, idx_t if (nval >= MAX_N) { throw InvalidInputException("Invalid input for MIN/MAX: n value must be < %d", MAX_N); } - state.Initialize(UnsafeNumericCast(nval)); + state.Initialize(aggr_input.allocator, UnsafeNumericCast(nval)); } // Now add the input to the heap diff --git a/src/include/duckdb/function/aggregate/minmax_n_helpers.hpp b/src/include/duckdb/function/aggregate/minmax_n_helpers.hpp index 9c59d11cbd1a..07e1c48e9ea7 100644 --- a/src/include/duckdb/function/aggregate/minmax_n_helpers.hpp +++ b/src/include/duckdb/function/aggregate/minmax_n_helpers.hpp @@ -102,20 +102,22 @@ class UnaryAggregateHeap { public: UnaryAggregateHeap() = default; - 
explicit UnaryAggregateHeap(idx_t capacity_p) : capacity(capacity_p) { - heap.reserve(capacity); + UnaryAggregateHeap(ArenaAllocator &allocator, idx_t capacity_p) { + Initialize(allocator, capacity_p); } - void Initialize(const idx_t capacity_p) { + void Initialize(ArenaAllocator &allocator, const idx_t capacity_p) { capacity = capacity_p; - heap.reserve(capacity); + heap = reinterpret_cast *>(allocator.AllocateAligned(capacity * sizeof(HeapEntry))); + memset(heap, 0, capacity * sizeof(HeapEntry)); + size = 0; } bool IsEmpty() const { - return heap.empty(); + return size == 0; } idx_t Size() const { - return heap.size(); + return size; } idx_t Capacity() const { return capacity; @@ -125,29 +127,28 @@ class UnaryAggregateHeap { D_ASSERT(capacity != 0); // must be initialized // If the heap is not full, insert the value into a new slot - if (heap.size() < capacity) { - heap.emplace_back(); - heap.back().Assign(allocator, value); - std::push_heap(heap.begin(), heap.end(), Compare); + if (size < capacity) { + heap[size++].Assign(allocator, value); + std::push_heap(heap, heap + size, Compare); } // If the heap is full, check if the value is greater than the smallest value in the heap // If it is, assign the new value to the slot and re-heapify - else if (T_COMPARATOR::Operation(value, heap.front().value)) { - std::pop_heap(heap.begin(), heap.end(), Compare); - heap.back().Assign(allocator, value); - std::push_heap(heap.begin(), heap.end(), Compare); + else if (T_COMPARATOR::Operation(value, heap[0].value)) { + std::pop_heap(heap, heap + size, Compare); + heap[size - 1].Assign(allocator, value); + std::push_heap(heap, heap + size, Compare); } - D_ASSERT(std::is_heap(heap.begin(), heap.end(), Compare)); + D_ASSERT(std::is_heap(heap, heap + size, Compare)); } void Insert(ArenaAllocator &allocator, const UnaryAggregateHeap &other) { - for (auto &slot : other.heap) { - Insert(allocator, slot.value); + for (idx_t slot = 0; slot < other.Size(); slot++) { + Insert(allocator, 
other.heap[slot].value); } } - vector> &SortAndGetHeap() { - std::sort_heap(heap.begin(), heap.end(), Compare); + HeapEntry *SortAndGetHeap() { + std::sort_heap(heap, heap + size, Compare); return heap; } @@ -160,8 +161,9 @@ class UnaryAggregateHeap { return T_COMPARATOR::Operation(left.value, right.value); } - vector> heap; idx_t capacity; + HeapEntry *heap; + idx_t size; }; template @@ -171,20 +173,22 @@ class BinaryAggregateHeap { public: BinaryAggregateHeap() = default; - explicit BinaryAggregateHeap(idx_t capacity_p) : capacity(capacity_p) { - heap.reserve(capacity); + BinaryAggregateHeap(ArenaAllocator &allocator, idx_t capacity_p) { + Initialize(allocator, capacity_p); } - void Initialize(const idx_t capacity_p) { + void Initialize(ArenaAllocator &allocator, const idx_t capacity_p) { capacity = capacity_p; - heap.reserve(capacity); + heap = reinterpret_cast(allocator.AllocateAligned(capacity * sizeof(STORAGE_TYPE))); + memset(heap, 0, capacity * sizeof(STORAGE_TYPE)); + size = 0; } bool IsEmpty() const { - return heap.empty(); + return size == 0; } idx_t Size() const { - return heap.size(); + return size; } idx_t Capacity() const { return capacity; @@ -194,31 +198,31 @@ class BinaryAggregateHeap { D_ASSERT(capacity != 0); // must be initialized // If the heap is not full, insert the value into a new slot - if (heap.size() < capacity) { - heap.emplace_back(); - heap.back().first.Assign(allocator, key); - heap.back().second.Assign(allocator, value); - std::push_heap(heap.begin(), heap.end(), Compare); + if (size < capacity) { + heap[size].first.Assign(allocator, key); + heap[size].second.Assign(allocator, value); + size++; + std::push_heap(heap, heap + size, Compare); } // If the heap is full, check if the value is greater than the smallest value in the heap // If it is, assign the new value to the slot and re-heapify - else if (K_COMPARATOR::Operation(key, heap.front().first.value)) { - std::pop_heap(heap.begin(), heap.end(), Compare); - 
heap.back().first.Assign(allocator, key); - heap.back().second.Assign(allocator, value); - std::push_heap(heap.begin(), heap.end(), Compare); + else if (K_COMPARATOR::Operation(key, heap[0].first.value)) { + std::pop_heap(heap, heap + size, Compare); + heap[size - 1].first.Assign(allocator, key); + heap[size - 1].second.Assign(allocator, value); + std::push_heap(heap, heap + size, Compare); } - D_ASSERT(std::is_heap(heap.begin(), heap.end(), Compare)); + D_ASSERT(std::is_heap(heap, heap + size, Compare)); } void Insert(ArenaAllocator &allocator, const BinaryAggregateHeap &other) { - for (auto &slot : other.heap) { - Insert(allocator, slot.first.value, slot.second.value); + for (idx_t slot = 0; slot < other.Size(); slot++) { + Insert(allocator, other.heap[slot].first.value, other.heap[slot].second.value); } } - vector &SortAndGetHeap() { - std::sort_heap(heap.begin(), heap.end(), Compare); + STORAGE_TYPE *SortAndGetHeap() { + std::sort_heap(heap, heap + size, Compare); return heap; } @@ -231,8 +235,9 @@ class BinaryAggregateHeap { return K_COMPARATOR::Operation(left.first.value, right.first.value); } - vector heap; idx_t capacity; + STORAGE_TYPE *heap; + idx_t size; }; //------------------------------------------------------------------------------ @@ -326,7 +331,7 @@ struct MinMaxNOperation { } if (!target.is_initialized) { - target.Initialize(source.heap.Capacity()); + target.Initialize(aggr_input.allocator, source.heap.Capacity()); } else if (source.heap.Capacity() != target.heap.Capacity()) { throw InvalidInputException("Mismatched n values in min/max/arg_min/arg_max"); } @@ -377,10 +382,10 @@ struct MinMaxNOperation { list_entry.length = state.heap.Size(); // Turn the heap into a sorted list, invalidating the heap property - auto &heap = state.heap.SortAndGetHeap(); + auto heap = state.heap.SortAndGetHeap(); - for (const auto &slot : heap) { - STATE::VAL_TYPE::Assign(child_data, current_offset++, state.heap.GetValue(slot)); + for (idx_t slot = 0; slot < 
state.heap.Size(); slot++) { + STATE::VAL_TYPE::Assign(child_data, current_offset++, state.heap.GetValue(heap[slot])); } } From 4a2097101da48c83edc301e7fdde77c3365979f1 Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Fri, 14 Feb 2025 16:16:33 +0100 Subject: [PATCH 099/142] codequality fixes and buffer-manage parquet columndatacollections --- benchmark/parquet/clickbench_write.benchmark | 2 +- extension/parquet/include/parquet_writer.hpp | 1 - extension/parquet/parquet_extension.cpp | 3 ++- extension/parquet/parquet_writer.cpp | 8 ++------ .../operator/persistent/physical_batch_copy_to_file.cpp | 9 ++++++--- 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/benchmark/parquet/clickbench_write.benchmark b/benchmark/parquet/clickbench_write.benchmark index 7f22b7fe706d..2a4f3bc6340e 100644 --- a/benchmark/parquet/clickbench_write.benchmark +++ b/benchmark/parquet/clickbench_write.benchmark @@ -1,6 +1,6 @@ # name: benchmark/parquet/clickbench_write.benchmark # description: Write ClickBench data to Parquet -# group: [clickbench] +# group: [parquet] require httpfs diff --git a/extension/parquet/include/parquet_writer.hpp b/extension/parquet/include/parquet_writer.hpp index 8af50765e50f..b16a43fab8cb 100644 --- a/extension/parquet/include/parquet_writer.hpp +++ b/extension/parquet/include/parquet_writer.hpp @@ -36,7 +36,6 @@ class Deserializer; struct PreparedRowGroup { duckdb_parquet::RowGroup row_group; vector> states; - vector> heaps; }; struct FieldID; diff --git a/extension/parquet/parquet_extension.cpp b/extension/parquet/parquet_extension.cpp index adc5ee37cd06..aae6f44b3be5 100644 --- a/extension/parquet/parquet_extension.cpp +++ b/extension/parquet/parquet_extension.cpp @@ -222,7 +222,8 @@ struct ParquetWriteGlobalState : public GlobalFunctionData { struct ParquetWriteLocalState : public LocalFunctionData { explicit ParquetWriteLocalState(ClientContext &context, const vector &types) - : buffer(BufferAllocator::Get(context), types) { + : 
buffer(context, types) { + buffer.SetPartitionIndex(0); // Makes the buffer manager less likely to spill this data buffer.InitializeAppend(append_state); } diff --git a/extension/parquet/parquet_writer.cpp b/extension/parquet/parquet_writer.cpp index b3af8efe3e81..4a8e38bba44b 100644 --- a/extension/parquet/parquet_writer.cpp +++ b/extension/parquet/parquet_writer.cpp @@ -389,9 +389,8 @@ void ParquetWriter::PrepareRowGroup(ColumnDataCollection &buffer, PreparedRowGro // We write 8 columns at a time so that iterating over ColumnDataCollection is more efficient static constexpr idx_t COLUMNS_PER_PASS = 8; - // We want these to be in-memory/hybrid so we don't have to copy over strings to the dictionary - D_ASSERT(buffer.GetAllocatorType() == ColumnDataAllocatorType::IN_MEMORY_ALLOCATOR || - buffer.GetAllocatorType() == ColumnDataAllocatorType::HYBRID); + // We want these to be buffer-managed + D_ASSERT(buffer.GetAllocatorType() == ColumnDataAllocatorType::BUFFER_MANAGER_ALLOCATOR); // set up a new row group for this chunk collection auto &row_group = result.row_group; @@ -451,7 +450,6 @@ void ParquetWriter::PrepareRowGroup(ColumnDataCollection &buffer, PreparedRowGro states.push_back(std::move(write_state)); } } - result.heaps = buffer.GetHeapReferences(); } // Validation code adapted from Impala @@ -509,8 +507,6 @@ void ParquetWriter::FlushRowGroup(PreparedRowGroup &prepared) { // append the row group to the file meta data file_meta_data.row_groups.push_back(row_group); file_meta_data.num_rows += row_group.num_rows; - - prepared.heaps.clear(); } void ParquetWriter::Flush(ColumnDataCollection &buffer) { diff --git a/src/execution/operator/persistent/physical_batch_copy_to_file.cpp b/src/execution/operator/persistent/physical_batch_copy_to_file.cpp index 4effccaff4b7..8b3fa0b9b119 100644 --- a/src/execution/operator/persistent/physical_batch_copy_to_file.cpp +++ b/src/execution/operator/persistent/physical_batch_copy_to_file.cpp @@ -143,7 +143,8 @@ class 
FixedBatchCopyLocalState : public LocalSinkState { FixedBatchCopyState current_task = FixedBatchCopyState::SINKING_DATA; void InitializeCollection(ClientContext &context, const PhysicalOperator &op) { - collection = make_uniq(BufferAllocator::Get(context), op.children[0]->types); + collection = make_uniq(context, op.children[0]->types); + collection->SetPartitionIndex(0); // Makes the buffer manager less likely to spill this data collection->InitializeAppend(append_state); local_memory_usage = 0; } @@ -434,7 +435,8 @@ void PhysicalBatchCopyToFile::RepartitionBatches(ClientContext &context, GlobalS // the collection is too large for a batch - we need to repartition // create an empty collection auto new_collection = - make_uniq(BufferAllocator::Get(context), children[0]->types); + make_uniq(context, children[0]->types); + new_collection->SetPartitionIndex(0); // Makes the buffer manager less likely to spill this data append_batch = make_uniq(0U, std::move(new_collection)); } if (append_batch) { @@ -458,7 +460,8 @@ void PhysicalBatchCopyToFile::RepartitionBatches(ClientContext &context, GlobalS // the collection is full - move it to the result and create a new one task_manager.AddTask(make_uniq(gstate.scheduled_batch_index++, std::move(append_batch))); - auto new_collection = make_uniq(BufferAllocator::Get(context), children[0]->types); + auto new_collection = make_uniq(context, children[0]->types); + new_collection->SetPartitionIndex(0); // Makes the buffer manager less likely to spill this data append_batch = make_uniq(0U, std::move(new_collection)); append_batch->collection->InitializeAppend(append_state); } From 0682cec6d5aaace2d4eadb6fc7d03d15134518fd Mon Sep 17 00:00:00 2001 From: Mathias Lafeldt Date: Sat, 15 Feb 2025 21:17:24 +0100 Subject: [PATCH 100/142] Include extension_util.hpp in libduckdb --- scripts/amalgamation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/amalgamation.py b/scripts/amalgamation.py index 325cc19f1521..1ba307278910 
100644 --- a/scripts/amalgamation.py +++ b/scripts/amalgamation.py @@ -39,6 +39,7 @@ os.path.join(include_dir, 'duckdb', 'common', 'serializer', 'memory_stream.hpp'), os.path.join(include_dir, 'duckdb', 'main', 'appender.hpp'), os.path.join(include_dir, 'duckdb', 'main', 'client_context.hpp'), + os.path.join(include_dir, 'duckdb', 'main', 'extension_util.hpp'), os.path.join(include_dir, 'duckdb', 'function', 'function.hpp'), os.path.join(include_dir, 'duckdb', 'function', 'table_function.hpp'), os.path.join(include_dir, 'duckdb', 'parser', 'parsed_data', 'create_table_function_info.hpp'), From 141c449eb7f1f828daa6dfb932b164f485c1a8cb Mon Sep 17 00:00:00 2001 From: Mathias Lafeldt Date: Sat, 15 Feb 2025 13:43:27 +0100 Subject: [PATCH 101/142] Report errors caused by get_database in C extensions --- src/main/extension/extension_load.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/extension/extension_load.cpp b/src/main/extension/extension_load.cpp index 4be4588371cf..dfdb5a306bcc 100644 --- a/src/main/extension/extension_load.cpp +++ b/src/main/extension/extension_load.cpp @@ -92,9 +92,11 @@ struct ExtensionAccess { load_state.database_data->database = make_shared_ptr(load_state.db); return reinterpret_cast(load_state.database_data.get()); } catch (std::exception &ex) { + load_state.has_error = true; load_state.error_data = ErrorData(ex); return nullptr; } catch (...) 
{ + load_state.has_error = true; load_state.error_data = ErrorData(ExceptionType::UNKNOWN_TYPE, "Unknown error in GetDatabase when trying to load extension!"); return nullptr; From a46237bd200242f148c2c16cba8ba49289f6a051 Mon Sep 17 00:00:00 2001 From: Mathias Lafeldt Date: Sat, 15 Feb 2025 13:44:59 +0100 Subject: [PATCH 102/142] Simplify SetError --- src/main/extension/extension_load.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/main/extension/extension_load.cpp b/src/main/extension/extension_load.cpp index dfdb5a306bcc..5499665f1a27 100644 --- a/src/main/extension/extension_load.cpp +++ b/src/main/extension/extension_load.cpp @@ -71,15 +71,11 @@ struct ExtensionAccess { static void SetError(duckdb_extension_info info, const char *error) { auto &load_state = DuckDBExtensionLoadState::Get(info); - if (error) { - load_state.has_error = true; - load_state.error_data = ErrorData(error); - } else { - load_state.has_error = true; - load_state.error_data = ErrorData( - ExceptionType::UNKNOWN_TYPE, - "Extension has indicated an error occured during initialization, but did not set an error message."); - } + load_state.has_error = true; + load_state.error_data = + error ? ErrorData(error) + : ErrorData(ExceptionType::UNKNOWN_TYPE, "Extension has indicated an error occured during " + "initialization, but did not set an error message."); } //! 
Called by the extension get a pointer to the database that is loading it From 381f75e76ff870b129773c34effe5f7aeb510ee5 Mon Sep 17 00:00:00 2001 From: Richard Wesley <13156216+hawkfish@users.noreply.github.com> Date: Mon, 17 Feb 2025 17:06:34 +1300 Subject: [PATCH 103/142] Issue #16250: Window Range Performance * Use two cursors for range searches * Reduces benchmark time from 35s to 25s --- .../window/window_boundaries_state.cpp | 110 +++++++++++------- .../window/window_boundaries_state.hpp | 4 + 2 files changed, 75 insertions(+), 39 deletions(-) diff --git a/src/function/window/window_boundaries_state.cpp b/src/function/window/window_boundaries_state.cpp index ce3ba3bbeb85..6ee3c105234d 100644 --- a/src/function/window/window_boundaries_state.cpp +++ b/src/function/window/window_boundaries_state.cpp @@ -180,9 +180,9 @@ struct OperationCompare : public std::function { }; template -static idx_t FindTypedRangeBound(WindowCursor &over, const idx_t order_begin, const idx_t order_end, - const WindowBoundary range, WindowInputExpression &boundary, const idx_t chunk_idx, - const FrameBounds &prev) { +static idx_t FindTypedRangeBound(WindowCursor &range_lo, WindowCursor &range_hi, const idx_t order_begin, + const idx_t order_end, const WindowBoundary range, WindowInputExpression &boundary, + const idx_t chunk_idx, const FrameBounds &prev) { D_ASSERT(!boundary.CellIsNull(chunk_idx)); const auto val = boundary.GetCell(chunk_idx); @@ -191,14 +191,14 @@ static idx_t FindTypedRangeBound(WindowCursor &over, const idx_t order_begin, co // Check that the value we are searching for is in range. 
if (range == WindowBoundary::EXPR_PRECEDING_RANGE) { // Preceding but value past the current value - const auto cur_val = over.GetCell(0, order_end - 1); + const auto cur_val = range_hi.GetCell(0, order_end - 1); if (comp(cur_val, val)) { throw OutOfRangeException("Invalid RANGE PRECEDING value"); } } else { // Following but value before the current value D_ASSERT(range == WindowBoundary::EXPR_FOLLOWING_RANGE); - const auto cur_val = over.GetCell(0, order_begin); + const auto cur_val = range_lo.GetCell(0, order_begin); if (comp(val, cur_val)) { throw OutOfRangeException("Invalid RANGE FOLLOWING value"); } @@ -206,18 +206,18 @@ static idx_t FindTypedRangeBound(WindowCursor &over, const idx_t order_begin, co // Try to reuse the previous bounds to restrict the search. // This is only valid if the previous bounds were non-empty // Only inject the comparisons if the previous bounds are a strict subset. - WindowColumnIterator begin(over, order_begin); - WindowColumnIterator end(over, order_end); + WindowColumnIterator begin(range_lo, order_begin); + WindowColumnIterator end(range_hi, order_end); if (prev.start < prev.end) { if (order_begin < prev.start && prev.start < order_end) { - const auto first = over.GetCell(0, prev.start); + const auto first = range_lo.GetCell(0, prev.start); if (!comp(val, first)) { // prev.first <= val, so we can start further forward begin += UnsafeNumericCast(prev.start - order_begin); } } if (order_begin < prev.end && prev.end < order_end) { - const auto second = over.GetCell(0, prev.end - 1); + const auto second = range_hi.GetCell(0, prev.end - 1); if (!comp(second, val)) { // val <= prev.second, so we can end further back // (prev.second is the largest peer) @@ -234,52 +234,65 @@ static idx_t FindTypedRangeBound(WindowCursor &over, const idx_t order_begin, co } template -static idx_t FindRangeBound(WindowCursor &over, const idx_t order_begin, const idx_t order_end, - const WindowBoundary range, WindowInputExpression &boundary, const idx_t 
chunk_idx, - const FrameBounds &prev) { +static idx_t FindRangeBound(WindowCursor &range_lo, WindowCursor &range_hi, const idx_t order_begin, + const idx_t order_end, const WindowBoundary range, WindowInputExpression &boundary, + const idx_t chunk_idx, const FrameBounds &prev) { switch (boundary.InternalType()) { case PhysicalType::INT8: - return FindTypedRangeBound(over, order_begin, order_end, range, boundary, chunk_idx, prev); + return FindTypedRangeBound(range_lo, range_hi, order_begin, order_end, range, boundary, + chunk_idx, prev); case PhysicalType::INT16: - return FindTypedRangeBound(over, order_begin, order_end, range, boundary, chunk_idx, prev); + return FindTypedRangeBound(range_lo, range_hi, order_begin, order_end, range, boundary, + chunk_idx, prev); case PhysicalType::INT32: - return FindTypedRangeBound(over, order_begin, order_end, range, boundary, chunk_idx, prev); + return FindTypedRangeBound(range_lo, range_hi, order_begin, order_end, range, boundary, + chunk_idx, prev); case PhysicalType::INT64: - return FindTypedRangeBound(over, order_begin, order_end, range, boundary, chunk_idx, prev); + return FindTypedRangeBound(range_lo, range_hi, order_begin, order_end, range, boundary, + chunk_idx, prev); case PhysicalType::UINT8: - return FindTypedRangeBound(over, order_begin, order_end, range, boundary, chunk_idx, prev); + return FindTypedRangeBound(range_lo, range_hi, order_begin, order_end, range, boundary, + chunk_idx, prev); case PhysicalType::UINT16: - return FindTypedRangeBound(over, order_begin, order_end, range, boundary, chunk_idx, prev); + return FindTypedRangeBound(range_lo, range_hi, order_begin, order_end, range, boundary, + chunk_idx, prev); case PhysicalType::UINT32: - return FindTypedRangeBound(over, order_begin, order_end, range, boundary, chunk_idx, prev); + return FindTypedRangeBound(range_lo, range_hi, order_begin, order_end, range, boundary, + chunk_idx, prev); case PhysicalType::UINT64: - return FindTypedRangeBound(over, 
order_begin, order_end, range, boundary, chunk_idx, prev); + return FindTypedRangeBound(range_lo, range_hi, order_begin, order_end, range, boundary, + chunk_idx, prev); case PhysicalType::INT128: - return FindTypedRangeBound(over, order_begin, order_end, range, boundary, chunk_idx, prev); + return FindTypedRangeBound(range_lo, range_hi, order_begin, order_end, range, boundary, + chunk_idx, prev); case PhysicalType::UINT128: - return FindTypedRangeBound(over, order_begin, order_end, range, boundary, chunk_idx, - prev); + return FindTypedRangeBound(range_lo, range_hi, order_begin, order_end, range, boundary, + chunk_idx, prev); case PhysicalType::FLOAT: - return FindTypedRangeBound(over, order_begin, order_end, range, boundary, chunk_idx, prev); + return FindTypedRangeBound(range_lo, range_hi, order_begin, order_end, range, boundary, + chunk_idx, prev); case PhysicalType::DOUBLE: - return FindTypedRangeBound(over, order_begin, order_end, range, boundary, chunk_idx, prev); + return FindTypedRangeBound(range_lo, range_hi, order_begin, order_end, range, boundary, + chunk_idx, prev); case PhysicalType::INTERVAL: - return FindTypedRangeBound(over, order_begin, order_end, range, boundary, chunk_idx, - prev); + return FindTypedRangeBound(range_lo, range_hi, order_begin, order_end, range, boundary, + chunk_idx, prev); default: throw InternalException("Unsupported column type for RANGE"); } } template -static idx_t FindOrderedRangeBound(WindowCursor &over, const OrderType range_sense, const idx_t order_begin, - const idx_t order_end, const WindowBoundary range, WindowInputExpression &boundary, - const idx_t chunk_idx, const FrameBounds &prev) { +static idx_t FindOrderedRangeBound(WindowCursor &range_lo, WindowCursor &range_hi, const OrderType range_sense, + const idx_t order_begin, const idx_t order_end, const WindowBoundary range, + WindowInputExpression &boundary, const idx_t chunk_idx, const FrameBounds &prev) { switch (range_sense) { case OrderType::ASCENDING: - return 
FindRangeBound(over, order_begin, order_end, range, boundary, chunk_idx, prev); + return FindRangeBound(range_lo, range_hi, order_begin, order_end, range, boundary, chunk_idx, + prev); case OrderType::DESCENDING: - return FindRangeBound(over, order_begin, order_end, range, boundary, chunk_idx, prev); + return FindRangeBound(range_lo, range_hi, order_begin, order_end, range, boundary, chunk_idx, + prev); default: throw InternalException("Unsupported ORDER BY sense for RANGE"); } @@ -718,6 +731,13 @@ void WindowBoundariesState::FrameBegin(DataChunk &bounds, idx_t row_idx, const i prev.start = valid_begin_data[0]; prev.end = valid_end_data[0]; + if (has_preceding_range || has_following_range) { + if (range_lo.get() != range.get()) { + range_lo = range.get(); + range_hi = range_lo->Copy(); + } + } + switch (start_boundary) { case WindowBoundary::UNBOUNDED_PRECEDING: bounds.data[FRAME_BEGIN].Reference(bounds.data[PARTITION_BEGIN]); @@ -766,7 +786,12 @@ void WindowBoundariesState::FrameBegin(DataChunk &bounds, idx_t row_idx, const i } else { const auto valid_start = valid_begin_data[chunk_idx]; prev.end = valid_end_data[chunk_idx]; - window_start = FindOrderedRangeBound(*range, range_sense, valid_start, row_idx + 1, + const auto cur_partition = partition_begin_data[chunk_idx]; + if (cur_partition != prev_partition) { + prev.start = valid_start; + prev_partition = cur_partition; + } + window_start = FindOrderedRangeBound(*range_lo, *range_hi, range_sense, valid_start, row_idx + 1, start_boundary, boundary_begin, chunk_idx, prev); prev.start = window_start; } @@ -785,8 +810,8 @@ void WindowBoundariesState::FrameBegin(DataChunk &bounds, idx_t row_idx, const i prev.start = valid_begin_data[chunk_idx]; prev_partition = cur_partition; } - window_start = FindOrderedRangeBound(*range, range_sense, row_idx, valid_end, start_boundary, - boundary_begin, chunk_idx, prev); + window_start = FindOrderedRangeBound(*range_lo, *range_hi, range_sense, row_idx, valid_end, + start_boundary, 
boundary_begin, chunk_idx, prev); prev.start = window_start; } frame_begin_data[chunk_idx] = window_start; @@ -862,6 +887,13 @@ void WindowBoundariesState::FrameEnd(DataChunk &bounds, idx_t row_idx, const idx prev.start = valid_begin_data[0]; prev.end = valid_end_data[0]; + if (has_preceding_range || has_following_range) { + if (range_lo.get() != range.get()) { + range_lo = range.get(); + range_hi = range_lo->Copy(); + } + } + switch (end_boundary) { case WindowBoundary::CURRENT_ROW_ROWS: for (idx_t chunk_idx = 0; chunk_idx < count; ++chunk_idx, ++row_idx) { @@ -911,8 +943,8 @@ void WindowBoundariesState::FrameEnd(DataChunk &bounds, idx_t row_idx, const idx } else { const auto valid_start = valid_begin_data[chunk_idx]; prev.start = valid_start; - window_end = FindOrderedRangeBound(*range, range_sense, valid_start, row_idx + 1, end_boundary, - boundary_end, chunk_idx, prev); + window_end = FindOrderedRangeBound(*range_lo, *range_hi, range_sense, valid_start, row_idx + 1, + end_boundary, boundary_end, chunk_idx, prev); prev.end = window_end; } frame_end_data[chunk_idx] = window_end; @@ -930,8 +962,8 @@ void WindowBoundariesState::FrameEnd(DataChunk &bounds, idx_t row_idx, const idx prev.end = valid_end; prev_partition = cur_partition; } - window_end = FindOrderedRangeBound(*range, range_sense, row_idx, valid_end, end_boundary, - boundary_end, chunk_idx, prev); + window_end = FindOrderedRangeBound(*range_lo, *range_hi, range_sense, row_idx, valid_end, + end_boundary, boundary_end, chunk_idx, prev); prev.end = window_end; } frame_end_data[chunk_idx] = window_end; diff --git a/src/include/duckdb/function/window/window_boundaries_state.hpp b/src/include/duckdb/function/window/window_boundaries_state.hpp index 2748bc7a0600..11c724d9b638 100644 --- a/src/include/duckdb/function/window/window_boundaries_state.hpp +++ b/src/include/duckdb/function/window/window_boundaries_state.hpp @@ -148,6 +148,10 @@ struct WindowBoundariesState { idx_t valid_end = 0; FrameBounds prev; + + 
// Extra range cursor + optional_ptr range_lo; + unique_ptr range_hi; }; } // namespace duckdb From 5ac9f9e1840ebb5e4461565cc8617006b321617d Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Mon, 17 Feb 2025 10:27:15 +0100 Subject: [PATCH 104/142] format/test fixes for parquet writer --- extension/parquet/parquet_extension.cpp | 3 +-- src/common/types/column/column_data_collection.cpp | 2 +- .../operator/persistent/physical_batch_copy_to_file.cpp | 3 +-- test/sql/copy/parquet/writer/parquet_write_memory_usage.test | 2 +- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/extension/parquet/parquet_extension.cpp b/extension/parquet/parquet_extension.cpp index aae6f44b3be5..cb86e1bd93bb 100644 --- a/extension/parquet/parquet_extension.cpp +++ b/extension/parquet/parquet_extension.cpp @@ -221,8 +221,7 @@ struct ParquetWriteGlobalState : public GlobalFunctionData { }; struct ParquetWriteLocalState : public LocalFunctionData { - explicit ParquetWriteLocalState(ClientContext &context, const vector &types) - : buffer(context, types) { + explicit ParquetWriteLocalState(ClientContext &context, const vector &types) : buffer(context, types) { buffer.SetPartitionIndex(0); // Makes the buffer manager less likely to spill this data buffer.InitializeAppend(append_state); } diff --git a/src/common/types/column/column_data_collection.cpp b/src/common/types/column/column_data_collection.cpp index 17be6722389b..a2480f44d624 100644 --- a/src/common/types/column/column_data_collection.cpp +++ b/src/common/types/column/column_data_collection.cpp @@ -562,7 +562,7 @@ void ColumnDataCopy(ColumnDataMetaData &meta_data, const UnifiedVector offset += append_count; remaining -= append_count; - if (vector_remaining - append_count == 0) { + if (remaining != 0 && vector_remaining - append_count == 0) { // need to append more, check if we need to allocate a new vector or not if (!current_segment.next_data.IsValid()) { segment.AllocateVector(source.GetType(), meta_data.chunk_data, 
append_state, current_index); diff --git a/src/execution/operator/persistent/physical_batch_copy_to_file.cpp b/src/execution/operator/persistent/physical_batch_copy_to_file.cpp index 8b3fa0b9b119..80a761d34f20 100644 --- a/src/execution/operator/persistent/physical_batch_copy_to_file.cpp +++ b/src/execution/operator/persistent/physical_batch_copy_to_file.cpp @@ -434,8 +434,7 @@ void PhysicalBatchCopyToFile::RepartitionBatches(ClientContext &context, GlobalS } else { // the collection is too large for a batch - we need to repartition // create an empty collection - auto new_collection = - make_uniq(context, children[0]->types); + auto new_collection = make_uniq(context, children[0]->types); new_collection->SetPartitionIndex(0); // Makes the buffer manager less likely to spill this data append_batch = make_uniq(0U, std::move(new_collection)); } diff --git a/test/sql/copy/parquet/writer/parquet_write_memory_usage.test b/test/sql/copy/parquet/writer/parquet_write_memory_usage.test index 2e91149a5cea..6bfc26d0fdf6 100644 --- a/test/sql/copy/parquet/writer/parquet_write_memory_usage.test +++ b/test/sql/copy/parquet/writer/parquet_write_memory_usage.test @@ -9,7 +9,7 @@ load __TEST_DIR__/parquet_write_memory_usage.db statement ok set threads=1 -foreach memory_limit,row_group_size 0.5mb,20480 1.0mb,40960 +foreach memory_limit,row_group_size 0.6mb,20480 1.2mb,40960 statement ok set memory_limit='${memory_limit}' From 3c90da4674539789ed29bee41900dc0ff8cf3669 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Mon, 17 Feb 2025 10:44:01 +0100 Subject: [PATCH 105/142] whenever seed is set, parallel sink is false --- .../operator/helper/physical_streaming_sample.cpp | 2 +- test/sql/sample/bernoulli_sampling.test | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/execution/operator/helper/physical_streaming_sample.cpp b/src/execution/operator/helper/physical_streaming_sample.cpp index 721717989f88..ed9e21f35195 100644 --- 
a/src/execution/operator/helper/physical_streaming_sample.cpp +++ b/src/execution/operator/helper/physical_streaming_sample.cpp @@ -51,7 +51,7 @@ void PhysicalStreamingSample::BernoulliSample(DataChunk &input, DataChunk &resul } bool PhysicalStreamingSample::ParallelOperator() const { - return !sample_options->repeatable; + return !(sample_options->repeatable || sample_options->seed.IsValid()); } unique_ptr PhysicalStreamingSample::GetOperatorState(ExecutionContext &context) const { diff --git a/test/sql/sample/bernoulli_sampling.test b/test/sql/sample/bernoulli_sampling.test index e8953bb38e40..95b3e3796c8f 100644 --- a/test/sql/sample/bernoulli_sampling.test +++ b/test/sql/sample/bernoulli_sampling.test @@ -26,18 +26,16 @@ INSERT INTO output select count(*) as n_rows FROM sampled; endloop -query III -select min(num_rows) > 0, max(num_rows) < 25, count(*) FILTER (num_rows = 0) = 0 from output; +query II +select min(num_rows) > 0, count(*) FILTER (num_rows = 0) = 0 from output; ---- -true true true +true true query III select avg(rowid), min(rowid), max(rowid) from output where num_rows = 0; ---- NULL NULL NULL - - statement ok create table t1 as select range id from range(1000); From 0483d905743300366f838a9610c0a222f3f41a2a Mon Sep 17 00:00:00 2001 From: taniabogatsch <44262898+taniabogatsch@users.noreply.github.com> Date: Mon, 17 Feb 2025 11:03:31 +0100 Subject: [PATCH 106/142] merge resolution --- src/execution/operator/persistent/physical_batch_insert.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/execution/operator/persistent/physical_batch_insert.cpp b/src/execution/operator/persistent/physical_batch_insert.cpp index 2cc4ea709fe4..0415585dd7f3 100644 --- a/src/execution/operator/persistent/physical_batch_insert.cpp +++ b/src/execution/operator/persistent/physical_batch_insert.cpp @@ -229,7 +229,7 @@ class MergeCollectionTask : public BatchInsertTask { auto &g_state = g_state_p.Cast(); auto &l_state = l_state_p.Cast(); - // Merge the 
collections. + // Merge the collections. if (!l_state.writer) { l_state.writer = &g_state.table.GetStorage().CreateOptimisticWriter(context); } From 694ad70bd30b2b7d06be15c4305b7713734e2979 Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Mon, 17 Feb 2025 11:11:18 +0100 Subject: [PATCH 107/142] some ci fixes --- .../duckdb/function/aggregate/minmax_n_helpers.hpp | 10 ++++++---- test/sql/types/enum/test_enum_from_query.test_slow | 7 +++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/include/duckdb/function/aggregate/minmax_n_helpers.hpp b/src/include/duckdb/function/aggregate/minmax_n_helpers.hpp index 07e1c48e9ea7..a26772819c43 100644 --- a/src/include/duckdb/function/aggregate/minmax_n_helpers.hpp +++ b/src/include/duckdb/function/aggregate/minmax_n_helpers.hpp @@ -108,8 +108,9 @@ class UnaryAggregateHeap { void Initialize(ArenaAllocator &allocator, const idx_t capacity_p) { capacity = capacity_p; - heap = reinterpret_cast *>(allocator.AllocateAligned(capacity * sizeof(HeapEntry))); - memset(heap, 0, capacity * sizeof(HeapEntry)); + auto ptr = allocator.AllocateAligned(capacity * sizeof(HeapEntry)); + memset(ptr, 0, capacity * sizeof(HeapEntry)); + heap = reinterpret_cast *>(ptr); size = 0; } @@ -179,8 +180,9 @@ class BinaryAggregateHeap { void Initialize(ArenaAllocator &allocator, const idx_t capacity_p) { capacity = capacity_p; - heap = reinterpret_cast(allocator.AllocateAligned(capacity * sizeof(STORAGE_TYPE))); - memset(heap, 0, capacity * sizeof(STORAGE_TYPE)); + auto ptr = allocator.AllocateAligned(capacity * sizeof(STORAGE_TYPE)); + memset(ptr, 0, capacity * sizeof(STORAGE_TYPE)); + heap = reinterpret_cast(ptr); size = 0; } diff --git a/test/sql/types/enum/test_enum_from_query.test_slow b/test/sql/types/enum/test_enum_from_query.test_slow index d9d1d68cd2e6..68697606bcb5 100644 --- a/test/sql/types/enum/test_enum_from_query.test_slow +++ b/test/sql/types/enum/test_enum_from_query.test_slow @@ -107,10 +107,9 @@ DROP TABLE 
number_str; statement ok DROP TYPE number_enum; -# Throw exception for NULL -statement error -CREATE TYPE number_enum AS ENUM (SELECT NULL::VARCHAR); ----- +# This just creates an empty enum type +statement ok +CREATE TYPE empty_number_enum AS ENUM (SELECT NULL::VARCHAR); # Test inserted order statement ok From 7b9d464b956ca02705d36b9a7995098fe1f958e0 Mon Sep 17 00:00:00 2001 From: Mytherin Date: Mon, 17 Feb 2025 12:47:31 +0100 Subject: [PATCH 108/142] Modify histogram test to statement ok since the test can be inconsistent on different platforms --- .../aggregates/histogram_table_function.test | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/test/sql/aggregate/aggregates/histogram_table_function.test b/test/sql/aggregate/aggregates/histogram_table_function.test index 3b54ef1270e8..a5406cf897b1 100644 --- a/test/sql/aggregate/aggregates/histogram_table_function.test +++ b/test/sql/aggregate/aggregates/histogram_table_function.test @@ -64,19 +64,8 @@ x <= 12 13 statement ok INSERT INTO integers VALUES (99999999) -query II +statement ok SELECT * FROM histogram_values(integers, i, technique := 'equi-height') ----- -12 13 -25 13 -38 13 -50 13 -63 13 -76 13 -88 13 -101 13 -114 13 -99999999 13 # sample integers query II From bf1d472d689be49565e1029bc8ee0b132b734415 Mon Sep 17 00:00:00 2001 From: Mytherin Date: Mon, 17 Feb 2025 12:49:52 +0100 Subject: [PATCH 109/142] Check avg count --- test/sql/aggregate/aggregates/histogram_table_function.test | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/sql/aggregate/aggregates/histogram_table_function.test b/test/sql/aggregate/aggregates/histogram_table_function.test index a5406cf897b1..846789118d0b 100644 --- a/test/sql/aggregate/aggregates/histogram_table_function.test +++ b/test/sql/aggregate/aggregates/histogram_table_function.test @@ -64,8 +64,10 @@ x <= 12 13 statement ok INSERT INTO integers VALUES (99999999) -statement ok -SELECT * FROM histogram_values(integers, 
i, technique := 'equi-height') +query II +SELECT COUNT(*), AVG(count) FROM histogram_values(integers, i, technique := 'equi-height') +---- +10 13 # sample integers query II From 8651a48618241d72ecaed30684939f202b709602 Mon Sep 17 00:00:00 2001 From: taniabogatsch <44262898+taniabogatsch@users.noreply.github.com> Date: Mon, 17 Feb 2025 13:12:17 +0100 Subject: [PATCH 110/142] move optimistic writers --- .../persistent/physical_batch_insert.cpp | 28 ++++++------ .../operator/persistent/physical_insert.cpp | 13 +++--- .../operator/persistent/physical_insert.hpp | 3 +- src/include/duckdb/storage/data_table.hpp | 6 +-- .../duckdb/transaction/local_storage.hpp | 12 ++--- src/storage/data_table.cpp | 9 +--- src/storage/local_storage.cpp | 44 +++---------------- 7 files changed, 39 insertions(+), 76 deletions(-) diff --git a/src/execution/operator/persistent/physical_batch_insert.cpp b/src/execution/operator/persistent/physical_batch_insert.cpp index 0415585dd7f3..3caaff78914d 100644 --- a/src/execution/operator/persistent/physical_batch_insert.cpp +++ b/src/execution/operator/persistent/physical_batch_insert.cpp @@ -193,7 +193,7 @@ class BatchInsertLocalState : public LocalSinkState { idx_t current_index; TableAppendState current_append_state; PhysicalIndex collection_index; - optional_ptr writer; + unique_ptr optimistic_writer; unique_ptr constraint_state; void CreateNewCollection(ClientContext &context, DuckTableEntry &table_entry, @@ -230,10 +230,10 @@ class MergeCollectionTask : public BatchInsertTask { auto &l_state = l_state_p.Cast(); // Merge the collections. 
- if (!l_state.writer) { - l_state.writer = &g_state.table.GetStorage().CreateOptimisticWriter(context); + if (!l_state.optimistic_writer) { + l_state.optimistic_writer = make_uniq(g_state.table.GetStorage()); } - auto result_collection_index = g_state.MergeCollections(context, merge_collections, *l_state.writer); + auto result_collection_index = g_state.MergeCollections(context, merge_collections, *l_state.optimistic_writer); merge_collections.clear(); lock_guard l(g_state.lock); @@ -474,7 +474,7 @@ SinkNextBatchType PhysicalBatchInsert::NextBatch(ExecutionContext &context, Oper auto &collection = gstate.table.GetStorage().GetOptimisticCollection(context.client, lstate.collection_index); collection.FinalizeAppend(tdata, lstate.current_append_state); gstate.AddCollection(context.client, lstate.current_index, lstate.partition_info.min_batch_index.GetIndex(), - lstate.collection_index, lstate.writer); + lstate.collection_index, lstate.optimistic_writer); bool any_unblocked; { @@ -530,8 +530,8 @@ SinkResultType PhysicalBatchInsert::Sink(ExecutionContext &context, DataChunk &c lock_guard l(gstate.lock); // no collection yet: create a new one lstate.CreateNewCollection(context.client, table, insert_types); - if (!lstate.writer) { - lstate.writer = &table.GetStorage().CreateOptimisticWriter(context.client); + if (!lstate.optimistic_writer) { + lstate.optimistic_writer = make_uniq(table.GetStorage()); } } @@ -549,7 +549,7 @@ SinkResultType PhysicalBatchInsert::Sink(ExecutionContext &context, DataChunk &c auto new_row_group = collection.Append(lstate.insert_chunk, lstate.current_append_state); if (new_row_group) { // we have already written to disk - flush the next row group as well - lstate.writer->WriteNewRowGroup(collection); + lstate.optimistic_writer->WriteNewRowGroup(collection); } return SinkResultType::NEED_MORE_INPUT; } @@ -577,9 +577,10 @@ SinkCombineResultType PhysicalBatchInsert::Combine(ExecutionContext &context, Op lstate.collection_index = 
PhysicalIndex(DConstants::INVALID_INDEX); } } - if (lstate.writer) { + if (lstate.optimistic_writer) { lock_guard l(gstate.lock); - gstate.table.GetStorage().FinalizeOptimisticWriter(context.client, *lstate.writer); + auto &optimistic_writer = gstate.table.GetStorage().GetOptimisticWriter(context.client); + optimistic_writer.Merge(*lstate.optimistic_writer); } // unblock any blocked tasks @@ -636,9 +637,9 @@ SinkFinalizeType PhysicalBatchInsert::Finalize(Pipeline &pipeline, Event &event, // now that we have created all of the mergers, perform the actual merging vector final_collections; final_collections.reserve(mergers.size()); - auto &writer = data_table.CreateOptimisticWriter(context); + auto writer = make_uniq(data_table); for (auto &merger : mergers) { - final_collections.push_back(merger->Flush(writer)); + final_collections.push_back(merger->Flush(*writer)); } // finally, merge the row groups into the local storage @@ -648,7 +649,8 @@ SinkFinalizeType PhysicalBatchInsert::Finalize(Pipeline &pipeline, Event &event, data_table.ResetOptimisticCollection(context, collection_index); } - data_table.FinalizeOptimisticWriter(context, writer); + auto &optimistic_writer = data_table.GetOptimisticWriter(context); + optimistic_writer.Merge(*writer); memory_manager.FinalCheck(); return SinkFinalizeType::READY; } diff --git a/src/execution/operator/persistent/physical_insert.cpp b/src/execution/operator/persistent/physical_insert.cpp index 8e206c60bb05..8d5572be200c 100644 --- a/src/execution/operator/persistent/physical_insert.cpp +++ b/src/execution/operator/persistent/physical_insert.cpp @@ -665,7 +665,7 @@ SinkResultType PhysicalInsert::Sink(ExecutionContext &context, DataChunk &chunk, gstate.insert_count += lstate.insert_chunk.size(); gstate.insert_count += updated_tuples; - if (!parallel && return_chunk) { + if (return_chunk) { gstate.return_collection.Append(lstate.insert_chunk); } storage.LocalAppend(gstate.append_state, context.client, lstate.insert_chunk, true); 
@@ -692,7 +692,7 @@ SinkResultType PhysicalInsert::Sink(ExecutionContext &context, DataChunk &chunk, collection->InitializeAppend(lstate.local_append_state); lock_guard l(gstate.lock); - lstate.writer = data_table.CreateOptimisticWriter(context.client); + lstate.optimistic_writer = make_uniq(data_table); lstate.collection_index = data_table.CreateOptimisticCollection(context.client, std::move(collection)); } @@ -702,7 +702,7 @@ SinkResultType PhysicalInsert::Sink(ExecutionContext &context, DataChunk &chunk, auto &collection = data_table.GetOptimisticCollection(context.client, lstate.collection_index); auto new_row_group = collection.Append(lstate.insert_chunk, lstate.local_append_state); if (new_row_group) { - lstate.writer->WriteNewRowGroup(collection); + lstate.optimistic_writer->WriteNewRowGroup(collection); } return SinkResultType::NEED_MORE_INPUT; } @@ -743,10 +743,11 @@ SinkCombineResultType PhysicalInsert::Combine(ExecutionContext &context, Operato storage.FinalizeLocalAppend(gstate.append_state); } else { // we have written rows to disk optimistically - merge directly into the transaction-local storage - lstate.writer->WriteLastRowGroup(collection); - lstate.writer->FinalFlush(); + lstate.optimistic_writer->WriteLastRowGroup(collection); + lstate.optimistic_writer->FinalFlush(); gstate.table.GetStorage().LocalMerge(context.client, collection); - gstate.table.GetStorage().FinalizeOptimisticWriter(context.client, *lstate.writer); + auto &optimistic_writer = gstate.table.GetStorage().GetOptimisticWriter(context.client); + optimistic_writer.Merge(*lstate.optimistic_writer); } return SinkCombineResultType::FINISHED; diff --git a/src/include/duckdb/execution/operator/persistent/physical_insert.hpp b/src/include/duckdb/execution/operator/persistent/physical_insert.hpp index 9a800ae82678..ffa4f6b224e3 100644 --- a/src/include/duckdb/execution/operator/persistent/physical_insert.hpp +++ b/src/include/duckdb/execution/operator/persistent/physical_insert.hpp @@ -16,6 
+16,7 @@ #include "duckdb/storage/table/append_state.hpp" #include "duckdb/catalog/catalog_entry/duck_table_entry.hpp" #include "duckdb/storage/table/delete_state.hpp" +#include "duckdb/storage/optimistic_data_writer.hpp" namespace duckdb { @@ -55,7 +56,7 @@ class InsertLocalState : public LocalSinkState { TableAppendState local_append_state; //! An index to the optimistic row group collection vector of the local table storage for this transaction. PhysicalIndex collection_index; - optional_ptr writer; + unique_ptr optimistic_writer; // Rows that have been updated by a DO UPDATE conflict unordered_set updated_rows; idx_t update_count = 0; diff --git a/src/include/duckdb/storage/data_table.hpp b/src/include/duckdb/storage/data_table.hpp index c282a2a6560e..5d9f6c057aed 100644 --- a/src/include/duckdb/storage/data_table.hpp +++ b/src/include/duckdb/storage/data_table.hpp @@ -29,7 +29,6 @@ class ColumnDataCollection; class ColumnDefinition; class DataTable; class DuckTransaction; -class OptimisticDataWriter; class RowGroup; class StorageManager; class TableCatalogEntry; @@ -122,9 +121,8 @@ class DataTable { RowGroupCollection &GetOptimisticCollection(ClientContext &context, const PhysicalIndex collection_index); //! Resets the optimistic row group collection corresponding to the index. void ResetOptimisticCollection(ClientContext &context, const PhysicalIndex collection_index); - //! Create an optimistic writer for this table. Used for optimistically writing parallel appends. - OptimisticDataWriter &CreateOptimisticWriter(ClientContext &context); - void FinalizeOptimisticWriter(ClientContext &context, OptimisticDataWriter &writer); + //! Returns the optimistic writer of the corresponding local table. 
+ OptimisticDataWriter &GetOptimisticWriter(ClientContext &context); unique_ptr InitializeDelete(TableCatalogEntry &table, ClientContext &context, const vector> &bound_constraints); diff --git a/src/include/duckdb/transaction/local_storage.hpp b/src/include/duckdb/transaction/local_storage.hpp index 83adef5387d7..4119e968dc71 100644 --- a/src/include/duckdb/transaction/local_storage.hpp +++ b/src/include/duckdb/transaction/local_storage.hpp @@ -58,8 +58,6 @@ class LocalTableStorage : public enable_shared_from_this { vector> optimistic_collections; //! The main optimistic data writer associated with this table. OptimisticDataWriter optimistic_writer; - //! The optimistic data writers associated with this table. - vector> optimistic_writers; //! Whether or not storage was merged bool merged_storage = false; @@ -86,9 +84,8 @@ class LocalTableStorage : public enable_shared_from_this { RowGroupCollection &GetOptimisticCollection(const PhysicalIndex collection_index); //! Resets the optimistic row group collection corresponding to the index. void ResetOptimisticCollection(const PhysicalIndex collection_index); - //! Create an optimistic writer for this table. - OptimisticDataWriter &CreateOptimisticWriter(); - void FinalizeOptimisticWriter(OptimisticDataWriter &writer); + //! Returns the optimistic writer. + OptimisticDataWriter &GetOptimisticWriter(); private: mutex collections_lock; @@ -152,9 +149,8 @@ class LocalStorage { RowGroupCollection &GetOptimisticCollection(DataTable &table, const PhysicalIndex collection_index); //! Resets the optimistic row group collection corresponding to the index. void ResetOptimisticCollection(DataTable &table, const PhysicalIndex collection_index); - //! Create an optimistic writer for this table. - OptimisticDataWriter &CreateOptimisticWriter(DataTable &table); - void FinalizeOptimisticWriter(DataTable &table, OptimisticDataWriter &writer); + //! Returns the optimistic writer. 
+ OptimisticDataWriter &GetOptimisticWriter(DataTable &table); //! Delete a set of rows from the local storage idx_t Delete(DataTable &table, Vector &row_ids, idx_t count); diff --git a/src/storage/data_table.cpp b/src/storage/data_table.cpp index d34872c68778..eaddd112175a 100644 --- a/src/storage/data_table.cpp +++ b/src/storage/data_table.cpp @@ -869,14 +869,9 @@ void DataTable::ResetOptimisticCollection(ClientContext &context, const Physical local_storage.ResetOptimisticCollection(*this, collection_index); } -OptimisticDataWriter &DataTable::CreateOptimisticWriter(ClientContext &context) { +OptimisticDataWriter &DataTable::GetOptimisticWriter(ClientContext &context) { auto &local_storage = LocalStorage::Get(context, db); - return local_storage.CreateOptimisticWriter(*this); -} - -void DataTable::FinalizeOptimisticWriter(ClientContext &context, OptimisticDataWriter &writer) { - auto &local_storage = LocalStorage::Get(context, db); - local_storage.FinalizeOptimisticWriter(*this, writer); + return local_storage.GetOptimisticWriter(*this); } void DataTable::LocalMerge(ClientContext &context, RowGroupCollection &collection) { diff --git a/src/storage/local_storage.cpp b/src/storage/local_storage.cpp index 5379ddc30f83..d8339fdadb35 100644 --- a/src/storage/local_storage.cpp +++ b/src/storage/local_storage.cpp @@ -55,8 +55,7 @@ LocalTableStorage::LocalTableStorage(ClientContext &context, DataTable &new_data const vector &bound_columns, Expression &cast_expr) : table_ref(new_data_table), allocator(Allocator::Get(new_data_table.db)), deleted_rows(parent.deleted_rows), optimistic_collections(std::move(parent.optimistic_collections)), - optimistic_writer(new_data_table, parent.optimistic_writer), - optimistic_writers(std::move(parent.optimistic_writers)), merged_storage(parent.merged_storage) { + optimistic_writer(new_data_table, parent.optimistic_writer), merged_storage(parent.merged_storage) { // Alter the column type. 
row_groups = parent.row_groups->AlterType(context, alter_column_index, target_type, bound_columns, cast_expr); @@ -70,8 +69,7 @@ LocalTableStorage::LocalTableStorage(DataTable &new_data_table, LocalTableStorag const idx_t drop_column_index) : table_ref(new_data_table), allocator(Allocator::Get(new_data_table.db)), deleted_rows(parent.deleted_rows), optimistic_collections(std::move(parent.optimistic_collections)), - optimistic_writer(new_data_table, parent.optimistic_writer), - optimistic_writers(std::move(parent.optimistic_writers)), merged_storage(parent.merged_storage) { + optimistic_writer(new_data_table, parent.optimistic_writer), merged_storage(parent.merged_storage) { // Remove the column from the previous table storage. row_groups = parent.row_groups->RemoveColumn(drop_column_index); @@ -85,8 +83,7 @@ LocalTableStorage::LocalTableStorage(ClientContext &context, DataTable &new_dt, ColumnDefinition &new_column, ExpressionExecutor &default_executor) : table_ref(new_dt), allocator(Allocator::Get(new_dt.db)), deleted_rows(parent.deleted_rows), optimistic_collections(std::move(parent.optimistic_collections)), - optimistic_writer(new_dt, parent.optimistic_writer), optimistic_writers(std::move(parent.optimistic_writers)), - merged_storage(parent.merged_storage) { + optimistic_writer(new_dt, parent.optimistic_writer), merged_storage(parent.merged_storage) { row_groups = parent.row_groups->AddColumn(context, new_column, default_executor); parent.row_groups.reset(); @@ -250,33 +247,11 @@ void LocalTableStorage::ResetOptimisticCollection(const PhysicalIndex collection optimistic_collections[collection_index.index].reset(); } -OptimisticDataWriter &LocalTableStorage::CreateOptimisticWriter() { - auto writer = make_uniq(table_ref.get()); - optimistic_writers.push_back(std::move(writer)); - return *optimistic_writers.back(); -} - -void LocalTableStorage::FinalizeOptimisticWriter(OptimisticDataWriter &writer) { - // remove the writer from the set of optimistic writers - 
unique_ptr owned_writer; - for (idx_t i = 0; i < optimistic_writers.size(); i++) { - if (optimistic_writers[i].get() == &writer) { - owned_writer = std::move(optimistic_writers[i]); - optimistic_writers.erase_at(i); - break; - } - } - if (!owned_writer) { - throw InternalException("Error in FinalizeOptimisticWriter - could not find writer"); - } - optimistic_writer.Merge(*owned_writer); +OptimisticDataWriter &LocalTableStorage::GetOptimisticWriter() { + return optimistic_writer; } void LocalTableStorage::Rollback() { - for (auto &writer : optimistic_writers) { - writer->Rollback(); - } - optimistic_writers.clear(); optimistic_writer.Rollback(); for (auto &collection : optimistic_collections) { @@ -489,14 +464,9 @@ void LocalStorage::ResetOptimisticCollection(DataTable &table, const PhysicalInd storage.ResetOptimisticCollection(collection_index); } -OptimisticDataWriter &LocalStorage::CreateOptimisticWriter(DataTable &table) { - auto &storage = table_manager.GetOrCreateStorage(context, table); - return storage.CreateOptimisticWriter(); -} - -void LocalStorage::FinalizeOptimisticWriter(DataTable &table, OptimisticDataWriter &writer) { +OptimisticDataWriter &LocalStorage::GetOptimisticWriter(DataTable &table) { auto &storage = table_manager.GetOrCreateStorage(context, table); - storage.FinalizeOptimisticWriter(writer); + return storage.GetOptimisticWriter(); } bool LocalStorage::ChangesMade() noexcept { From 3136585cd87ab9244273d267578a092bea4268c4 Mon Sep 17 00:00:00 2001 From: Tishj Date: Mon, 17 Feb 2025 13:16:01 +0100 Subject: [PATCH 111/142] Execute does not like a dirty validity mask, use vector caches (through the DataChunk) just for completeness of using 'clean' Vectors every iteration --- src/execution/expression_executor/execute_operator.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/execution/expression_executor/execute_operator.cpp b/src/execution/expression_executor/execute_operator.cpp index 6f37e5d29138..04883c5deeac 
100644 --- a/src/execution/expression_executor/execute_operator.cpp +++ b/src/execution/expression_executor/execute_operator.cpp @@ -124,13 +124,16 @@ void ExpressionExecutor::Execute(const BoundOperatorExpression &expr, Expression } } SelectionVector selvec(1); - Vector intermediate(result.GetType(), 1); + DataChunk intermediate; + intermediate.Initialize(GetAllocator(), {result.GetType()}, 1); for (idx_t i = 0; i < count; i++) { + intermediate.Reset(); + intermediate.SetCardinality(1); selvec.set_index(0, sel ? sel->get_index(i) : i); Value val(result.GetType()); try { - Execute(*expr.children[0], &child_state, &selvec, 1, intermediate); - val = intermediate.GetValue(0); + Execute(*expr.children[0], &child_state, &selvec, 1, intermediate.data[0]); + val = intermediate.GetValue(0, 0); } catch (std::exception &ex) { ErrorData error(ex); auto error_type = error.Type(); From ee5cc9061f2f02f7cd5acc5e88e9d837c42cebc2 Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Mon, 17 Feb 2025 13:30:10 +0100 Subject: [PATCH 112/142] change result order now that string hash has changed --- test/api/adbc/test_adbc.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/api/adbc/test_adbc.cpp b/test/api/adbc/test_adbc.cpp index afcbb596d073..a624a66a7857 100644 --- a/test/api/adbc/test_adbc.cpp +++ b/test/api/adbc/test_adbc.cpp @@ -1364,8 +1364,8 @@ TEST_CASE("Test AdbcConnectionGetObjects", "[adbc]") { REQUIRE((res->ColumnCount() == 2)); REQUIRE((res->RowCount() == 3)); REQUIRE((res->GetValue(1, 0).ToString() == - "[{'db_schema_name': information_schema, 'db_schema_tables': NULL}, {'db_schema_name': main, " - "'db_schema_tables': NULL}, {'db_schema_name': pg_catalog, 'db_schema_tables': NULL}]")); + "[{'db_schema_name': main, 'db_schema_tables': NULL}, {'db_schema_name': pg_catalog, " + "'db_schema_tables': NULL}, {'db_schema_name': information_schema, 'db_schema_tables': NULL}]")); db.Query("Drop table result;"); 
AdbcConnectionGetObjects(&db.adbc_connection, ADBC_OBJECT_DEPTH_COLUMNS, nullptr, nullptr, nullptr, nullptr, From b27267eaed46a1610c77db5a25195e5269895e5c Mon Sep 17 00:00:00 2001 From: Mytherin Date: Mon, 17 Feb 2025 13:30:36 +0100 Subject: [PATCH 113/142] Avoid caching the compressed buffer in the ColumnReader --- extension/parquet/column_reader.cpp | 10 ++++------ extension/parquet/include/column_reader.hpp | 2 -- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/extension/parquet/column_reader.cpp b/extension/parquet/column_reader.cpp index 8a2112c356dd..73db368bc277 100644 --- a/extension/parquet/column_reader.cpp +++ b/extension/parquet/column_reader.cpp @@ -319,7 +319,8 @@ void ColumnReader::PreparePageV2(PageHeader &page_hdr) { auto compressed_bytes = page_hdr.compressed_page_size - uncompressed_bytes; - AllocateCompressed(compressed_bytes); + ResizeableBuffer compressed_buffer; + compressed_buffer.resize(GetAllocator(), compressed_bytes); reader.ReadData(*protocol, compressed_buffer.ptr, compressed_bytes); DecompressInternal(chunk->meta_data.codec, compressed_buffer.ptr, compressed_bytes, block->ptr + uncompressed_bytes, @@ -334,10 +335,6 @@ void ColumnReader::AllocateBlock(idx_t size) { } } -void ColumnReader::AllocateCompressed(idx_t size) { - compressed_buffer.resize(GetAllocator(), size); -} - void ColumnReader::PreparePage(PageHeader &page_hdr) { AllocateBlock(page_hdr.uncompressed_page_size + 1); if (chunk->meta_data.codec == CompressionCodec::UNCOMPRESSED) { @@ -348,7 +345,8 @@ void ColumnReader::PreparePage(PageHeader &page_hdr) { return; } - AllocateCompressed(page_hdr.compressed_page_size + 1); + ResizeableBuffer compressed_buffer; + compressed_buffer.resize(GetAllocator(), page_hdr.compressed_page_size + 1); reader.ReadData(*protocol, compressed_buffer.ptr, page_hdr.compressed_page_size); DecompressInternal(chunk->meta_data.codec, compressed_buffer.ptr, page_hdr.compressed_page_size, block->ptr, diff --git 
a/extension/parquet/include/column_reader.hpp b/extension/parquet/include/column_reader.hpp index 23d4fc3d4b6b..2b09623950f1 100644 --- a/extension/parquet/include/column_reader.hpp +++ b/extension/parquet/include/column_reader.hpp @@ -160,7 +160,6 @@ class ColumnReader { private: void AllocateBlock(idx_t size); - void AllocateCompressed(idx_t size); void PrepareRead(parquet_filter_t &filter); void PreparePage(PageHeader &page_hdr); void PrepareDataPage(PageHeader &page_hdr); @@ -178,7 +177,6 @@ class ColumnReader { shared_ptr block; - ResizeableBuffer compressed_buffer; ResizeableBuffer offset_buffer; unique_ptr dict_decoder; From b21d19bb6ac9d98b0aae61f40dbf1bc36bf99885 Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Mon, 17 Feb 2025 13:55:50 +0100 Subject: [PATCH 114/142] improve performance of boolean column writer too --- .../parquet/writer/boolean_column_writer.cpp | 42 ++++++++++++------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/extension/parquet/writer/boolean_column_writer.cpp b/extension/parquet/writer/boolean_column_writer.cpp index b7a3ee01856b..bcfd78b3ea82 100644 --- a/extension/parquet/writer/boolean_column_writer.cpp +++ b/extension/parquet/writer/boolean_column_writer.cpp @@ -49,22 +49,36 @@ void BooleanColumnWriter::WriteVector(WriteStream &temp_writer, ColumnWriterStat idx_t chunk_end) { auto &stats = stats_p->Cast(); auto &state = state_p->Cast(); - auto &mask = FlatVector::Validity(input_column); - - auto *ptr = FlatVector::GetData(input_column); - for (idx_t r = chunk_start; r < chunk_end; r++) { - if (mask.RowIsValid(r)) { - // only encode if non-null - if (ptr[r]) { - stats.max = true; - state.byte |= 1 << state.byte_pos; - } else { - stats.min = false; + const auto &mask = FlatVector::Validity(input_column); + + const auto *const ptr = FlatVector::GetData(input_column); + if (mask.AllValid()) { + for (idx_t r = chunk_start; r < chunk_end; r++) { + const auto &val = ptr[r]; + + stats.max |= val; + stats.min &= val; 
+ state.byte |= val << state.byte_pos; + + if (++state.byte_pos == 8) { + temp_writer.Write(state.byte); + state.byte = 0; + state.byte_pos = 0; + } + } + } else { + for (idx_t r = chunk_start; r < chunk_end; r++) { + if (!mask.RowIsValid(r)) { + continue; } - state.byte_pos++; + const auto &val = ptr[r]; + + stats.max |= val; + stats.min &= val; + state.byte |= val << state.byte_pos; - if (state.byte_pos == 8) { - temp_writer.Write(state.byte); + if (++state.byte_pos == 8) { + temp_writer.Write(state.byte); state.byte = 0; state.byte_pos = 0; } From 7d720dff253b459cdf83661f640aa1b3f4a8e0c4 Mon Sep 17 00:00:00 2001 From: Mytherin Date: Mon, 17 Feb 2025 14:09:35 +0100 Subject: [PATCH 115/142] Fix #16260: correctly handle parameters in getvariable --- src/function/scalar/generic/getvariable.cpp | 6 +++--- test/sql/variables/test_variables.test | 8 ++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/function/scalar/generic/getvariable.cpp b/src/function/scalar/generic/getvariable.cpp index 14d32954d1cf..e6eebf0d10af 100644 --- a/src/function/scalar/generic/getvariable.cpp +++ b/src/function/scalar/generic/getvariable.cpp @@ -24,12 +24,12 @@ struct GetVariableBindData : FunctionData { static unique_ptr GetVariableBind(ClientContext &context, ScalarFunction &function, vector> &arguments) { - if (!arguments[0]->IsFoldable()) { - throw NotImplementedException("getvariable requires a constant input"); - } if (arguments[0]->HasParameter()) { throw ParameterNotResolvedException(); } + if (!arguments[0]->IsFoldable()) { + throw NotImplementedException("getvariable requires a constant input"); + } Value value; auto variable_name = ExpressionExecutor::EvaluateScalar(context, *arguments[0]); if (!variable_name.IsNull()) { diff --git a/test/sql/variables/test_variables.test b/test/sql/variables/test_variables.test index ad3c15d43f57..b3dd60747846 100644 --- a/test/sql/variables/test_variables.test +++ b/test/sql/variables/test_variables.test @@ -13,6 
+13,14 @@ SELECT GETVARIABLE('animal') ---- duck +statement ok +PREPARE v1 AS SELECT GETVARIABLE($1); + +query I +EXECUTE v1('animal'); +---- +duck + # overwriting statement ok SET VARIABLE animal='bird' From 9c3cd8a48fe64124137b31f4efff284ba336f8c4 Mon Sep 17 00:00:00 2001 From: Mytherin Date: Mon, 17 Feb 2025 14:12:16 +0100 Subject: [PATCH 116/142] Handle macros as well --- src/function/scalar/generic/getvariable.cpp | 2 +- test/sql/variables/test_variables.test | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/function/scalar/generic/getvariable.cpp b/src/function/scalar/generic/getvariable.cpp index e6eebf0d10af..0181c07523bc 100644 --- a/src/function/scalar/generic/getvariable.cpp +++ b/src/function/scalar/generic/getvariable.cpp @@ -24,7 +24,7 @@ struct GetVariableBindData : FunctionData { static unique_ptr GetVariableBind(ClientContext &context, ScalarFunction &function, vector> &arguments) { - if (arguments[0]->HasParameter()) { + if (arguments[0]->HasParameter() || arguments[0]->return_type.id() == LogicalTypeId::UNKNOWN) { throw ParameterNotResolvedException(); } if (!arguments[0]->IsFoldable()) { diff --git a/test/sql/variables/test_variables.test b/test/sql/variables/test_variables.test index b3dd60747846..b67d81686222 100644 --- a/test/sql/variables/test_variables.test +++ b/test/sql/variables/test_variables.test @@ -21,6 +21,14 @@ EXECUTE v1('animal'); ---- duck +statement ok +CREATE MACRO _(x) AS getvariable(x); + +query I +SELECT _('animal') +---- +duck + # overwriting statement ok SET VARIABLE animal='bird' From ba6fe78896ab3f5dd771f1c3af2eece82e787ded Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Mon, 17 Feb 2025 14:53:12 +0100 Subject: [PATCH 117/142] change extension install mode to not_installed instead of null --- src/function/table/system/duckdb_extensions.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/function/table/system/duckdb_extensions.cpp 
b/src/function/table/system/duckdb_extensions.cpp index 0edc2c2ff929..8adeb356afde 100644 --- a/src/function/table/system/duckdb_extensions.cpp +++ b/src/function/table/system/duckdb_extensions.cpp @@ -84,7 +84,7 @@ unique_ptr DuckDBExtensionsInit(ClientContext &context info.loaded = false; info.file_path = extension.statically_loaded ? "(BUILT-IN)" : string(); info.install_mode = - extension.statically_loaded ? ExtensionInstallMode::STATICALLY_LINKED : ExtensionInstallMode::UNKNOWN; + extension.statically_loaded ? ExtensionInstallMode::STATICALLY_LINKED : ExtensionInstallMode::NOT_INSTALLED; info.description = extension.description; for (idx_t k = 0; k < alias_count; k++) { auto alias = ExtensionHelper::GetExtensionAlias(k); @@ -206,7 +206,7 @@ void DuckDBExtensionsFunction(ClientContext &context, TableFunctionInput &data_p // extension version LogicalType::LIST(LogicalType::VARCHAR) output.SetValue(6, count, Value(entry.extension_version)); // installed_mode LogicalType::VARCHAR - output.SetValue(7, count, entry.installed ? 
Value(EnumUtil::ToString(entry.install_mode)) : Value()); + output.SetValue(7, count, EnumUtil::ToString(entry.install_mode)); // installed_source LogicalType::VARCHAR output.SetValue(8, count, Value(entry.installed_from)); From 80fa4cd08d4b89ab796dccf88b9489a3c262e4cf Mon Sep 17 00:00:00 2001 From: Tishj Date: Mon, 17 Feb 2025 14:59:02 +0100 Subject: [PATCH 118/142] add the correct variant of the flag based on the compiler (MSVC or not) --- CMakeLists.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index c61c7f9def65..24ca161f0d83 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1420,6 +1420,15 @@ if(BUILD_PYTHON) set(ALL_COMPILE_FLAGS "${CMAKE_CXX_FLAGS}") endif() + # Check for MSVC compiler and set the correct C++ standard flag + if(MSVC) + # MSVC does not support `-std=c++11` or `-std=c++14`, use `/std:c++14` + set(ALL_COMPILE_FLAGS "${ALL_COMPILE_FLAGS} /std:c++14") + else() + # For non-MSVC compilers, use the `-std=c++11` + set(ALL_COMPILE_FLAGS "${ALL_COMPILE_FLAGS} -std=c++11") + endif() + get_target_property(duckdb_libs duckdb LINK_LIBRARIES) set(PIP_COMMAND From 7c3296fedf07754df1bf6963c1b76fd5367b4f73 Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Mon, 17 Feb 2025 15:01:27 +0100 Subject: [PATCH 119/142] add test --- test/extension/autoloading_base.test | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/extension/autoloading_base.test b/test/extension/autoloading_base.test index 591383eb80f6..80c120e8d119 100644 --- a/test/extension/autoloading_base.test +++ b/test/extension/autoloading_base.test @@ -15,6 +15,12 @@ SELECT (count(*) > 0) FROM duckdb_extensions() WHERE install_path ILIKE '%duckdb ---- false +# All extensions reported by duckdb are either statically linked or not installed +query I +SELECT count(*) FROM duckdb_extensions() WHERE install_mode != 'NOT_INSTALLED' AND install_mode != 'STATICALLY_LINKED' +---- +0 + ### No autoloading nor installing: throw error with installation hint 
statement ok set autoload_known_extensions=false From f066290ff0dd7ad5931499719710097bb95c2383 Mon Sep 17 00:00:00 2001 From: Mytherin Date: Mon, 17 Feb 2025 15:37:39 +0100 Subject: [PATCH 120/142] Avoid calling SetFilterAlwaysTrue multiple times in RowGroup::CheckZonemap --- src/storage/table/row_group.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/storage/table/row_group.cpp b/src/storage/table/row_group.cpp index d5250387362b..55c6e064f4e5 100644 --- a/src/storage/table/row_group.cpp +++ b/src/storage/table/row_group.cpp @@ -430,14 +430,13 @@ bool RowGroup::CheckZonemap(ScanFilterInfo &filters) { if (prune_result == FilterPropagateResult::FILTER_ALWAYS_FALSE) { return false; } - if (prune_result == FilterPropagateResult::FILTER_ALWAYS_TRUE) { - // filter is always true - no need to check it - // label the filter as always true so we don't need to check it anymore - filters.SetFilterAlwaysTrue(i); - } if (filter.filter_type == TableFilterType::OPTIONAL_FILTER) { // these are only for row group checking, set as always true so we don't check it filters.SetFilterAlwaysTrue(i); + } else if (prune_result == FilterPropagateResult::FILTER_ALWAYS_TRUE) { + // filter is always true - no need to check it + // label the filter as always true so we don't need to check it anymore + filters.SetFilterAlwaysTrue(i); } } return true; @@ -619,7 +618,7 @@ void RowGroup::TemplatedScan(TransactionData transaction, CollectionScanState &s if (prune_result == FilterPropagateResult::FILTER_ALWAYS_FALSE) { // We can just break out of the loop here. 
approved_tuple_count = 0; - break; + continue; } // Generate row ids From 6637b90bb5a3cf9d5e59dc6ef51795bbb17b1bd1 Mon Sep 17 00:00:00 2001 From: Mytherin Date: Mon, 17 Feb 2025 15:39:54 +0100 Subject: [PATCH 121/142] Add safeguard to SetFilterAlwaysTrue --- src/storage/table/scan_state.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/storage/table/scan_state.cpp b/src/storage/table/scan_state.cpp index adeccde91b03..fdfa76433059 100644 --- a/src/storage/table/scan_state.cpp +++ b/src/storage/table/scan_state.cpp @@ -96,6 +96,9 @@ void ScanFilterInfo::CheckAllFilters() { void ScanFilterInfo::SetFilterAlwaysTrue(idx_t filter_idx) { auto &filter = filter_list[filter_idx]; + if (filter.always_true) { + return; + } filter.always_true = true; column_has_filter[filter.scan_column_index] = false; always_true_filters++; From fe56c8f0554fe31f03a0eef51530c3dbccc9770e Mon Sep 17 00:00:00 2001 From: taniabogatsch <44262898+taniabogatsch@users.noreply.github.com> Date: Mon, 17 Feb 2025 15:55:08 +0100 Subject: [PATCH 122/142] fix scanning from normal leaf to nested leaf --- src/execution/index/art/iterator.cpp | 10 +++-- .../duckdb/execution/index/art/iterator.hpp | 2 + .../scan/test_art_scan_normal_to_nested.test | 38 +++++++++++++++++++ 3 files changed, 47 insertions(+), 3 deletions(-) create mode 100644 test/sql/index/art/scan/test_art_scan_normal_to_nested.test diff --git a/src/execution/index/art/iterator.cpp b/src/execution/index/art/iterator.cpp index 689029a02e40..1c138e1d3e34 100644 --- a/src/execution/index/art/iterator.cpp +++ b/src/execution/index/art/iterator.cpp @@ -46,9 +46,11 @@ bool Iterator::Scan(const ARTKey &upper_bound, const idx_t max_count, unsafe_vec bool has_next; do { // An empty upper bound indicates that no upper bound exists. 
- if (!upper_bound.Empty() && status == GateStatus::GATE_NOT_SET) { - if (current_key.GreaterThan(upper_bound, equal, nested_depth)) { - return true; + if (!upper_bound.Empty()) { + if (status == GateStatus::GATE_NOT_SET || entered_nested_leaf) { + if (current_key.GreaterThan(upper_bound, equal, nested_depth)) { + return true; + } } } @@ -86,6 +88,7 @@ bool Iterator::Scan(const ARTKey &upper_bound, const idx_t max_count, unsafe_vec throw InternalException("Invalid leaf type for index scan."); } + entered_nested_leaf = false; has_next = Next(); } while (has_next); return true; @@ -104,6 +107,7 @@ void Iterator::FindMinimum(const Node &node) { if (node.GetGateStatus() == GateStatus::GATE_SET) { D_ASSERT(status == GateStatus::GATE_NOT_SET); status = GateStatus::GATE_SET; + entered_nested_leaf = true; nested_depth = 0; } diff --git a/src/include/duckdb/execution/index/art/iterator.hpp b/src/include/duckdb/execution/index/art/iterator.hpp index 58a0f106d54d..977cc7791081 100644 --- a/src/include/duckdb/execution/index/art/iterator.hpp +++ b/src/include/duckdb/execution/index/art/iterator.hpp @@ -90,6 +90,8 @@ class Iterator { GateStatus status; //! Depth in a nested leaf. uint8_t nested_depth = 0; + //! True, if we entered a nested leaf to retrieve the next node. + bool entered_nested_leaf = false; private: //! Goes to the next leaf in the ART and sets it as last_leaf, diff --git a/test/sql/index/art/scan/test_art_scan_normal_to_nested.test b/test/sql/index/art/scan/test_art_scan_normal_to_nested.test new file mode 100644 index 000000000000..0cd8cf886fe5 --- /dev/null +++ b/test/sql/index/art/scan/test_art_scan_normal_to_nested.test @@ -0,0 +1,38 @@ +# name: test/sql/index/art/scan/test_art_scan_normal_to_nested.test +# description: Test range scanning with an iterator moving from a normal leaf to a nested leaf. 
+# group: [scan] + +statement ok +PRAGMA enable_verification + +statement ok +CREATE TABLE integers (i BIGINT); + +statement ok +CREATE INDEX idx_integers ON integers (i); + +statement ok +INSERT INTO integers (i) VALUES ('1'), ('-1'), ('1'); + +# The border is exactly when moving from a non-nested leaf to a nested leaf. + +query I +SELECT i FROM integers WHERE i <= 0; +---- +-1 + +# Issue 16074. + +statement ok +CREATE TABLE t0(c1 TIMESTAMP); + +statement ok +INSERT INTO t0(c1) VALUES ('2020-02-29 12:00:00'), ('1969-12-09 09:26:38'), ('2020-02-29 12:00:00'); + +statement ok +CREATE INDEX i0 ON t0(c1); + +query I +SELECT c1 FROM t0 WHERE c1 <= '2007-07-07 07:07:07'; +---- +1969-12-09 09:26:38 \ No newline at end of file From 1d06c91a381eff4f507690cbb17bf0ca7a174fe0 Mon Sep 17 00:00:00 2001 From: Mytherin Date: Mon, 17 Feb 2025 16:01:03 +0100 Subject: [PATCH 123/142] Fix #16231: refer to order by condition in ARRAY(SUBQUERY) by alias instead of by index --- src/parser/transform/expression/transform_subquery.cpp | 5 ++++- test/sql/subquery/scalar/array_order_subquery.test | 10 ++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/parser/transform/expression/transform_subquery.cpp b/src/parser/transform/expression/transform_subquery.cpp index 6f6d742073ba..0403d24bc5dc 100644 --- a/src/parser/transform/expression/transform_subquery.cpp +++ b/src/parser/transform/expression/transform_subquery.cpp @@ -107,6 +107,7 @@ unique_ptr Transformer::TransformSubquery(duckdb_libpgquery::P } } // transform constants (e.g. 
ORDER BY 1) into positional references (ORDER BY #1) + idx_t array_idx = 0; if (aggr->order_bys) { for (auto &order : aggr->order_bys->orders) { if (order.expression->GetExpressionType() == ExpressionType::VALUE_CONSTANT) { @@ -120,8 +121,10 @@ unique_ptr Transformer::TransformSubquery(duckdb_libpgquery::P } } else if (sub_select) { // if we have a SELECT we can push the ORDER BY clause into the SELECT list and reference it + auto alias = "__array_internal_idx_" + to_string(++array_idx); + order.expression->alias = alias; sub_select->select_list.push_back(std::move(order.expression)); - order.expression = make_uniq(sub_select->select_list.size() - 1); + order.expression = make_uniq(alias); } else { // otherwise we remove order qualifications RemoveOrderQualificationRecursive(order.expression); diff --git a/test/sql/subquery/scalar/array_order_subquery.test b/test/sql/subquery/scalar/array_order_subquery.test index a0ca2fb4c7d0..94abd308009a 100644 --- a/test/sql/subquery/scalar/array_order_subquery.test +++ b/test/sql/subquery/scalar/array_order_subquery.test @@ -86,6 +86,16 @@ SELECT ARRAY ---- [3, 2, 1] +query I +select array(select * from unnest(['a', 'b']) as _t(u) order by if(u='a',100, 1)) as out; +---- +[b, a] + +query I +select array(select * from unnest(['a', 'b']) as _t(u) order by if(u='a',100, 1) desc) as out; +---- +[a, b] + statement error SELECT ARRAY (SELECT 1 UNION ALL From 7dec52e599066b4e17f9a3aa47cbec400053ce2b Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Mon, 17 Feb 2025 15:37:12 +0100 Subject: [PATCH 124/142] add pragma to truncate log --- src/function/pragma/pragma_functions.cpp | 6 +++++ src/include/duckdb/logging/log_manager.hpp | 2 ++ src/include/duckdb/logging/log_storage.hpp | 7 ++++++ src/logging/log_manager.cpp | 5 +++++ src/logging/log_storage.cpp | 22 +++++++++++++++++++ .../test_logging_function_large.test_slow | 8 +++++++ 6 files changed, 50 insertions(+) diff --git a/src/function/pragma/pragma_functions.cpp 
b/src/function/pragma/pragma_functions.cpp index 635828066029..5612f519c1e0 100644 --- a/src/function/pragma/pragma_functions.cpp +++ b/src/function/pragma/pragma_functions.cpp @@ -94,6 +94,10 @@ static void PragmaForceCheckpoint(ClientContext &context, const FunctionParamete DBConfig::GetConfig(context).options.force_checkpoint = true; } +static void PragmaTruncateDuckDBLogs(ClientContext &context, const FunctionParameters ¶meters) { + context.db->GetLogManager().TruncateLogStorage(); +} + static void PragmaDisableForceParallelism(ClientContext &context, const FunctionParameters ¶meters) { ClientConfig::GetConfig(context).verify_parallelism = false; } @@ -149,6 +153,8 @@ void PragmaFunctions::RegisterFunction(BuiltinFunctions &set) { set.AddFunction(PragmaFunction::PragmaStatement("force_checkpoint", PragmaForceCheckpoint)); + set.AddFunction(PragmaFunction::PragmaStatement("truncate_duckdb_logs", PragmaTruncateDuckDBLogs)); + set.AddFunction(PragmaFunction::PragmaStatement("enable_progress_bar", PragmaEnableProgressBar)); set.AddFunction(PragmaFunction::PragmaStatement("disable_progress_bar", PragmaDisableProgressBar)); diff --git a/src/include/duckdb/logging/log_manager.hpp b/src/include/duckdb/logging/log_manager.hpp index 90c6384c3274..6f414d9efa7e 100644 --- a/src/include/duckdb/logging/log_manager.hpp +++ b/src/include/duckdb/logging/log_manager.hpp @@ -54,6 +54,8 @@ class LogManager : public enable_shared_from_this { DUCKDB_API void SetDisabledLogTypes(unordered_set &disabled_log_types); DUCKDB_API void SetLogStorage(DatabaseInstance &db, const string &storage_name); + DUCKDB_API void TruncateLogStorage(); + DUCKDB_API LogConfig GetConfig(); protected: diff --git a/src/include/duckdb/logging/log_storage.hpp b/src/include/duckdb/logging/log_storage.hpp index f99175b590ea..d30d370a7028 100644 --- a/src/include/duckdb/logging/log_storage.hpp +++ b/src/include/duckdb/logging/log_storage.hpp @@ -61,6 +61,8 @@ class LogStorage { DUCKDB_API virtual unique_ptr 
CreateScanContextsState() const; DUCKDB_API virtual bool ScanContexts(LogStorageScanState &state, DataChunk &result) const; DUCKDB_API virtual void InitializeScanContexts(LogStorageScanState &state) const; + + DUCKDB_API virtual void Truncate(); }; class StdOutLogStorage : public LogStorage { @@ -73,6 +75,8 @@ class StdOutLogStorage : public LogStorage { const RegisteredLoggingContext &context) override; void WriteLogEntries(DataChunk &chunk, const RegisteredLoggingContext &context) override; void Flush() override; + + void Truncate() override; }; class InMemoryLogStorageScanState : public LogStorageScanState { @@ -94,6 +98,8 @@ class InMemoryLogStorage : public LogStorage { void WriteLogEntries(DataChunk &chunk, const RegisteredLoggingContext &context) override; void Flush() override; + void Truncate() override; + //! LogStorage API: READING bool CanScan() override; @@ -106,6 +112,7 @@ class InMemoryLogStorage : public LogStorage { protected: void WriteLoggingContext(const RegisteredLoggingContext &context); + void ResetBuffers(); protected: mutable mutex lock; diff --git a/src/logging/log_manager.cpp b/src/logging/log_manager.cpp index c937b3fda006..f493e2ee57b2 100644 --- a/src/logging/log_manager.cpp +++ b/src/logging/log_manager.cpp @@ -149,6 +149,11 @@ void LogManager::SetLogStorage(DatabaseInstance &db, const string &storage_name) config.storage = storage_name_to_lower; } +void LogManager::TruncateLogStorage() { + unique_lock lck(lock); + log_storage->Truncate(); +} + LogConfig LogManager::GetConfig() { unique_lock lck(lock); return config; diff --git a/src/logging/log_storage.cpp b/src/logging/log_storage.cpp index 909bddf75830..8afb3f84e174 100644 --- a/src/logging/log_storage.cpp +++ b/src/logging/log_storage.cpp @@ -25,6 +25,9 @@ bool LogStorage::ScanContexts(LogStorageScanState &state, DataChunk &result) con void LogStorage::InitializeScanContexts(LogStorageScanState &state) const { throw NotImplementedException("Not implemented for this LogStorage: 
InitializeScanContexts"); } +void LogStorage::Truncate() { + throw NotImplementedException("Not implemented for this LogStorage: TruncateLogStorage"); +} StdOutLogStorage::StdOutLogStorage() { } @@ -46,6 +49,10 @@ void StdOutLogStorage::WriteLogEntries(DataChunk &chunk, const RegisteredLogging throw NotImplementedException("StdOutLogStorage::WriteLogEntries"); } +void StdOutLogStorage::Truncate() { + // NOP +} + void StdOutLogStorage::Flush() { // NOP } @@ -82,6 +89,16 @@ InMemoryLogStorage::InMemoryLogStorage(DatabaseInstance &db_p) log_contexts = make_uniq(db_p.GetBufferManager(), log_context_schema); } +void InMemoryLogStorage::ResetBuffers() { + entry_buffer->Reset(); + log_context_buffer->Reset(); + + log_entries->Reset(); + log_contexts->Reset(); + + registered_contexts.clear(); +} + InMemoryLogStorage::~InMemoryLogStorage() { } @@ -122,6 +139,11 @@ void InMemoryLogStorage::Flush() { FlushInternal(); } +void InMemoryLogStorage::Truncate() { + unique_lock lck(lock); + ResetBuffers(); +} + void InMemoryLogStorage::FlushInternal() { if (entry_buffer->size() > 0) { log_entries->Append(*entry_buffer); diff --git a/test/sql/logging/test_logging_function_large.test_slow b/test/sql/logging/test_logging_function_large.test_slow index 7cb6195aaf1c..4bb29995aa02 100644 --- a/test/sql/logging/test_logging_function_large.test_slow +++ b/test/sql/logging/test_logging_function_large.test_slow @@ -33,3 +33,11 @@ SELECT count(*), message FROM duckdb_logs where starts_with(message, 'hi_') grou 250000 hi_client 250000 hi_file 250000 hi_global + +statement ok +pragma truncate_duckdb_logs; + +query I +SELECT count(*) FROM duckdb_logs; +---- +0 \ No newline at end of file From fc58d8b7826ec422d34327bed498b4384f661832 Mon Sep 17 00:00:00 2001 From: taniabogatsch <44262898+taniabogatsch@users.noreply.github.com> Date: Mon, 17 Feb 2025 16:14:02 +0100 Subject: [PATCH 125/142] increase max variation for linux --- .../parallel/reclaim_space_primary_key_optimistic.test_slow | 4 ++-- 1 
file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/sql/storage/parallel/reclaim_space_primary_key_optimistic.test_slow b/test/sql/storage/parallel/reclaim_space_primary_key_optimistic.test_slow index 6865f4d0a75d..07cce7185d9a 100644 --- a/test/sql/storage/parallel/reclaim_space_primary_key_optimistic.test_slow +++ b/test/sql/storage/parallel/reclaim_space_primary_key_optimistic.test_slow @@ -78,10 +78,10 @@ SELECT COUNT(*) - ${i} FROM integers2; query I SELECT CASE WHEN ${i} = 0 THEN True::test_result - WHEN current.total_blocks <= total_blocks_tbl.total_blocks * 1.4 THEN True::test_result + WHEN current.total_blocks <= total_blocks_tbl.total_blocks * 1.6 THEN True::test_result ELSE { 'old': total_blocks_tbl.total_blocks, - 'allowed_max': total_blocks_tbl.total_blocks * 1.4, + 'allowed_max': total_blocks_tbl.total_blocks * 1.6, 'actual': current.total_blocks }::test_result END From 16f1151c6a077628f1e90d30c0de57f125b97654 Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Mon, 17 Feb 2025 16:16:38 +0100 Subject: [PATCH 126/142] fix #16257 --- extension/parquet/column_writer.cpp | 19 +++++++------- .../parquet/include/parquet_bss_encoder.hpp | 1 - .../parquet/include/parquet_dlba_encoder.hpp | 3 +-- test/issues/general/test_16257.test_slow | 25 +++++++++++++++++++ 4 files changed, 36 insertions(+), 12 deletions(-) create mode 100644 test/issues/general/test_16257.test_slow diff --git a/extension/parquet/column_writer.cpp b/extension/parquet/column_writer.cpp index ba42a9b2f20a..8791bc596c08 100644 --- a/extension/parquet/column_writer.cpp +++ b/extension/parquet/column_writer.cpp @@ -388,7 +388,7 @@ class BasicColumnWriter : public ColumnWriter { virtual unique_ptr InitializeStatsState(); //! Initialize the writer for a specific page. Only used for scalar types. - virtual unique_ptr InitializePageState(BasicColumnWriterState &state); + virtual unique_ptr InitializePageState(BasicColumnWriterState &state, idx_t page_idx); //! 
Flushes the writer for a specific page. Only used for scalar types. virtual void FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state); @@ -427,7 +427,8 @@ void BasicColumnWriter::RegisterToRowGroup(duckdb_parquet::RowGroup &row_group) row_group.columns.push_back(std::move(column_chunk)); } -unique_ptr BasicColumnWriter::InitializePageState(BasicColumnWriterState &state) { +unique_ptr BasicColumnWriter::InitializePageState(BasicColumnWriterState &state, + idx_t page_idx) { return nullptr; } @@ -502,7 +503,7 @@ void BasicColumnWriter::BeginWrite(ColumnWriterState &state_p) { MaxValue(NextPowerOfTwo(page_info.estimated_page_size), MemoryStream::DEFAULT_INITIAL_CAPACITY)); write_info.write_count = page_info.empty_count; write_info.max_write_count = page_info.row_count; - write_info.page_state = InitializePageState(state); + write_info.page_state = InitializePageState(state, page_idx); write_info.compressed_size = 0; write_info.compressed_data = nullptr; @@ -1232,11 +1233,11 @@ class StandardColumnWriter : public BasicColumnWriter { return std::move(result); } - unique_ptr InitializePageState(BasicColumnWriterState &state_p) override { + unique_ptr InitializePageState(BasicColumnWriterState &state_p, idx_t page_idx) override { auto &state = state_p.Cast>(); - - auto result = make_uniq>(state.total_value_count, state.total_string_size, - state.encoding, state.dictionary); + const auto &page_info = state_p.page_info[page_idx]; + auto result = make_uniq>( + page_info.row_count - page_info.empty_count, state.total_string_size, state.encoding, state.dictionary); return std::move(result); } @@ -1586,7 +1587,7 @@ class BooleanColumnWriter : public BasicColumnWriter { } } - unique_ptr InitializePageState(BasicColumnWriterState &state) override { + unique_ptr InitializePageState(BasicColumnWriterState &state, idx_t page_idx) override { return make_uniq(); } @@ -1828,7 +1829,7 @@ class EnumColumnWriter : public BasicColumnWriter { } } - unique_ptr 
InitializePageState(BasicColumnWriterState &state) override { + unique_ptr InitializePageState(BasicColumnWriterState &state, idx_t page_idx) override { return make_uniq(bit_width); } diff --git a/extension/parquet/include/parquet_bss_encoder.hpp b/extension/parquet/include/parquet_bss_encoder.hpp index 80da1726de92..65561eb2573a 100644 --- a/extension/parquet/include/parquet_bss_encoder.hpp +++ b/extension/parquet/include/parquet_bss_encoder.hpp @@ -30,7 +30,6 @@ class BssEncoder { } void FinishWrite(WriteStream &writer) { - D_ASSERT(count == total_value_count); writer.WriteData(buffer.get(), total_value_count * bit_width); } diff --git a/extension/parquet/include/parquet_dlba_encoder.hpp b/extension/parquet/include/parquet_dlba_encoder.hpp index b3cd1aa96076..89702fc12e41 100644 --- a/extension/parquet/include/parquet_dlba_encoder.hpp +++ b/extension/parquet/include/parquet_dlba_encoder.hpp @@ -33,9 +33,8 @@ class DlbaEncoder { } void FinishWrite(WriteStream &writer) { - D_ASSERT(stream->GetPosition() == total_string_size); dbp_encoder.FinishWrite(writer); - writer.WriteData(buffer.get(), total_string_size); + writer.WriteData(buffer.get(), stream->GetPosition()); } private: diff --git a/test/issues/general/test_16257.test_slow b/test/issues/general/test_16257.test_slow new file mode 100644 index 000000000000..6b3faf9a7ba4 --- /dev/null +++ b/test/issues/general/test_16257.test_slow @@ -0,0 +1,25 @@ +# name: test/issues/general/test_16257.test_slow +# description: Issue 16257 - value count mismatch when writing DELTA_BINARY_PACKED +# group: [general] + +require parquet + +# Some macros to generate lorem ipsum +statement ok +CREATE OR REPLACE MACRO deterministic_random(rand) AS hash(rand) / 18446744073709551615; + +statement ok +CREATE OR REPLACE MACRO lorem_word(rand) AS ['voluptatem', 'quaerat', 'quiquia', 'non', 'dolore', 'dolorem', 'labore', 'consectetur', 'porro', 'sed', 'numquam', 'aliquam', 'sit', 'eius', 'modi', 'est', 'amet', 'magnam', 'dolor', 
'etincidunt', 'velit', 'neque', 'ipsum', 'adipisci', 'quisquam', 'ut', 'tempora'][1 + floor(rand * 27 % 27)::BIGINT]; + +statement ok +CREATE OR REPLACE MACRO lorem_sentence_util(s) AS upper(s[1]) || s[2:] || '.'; + +statement ok +CREATE OR REPLACE MACRO lorem_sentence(rand, words) AS lorem_sentence_util(list_aggr([lorem_word(deterministic_random(rand + i)) for i in range(words)], 'string_agg', ' ')); + + +statement ok +SET preserve_insertion_order=false; + +statement ok +COPY (SELECT lorem_sentence(random(), 20) FROM range(1_000_000)) TO '__TEST_DIR__/16257.parquet' (PARQUET_VERSION V2, ROW_GROUP_SIZE 2_000_000); From 73e15e8b217b41ca263b5c414c6a2cd92088af36 Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Mon, 17 Feb 2025 17:10:57 +0100 Subject: [PATCH 127/142] even faster boolean writing --- extension/parquet/writer/boolean_column_writer.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/extension/parquet/writer/boolean_column_writer.cpp b/extension/parquet/writer/boolean_column_writer.cpp index bcfd78b3ea82..a8b2f9add185 100644 --- a/extension/parquet/writer/boolean_column_writer.cpp +++ b/extension/parquet/writer/boolean_column_writer.cpp @@ -52,14 +52,11 @@ void BooleanColumnWriter::WriteVector(WriteStream &temp_writer, ColumnWriterStat const auto &mask = FlatVector::Validity(input_column); const auto *const ptr = FlatVector::GetData(input_column); - if (mask.AllValid()) { + if (stats.max && !stats.min && mask.AllValid()) { + // Fast path: stats have already been set, and there's no NULLs for (idx_t r = chunk_start; r < chunk_end; r++) { const auto &val = ptr[r]; - - stats.max |= val; - stats.min &= val; state.byte |= val << state.byte_pos; - if (++state.byte_pos == 8) { temp_writer.Write(state.byte); state.byte = 0; From 81bd903c1619a809e7f047b87cb392704da45b7c Mon Sep 17 00:00:00 2001 From: pdet Date: Mon, 10 Feb 2025 09:34:13 -0300 Subject: [PATCH 128/142] Adding fuzzer tests --- data/csv/afl/4172/case_1.csv | Bin 0 -> 2398 
bytes data/csv/afl/4172/case_2.csv | Bin 0 -> 229 bytes data/csv/afl/4172/case_3.csv | Bin 0 -> 257 bytes data/csv/afl/4172/case_4.csv | Bin 0 -> 239 bytes data/csv/afl/4172/case_5.csv | Bin 0 -> 240 bytes test/sql/copy/csv/afl/test_fuzz_4172.test | 32 ++++++++++++++++++++++ 6 files changed, 32 insertions(+) create mode 100644 data/csv/afl/4172/case_1.csv create mode 100644 data/csv/afl/4172/case_2.csv create mode 100644 data/csv/afl/4172/case_3.csv create mode 100644 data/csv/afl/4172/case_4.csv create mode 100644 data/csv/afl/4172/case_5.csv create mode 100644 test/sql/copy/csv/afl/test_fuzz_4172.test diff --git a/data/csv/afl/4172/case_1.csv b/data/csv/afl/4172/case_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..47200b6b4d2ec0e76ba53d1168e9d63a90edfebc GIT binary patch literal 2398 zcmeHI-H+Nv5Z_dZbVkZ^+qV&Y@C%?_^KsxlUDE4^)TGUYM%Sv+X|NaAlkw8^`sn2p z+Jt;)lgjk!KN5?JqPK7B;@%5ewmZ<4q#_^Qh1X$RXFz^m*GUlp zS=^vWmT1D(tCRcTpZ2H}fvr*xcIgIHYC)4supfnsGdPGkct@l)R&zUr78gHUp4_8K z{_~bG@om%^GWqCy^7zS%;RSy-n4Vw$F}VEx^4DhL&V-iGPMNwCXZWI9* zG3T-1LBykemk+vp1Y#8P(>NLhj=$zR;<{^JUF*{1n5mcY%9NRG$IzY=1O-)J0_G_c z@N4u2tJ~O&pEG^8nu}dh|ItRN{1tdDRDwnwhgj{YQM>k*&PNce# zDX1JWHIv5PdLPthl~Dasu$mYotc_mlLU}8@MB>RZSWz5EQ6)xtUHMmuSr2l(fepn< z5}7Y>QPCqToEjwRMrE);dI@u-mX3N9H|L}QUTI3u~fO0$1$Pb zop!qakws$E+Lg+YIwd2ss+i4kZ^Lkzjz!;T?G6j}Hz1_DD}eTd1&aLfJZ;BaKn5`+$p4dt;Q9T|J-g zO+yqe-6Mq4$9{v@bMo3 zxBRprM@Qg8Ek1k6I;_t)d&pYs346mb`buU9jo6TF$L{DldCFSy<>KCM9zLSw_ZOI$$nWh?PQSUWtx^dWI3ug0jS-WQYN*|NpaQBo=`z llKP{jqn=@?lVQYVW}s8fz{J47rEjQHo)0w0%*3xe9{^6dIeh>C literal 0 HcmV?d00001 diff --git a/data/csv/afl/4172/case_3.csv b/data/csv/afl/4172/case_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..691f0c0e161588d33fb6b4be9499800244f0176c GIT binary patch literal 257 zcmY+8JqyA>42JKvl0k6Q<>=ro0nMG(a+`>#i$f{yR~0H$3&jsa{89craq#*fXdv*0 zym{XE$MfN~Yu$Z858nuSN+aYtvq+FYlG(o|PJZQwAOCbQi6jb{ycl~eSMh8v6_ek!>Li(Iy*kN@>7qgxHAv@#Ofaa<5%%v8e&0 
pv6!T`D1$oh9?Fz3RS*pDnwO|PH6l=rP#Rrw-{?SpSHsmc{{U@iK(+t? literal 0 HcmV?d00001 diff --git a/data/csv/afl/4172/case_4.csv b/data/csv/afl/4172/case_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..c3c65cce9923449b0e63966944fb1d1b6b0919a2 GIT binary patch literal 239 zcmXrwVwB`y{LjF^zz73B)Mdq^)j>RUCr@|J1KH|23Z_sX$-&IWC?q4Mu9~Z_nxd~N zsimNb3QSE^bv> Date: Mon, 10 Feb 2025 10:24:30 -0300 Subject: [PATCH 129/142] Fix for buffer_size=1 in encoding, and check for conflicts between maximum line size and buffer size --- .../operator/csv_scanner/encode/csv_encoder.cpp | 4 ++++ .../operator/csv_scanner/util/csv_reader_options.cpp | 10 ++++++++++ test/sql/copy/csv/afl/test_fuzz_4172.test | 6 +++--- .../copy/csv/parallel/csv_parallel_buffer_size.test | 8 ++++---- test/sql/copy/csv/relaxed_quotes.test | 7 +------ test/sql/copy/csv/test_validator.test | 2 +- 6 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/execution/operator/csv_scanner/encode/csv_encoder.cpp b/src/execution/operator/csv_scanner/encode/csv_encoder.cpp index 89fc5df040bd..8a6c08032597 100644 --- a/src/execution/operator/csv_scanner/encode/csv_encoder.cpp +++ b/src/execution/operator/csv_scanner/encode/csv_encoder.cpp @@ -51,6 +51,10 @@ CSVEncoder::CSVEncoder(DBConfig &config, const string &encoding_name_to_find, id } // We ensure that the encoded buffer size is an even number to make the two byte lookup on utf-16 work idx_t encoded_buffer_size = buffer_size % 2 != 0 ? 
buffer_size - 1 : buffer_size; + if (encoded_buffer_size == 0) { + // This might happen if buffer size = 1 + encoded_buffer_size = 2; + } D_ASSERT(encoded_buffer_size > 0); encoded_buffer.Initialize(encoded_buffer_size); remaining_bytes_buffer.Initialize(function->GetBytesPerIteration()); diff --git a/src/execution/operator/csv_scanner/util/csv_reader_options.cpp b/src/execution/operator/csv_scanner/util/csv_reader_options.cpp index 7957f2c47be8..5c91a5523eef 100644 --- a/src/execution/operator/csv_scanner/util/csv_reader_options.cpp +++ b/src/execution/operator/csv_scanner/util/csv_reader_options.cpp @@ -251,6 +251,10 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value, throw BinderException("Invalid value for MAX_LINE_SIZE parameter: it cannot be smaller than 0"); } maximum_line_size.Set(NumericCast(line_size)); + if (buffer_size_option.IsSetByUser() && maximum_line_size.GetValue() > buffer_size_option.GetValue()) { + throw InvalidInputException("Buffer Size of %d must be a higher value than the maximum line size %d", + buffer_size_option.GetValue(), maximum_line_size.GetValue()); + } } else if (loption == "date_format" || loption == "dateformat") { string format = ParseString(value, loption); SetDateFormat(LogicalTypeId::DATE, format, true); @@ -264,6 +268,12 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value, if (buffer_size_option == 0) { throw InvalidInputException("Buffer Size option must be higher than 0"); } + if (maximum_line_size.IsSetByUser() && maximum_line_size.GetValue() > buffer_size_option.GetValue()) { + throw InvalidInputException("Buffer Size of %d must be a higher value than the maximum line size %d", + buffer_size_option.GetValue(), maximum_line_size.GetValue()); + } else { + maximum_line_size.Set(buffer_size_option.GetValue(), false); + } } else if (loption == "decimal_separator") { decimal_separator = ParseString(value, loption); if (decimal_separator != "." 
&& decimal_separator != ",") { diff --git a/test/sql/copy/csv/afl/test_fuzz_4172.test b/test/sql/copy/csv/afl/test_fuzz_4172.test index 14cdfc62d2ba..2390bbaa1c9e 100644 --- a/test/sql/copy/csv/afl/test_fuzz_4172.test +++ b/test/sql/copy/csv/afl/test_fuzz_4172.test @@ -1,9 +1,9 @@ # name: test/sql/copy/csv/afl/test_fuzz_4172.test # description: fuzzer generated csv files - should not raise internal exception (by failed assertion). -# group: [csv] +# group: [afl] -# statement ok -# PRAGMA enable_verification +statement ok +PRAGMA enable_verification query I select count(file) from glob('data/csv/afl/4172/*'); diff --git a/test/sql/copy/csv/parallel/csv_parallel_buffer_size.test b/test/sql/copy/csv/parallel/csv_parallel_buffer_size.test index 466758becc6f..0fa6a8506860 100644 --- a/test/sql/copy/csv/parallel/csv_parallel_buffer_size.test +++ b/test/sql/copy/csv/parallel/csv_parallel_buffer_size.test @@ -35,7 +35,7 @@ select * from read_csv('data/csv/test/multi_column_string.csv', COLUMNS=STRUCT_ 100000000 15519 785 p9 query IIII -select * from read_csv('data/csv/test/multi_column_string_rn.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|', buffer_size=25) +select * from read_csv('data/csv/test/multi_column_string_rn.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|', buffer_size=27) ---- 1 6370 371 p1 10 214 465 p2 @@ -53,7 +53,7 @@ SELECT sum(a) FROM read_csv('data/csv/test/new_line_string_rn.csv', COLUMNS=STR 111 query I -SELECT sum(a) FROM read_csv('data/csv/test/new_line_string_rn.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|', buffer_size=80) +SELECT sum(a) FROM read_csv('data/csv/test/new_line_string_rn.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|', buffer_size=100) 
---- 111 @@ -64,7 +64,7 @@ SELECT sum(a) FROM read_csv('data/csv/test/new_line_string_rn_exc.csv', COLUMNS 111 query I -SELECT sum(a) FROM read_csv('data/csv/test/new_line_string_rn_exc.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|', buffer_size=60) +SELECT sum(a) FROM read_csv('data/csv/test/new_line_string_rn_exc.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|', buffer_size=80) ---- 111 @@ -75,6 +75,6 @@ SELECT sum(a) FROM read_csv('data/csv/test/new_line_string.csv', COLUMNS=STRUCT 111 query I -SELECT sum(a) FROM read_csv('data/csv/test/new_line_string.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|', buffer_size=60) +SELECT sum(a) FROM read_csv('data/csv/test/new_line_string.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|', buffer_size=80) ---- 111 diff --git a/test/sql/copy/csv/relaxed_quotes.test b/test/sql/copy/csv/relaxed_quotes.test index f8c6e8012c88..6bdfa8ede56b 100644 --- a/test/sql/copy/csv/relaxed_quotes.test +++ b/test/sql/copy/csv/relaxed_quotes.test @@ -78,12 +78,7 @@ statement ok drop table t; statement error -create table t as from read_csv('data/csv/unescaped_quotes/unescaped_quote_new_line_rn.csv', strict_mode=false, buffer_size = 20, header = 0) ----- - - -statement error -create table t as from read_csv('data/csv/unescaped_quotes/unescaped_quote_new_line_rn.csv', strict_mode=false, buffer_size = 20, header = 0) +create table t as from read_csv('data/csv/unescaped_quotes/unescaped_quote_new_line_rn.csv', strict_mode=false, buffer_size = 20, header = 0, delim = ';') ---- statement ok diff --git a/test/sql/copy/csv/test_validator.test b/test/sql/copy/csv/test_validator.test index 66a444809520..e44d97b23014 100644 --- 
a/test/sql/copy/csv/test_validator.test +++ b/test/sql/copy/csv/test_validator.test @@ -52,7 +52,7 @@ statement ok FROM read_csv('data/csv/validator/quoted_new_value.csv', columns = {'band': 'varchar', 'album': 'varchar', 'release': 'varchar'}, quote = '''', delim = ';', header = 0) statement ok -FROM read_csv('data/csv/validator/quoted_new_value.csv', columns = {'band': 'varchar', 'album': 'varchar', 'release': 'varchar'}, quote = '''', delim = ';', header = 0, buffer_size = 46) +FROM read_csv('data/csv/validator/quoted_new_value.csv', columns = {'band': 'varchar', 'album': 'varchar', 'release': 'varchar'}, quote = '''', delim = ';', header = 0, buffer_size = 48) statement ok FROM read_csv('data/csv/validator/single_column_quoted_newline.csv', columns = {'Raffaella Carrà': 'varchar'}, quote = '"', buffer_size = 24) From 7cba0a92b233eb21da4ef8f0ffab5fa8d6e4008c Mon Sep 17 00:00:00 2001 From: pdet Date: Mon, 10 Feb 2025 14:11:01 -0300 Subject: [PATCH 130/142] Adjust a couple more tests --- test/sql/copy/csv/maximum_line_size.test_slow | 2 +- test/sql/copy/csv/parallel/csv_parallel_buffer_size.test | 2 +- test/sql/copy/csv/parallel/csv_parallel_new_line.test_slow | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/sql/copy/csv/maximum_line_size.test_slow b/test/sql/copy/csv/maximum_line_size.test_slow index db5672073fbf..1ea62d7e6513 100644 --- a/test/sql/copy/csv/maximum_line_size.test_slow +++ b/test/sql/copy/csv/maximum_line_size.test_slow @@ -39,4 +39,4 @@ Be sure that the maximum line size is set to an appropriate value statement error select * from read_csv_auto('data/csv/issue_8320_3.csv.gz', max_line_size = 2097152, buffer_size = 10); ---- -BUFFER_SIZE option was set to 10, while MAX_LINE_SIZE was set to 2097152. 
BUFFER_SIZE must have always be set to value bigger than MAX_LINE_SIZE \ No newline at end of file +Buffer Size of 10 must be a higher value than the maximum line size 2097152 \ No newline at end of file diff --git a/test/sql/copy/csv/parallel/csv_parallel_buffer_size.test b/test/sql/copy/csv/parallel/csv_parallel_buffer_size.test index 0fa6a8506860..e3b0531499a6 100644 --- a/test/sql/copy/csv/parallel/csv_parallel_buffer_size.test +++ b/test/sql/copy/csv/parallel/csv_parallel_buffer_size.test @@ -22,7 +22,7 @@ SELECT sum(a) FROM read_csv('data/csv/test/multi_column_integer_rn.csv', COLUMN 111111111 query IIII -select * from read_csv('data/csv/test/multi_column_string.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|', buffer_size=25) +select * from read_csv('data/csv/test/multi_column_string.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|', buffer_size=30) ---- 1 6370 371 p1 10 214 465 p2 diff --git a/test/sql/copy/csv/parallel/csv_parallel_new_line.test_slow b/test/sql/copy/csv/parallel/csv_parallel_new_line.test_slow index b9654da5d729..f9fb5c45385f 100644 --- a/test/sql/copy/csv/parallel/csv_parallel_new_line.test_slow +++ b/test/sql/copy/csv/parallel/csv_parallel_new_line.test_slow @@ -9,7 +9,7 @@ PRAGMA verify_parallelism statement ok PRAGMA enable_verification -loop i 25 100 +loop i 27 100 # Test read_csv auto with \n From fb12980e8e1911655d383ebc418a8c07b151131b Mon Sep 17 00:00:00 2001 From: pdet Date: Wed, 12 Feb 2025 07:15:05 -0300 Subject: [PATCH 131/142] More tests and fixes --- .../afl/20250211_csv_fuzz_crash/case_1.csv | Bin 0 -> 171 bytes .../afl/20250211_csv_fuzz_crash/case_10.csv | Bin 0 -> 209 bytes .../afl/20250211_csv_fuzz_crash/case_100.csv | 3 + .../afl/20250211_csv_fuzz_crash/case_101.csv | Bin 0 -> 204 bytes .../afl/20250211_csv_fuzz_crash/case_102.csv | Bin 0 -> 136 bytes 
.../afl/20250211_csv_fuzz_crash/case_103.csv | Bin 0 -> 86 bytes .../afl/20250211_csv_fuzz_crash/case_104.csv | Bin 0 -> 86 bytes .../afl/20250211_csv_fuzz_crash/case_105.csv | Bin 0 -> 132 bytes .../afl/20250211_csv_fuzz_crash/case_106.csv | Bin 0 -> 86 bytes .../afl/20250211_csv_fuzz_crash/case_107.csv | Bin 0 -> 137 bytes .../afl/20250211_csv_fuzz_crash/case_108.csv | Bin 0 -> 137 bytes .../afl/20250211_csv_fuzz_crash/case_109.csv | Bin 0 -> 122 bytes .../afl/20250211_csv_fuzz_crash/case_11.csv | Bin 0 -> 241 bytes .../afl/20250211_csv_fuzz_crash/case_110.csv | Bin 0 -> 116 bytes .../afl/20250211_csv_fuzz_crash/case_111.csv | Bin 0 -> 98 bytes .../afl/20250211_csv_fuzz_crash/case_112.csv | Bin 0 -> 134 bytes .../afl/20250211_csv_fuzz_crash/case_113.csv | Bin 0 -> 128 bytes .../afl/20250211_csv_fuzz_crash/case_114.csv | Bin 0 -> 98 bytes .../afl/20250211_csv_fuzz_crash/case_115.csv | Bin 0 -> 113 bytes .../afl/20250211_csv_fuzz_crash/case_116.csv | Bin 0 -> 159 bytes .../afl/20250211_csv_fuzz_crash/case_117.csv | Bin 0 -> 405 bytes .../afl/20250211_csv_fuzz_crash/case_118.csv | Bin 0 -> 134 bytes .../afl/20250211_csv_fuzz_crash/case_119.csv | Bin 0 -> 93 bytes .../afl/20250211_csv_fuzz_crash/case_12.csv | Bin 0 -> 241 bytes .../afl/20250211_csv_fuzz_crash/case_120.csv | Bin 0 -> 148 bytes .../afl/20250211_csv_fuzz_crash/case_121.csv | Bin 0 -> 228 bytes .../afl/20250211_csv_fuzz_crash/case_122.csv | Bin 0 -> 220 bytes .../afl/20250211_csv_fuzz_crash/case_123.csv | Bin 0 -> 264 bytes .../afl/20250211_csv_fuzz_crash/case_124.csv | Bin 0 -> 415 bytes .../afl/20250211_csv_fuzz_crash/case_125.csv | Bin 0 -> 418 bytes .../afl/20250211_csv_fuzz_crash/case_126.csv | 30 + .../afl/20250211_csv_fuzz_crash/case_127.csv | 28 + .../afl/20250211_csv_fuzz_crash/case_128.csv | Bin 0 -> 279 bytes .../afl/20250211_csv_fuzz_crash/case_129.csv | 30 + .../afl/20250211_csv_fuzz_crash/case_13.csv | Bin 0 -> 186 bytes .../afl/20250211_csv_fuzz_crash/case_130.csv | Bin 0 -> 759 bytes 
.../afl/20250211_csv_fuzz_crash/case_131.csv | Bin 0 -> 199 bytes .../afl/20250211_csv_fuzz_crash/case_132.csv | Bin 0 -> 291 bytes .../afl/20250211_csv_fuzz_crash/case_14.csv | Bin 0 -> 175 bytes .../afl/20250211_csv_fuzz_crash/case_15.csv | 20 + .../afl/20250211_csv_fuzz_crash/case_16.csv | Bin 0 -> 177 bytes .../afl/20250211_csv_fuzz_crash/case_17.csv | 6 + .../afl/20250211_csv_fuzz_crash/case_18.csv | 8 + .../afl/20250211_csv_fuzz_crash/case_19.csv | 5 + .../afl/20250211_csv_fuzz_crash/case_2.csv | Bin 0 -> 171 bytes .../afl/20250211_csv_fuzz_crash/case_20.csv | 6 + .../afl/20250211_csv_fuzz_crash/case_21.csv | 6 + .../afl/20250211_csv_fuzz_crash/case_22.csv | Bin 0 -> 176 bytes .../afl/20250211_csv_fuzz_crash/case_23.csv | 6 + .../afl/20250211_csv_fuzz_crash/case_24.csv | 7 + .../afl/20250211_csv_fuzz_crash/case_25.csv | 7 + .../afl/20250211_csv_fuzz_crash/case_26.csv | 7 + .../afl/20250211_csv_fuzz_crash/case_27.csv | 8 + .../afl/20250211_csv_fuzz_crash/case_28.csv | Bin 0 -> 175 bytes .../afl/20250211_csv_fuzz_crash/case_29.csv | 6 + .../afl/20250211_csv_fuzz_crash/case_3.csv | Bin 0 -> 160 bytes .../afl/20250211_csv_fuzz_crash/case_30.csv | 6 + .../afl/20250211_csv_fuzz_crash/case_31.csv | Bin 0 -> 189 bytes .../afl/20250211_csv_fuzz_crash/case_32.csv | 6 + .../afl/20250211_csv_fuzz_crash/case_33.csv | Bin 0 -> 186 bytes .../afl/20250211_csv_fuzz_crash/case_34.csv | 6 + .../afl/20250211_csv_fuzz_crash/case_35.csv | 27 + .../afl/20250211_csv_fuzz_crash/case_36.csv | 6 + .../afl/20250211_csv_fuzz_crash/case_37.csv | 10 + .../afl/20250211_csv_fuzz_crash/case_38.csv | 6 + .../afl/20250211_csv_fuzz_crash/case_39.csv | Bin 0 -> 181 bytes .../afl/20250211_csv_fuzz_crash/case_4.csv | Bin 0 -> 171 bytes .../afl/20250211_csv_fuzz_crash/case_40.csv | 4 + .../afl/20250211_csv_fuzz_crash/case_41.csv | 3 + .../afl/20250211_csv_fuzz_crash/case_42.csv | 4 + .../afl/20250211_csv_fuzz_crash/case_43.csv | 4 + .../afl/20250211_csv_fuzz_crash/case_44.csv | Bin 0 -> 209 bytes 
.../afl/20250211_csv_fuzz_crash/case_45.csv | 4 + .../afl/20250211_csv_fuzz_crash/case_46.csv | 5 + .../afl/20250211_csv_fuzz_crash/case_47.csv | Bin 0 -> 204 bytes .../afl/20250211_csv_fuzz_crash/case_48.csv | Bin 0 -> 241 bytes .../afl/20250211_csv_fuzz_crash/case_49.csv | 3 + .../afl/20250211_csv_fuzz_crash/case_5.csv | Bin 0 -> 171 bytes .../afl/20250211_csv_fuzz_crash/case_50.csv | Bin 0 -> 157 bytes .../afl/20250211_csv_fuzz_crash/case_51.csv | Bin 0 -> 201 bytes .../afl/20250211_csv_fuzz_crash/case_52.csv | 5 + .../afl/20250211_csv_fuzz_crash/case_53.csv | Bin 0 -> 88 bytes .../afl/20250211_csv_fuzz_crash/case_54.csv | Bin 0 -> 234 bytes .../afl/20250211_csv_fuzz_crash/case_55.csv | Bin 0 -> 232 bytes .../afl/20250211_csv_fuzz_crash/case_56.csv | Bin 0 -> 288 bytes .../afl/20250211_csv_fuzz_crash/case_57.csv | Bin 0 -> 288 bytes .../afl/20250211_csv_fuzz_crash/case_58.csv | Bin 0 -> 319 bytes .../afl/20250211_csv_fuzz_crash/case_59.csv | Bin 0 -> 239 bytes .../afl/20250211_csv_fuzz_crash/case_6.csv | Bin 0 -> 171 bytes .../afl/20250211_csv_fuzz_crash/case_60.csv | Bin 0 -> 236 bytes .../afl/20250211_csv_fuzz_crash/case_61.csv | Bin 0 -> 236 bytes .../afl/20250211_csv_fuzz_crash/case_62.csv | Bin 0 -> 2371 bytes .../afl/20250211_csv_fuzz_crash/case_63.csv | Bin 0 -> 2399 bytes .../afl/20250211_csv_fuzz_crash/case_64.csv | Bin 0 -> 261 bytes .../afl/20250211_csv_fuzz_crash/case_65.csv | 27 + .../afl/20250211_csv_fuzz_crash/case_66.csv | Bin 0 -> 268 bytes .../afl/20250211_csv_fuzz_crash/case_67.csv | Bin 0 -> 291 bytes .../afl/20250211_csv_fuzz_crash/case_68.csv | Bin 0 -> 252 bytes .../afl/20250211_csv_fuzz_crash/case_69.csv | Bin 0 -> 233 bytes .../afl/20250211_csv_fuzz_crash/case_7.csv | Bin 0 -> 171 bytes .../afl/20250211_csv_fuzz_crash/case_70.csv | Bin 0 -> 272 bytes .../afl/20250211_csv_fuzz_crash/case_71.csv | Bin 0 -> 251 bytes .../afl/20250211_csv_fuzz_crash/case_72.csv | Bin 0 -> 251 bytes .../afl/20250211_csv_fuzz_crash/case_73.csv | Bin 0 -> 810 
bytes .../afl/20250211_csv_fuzz_crash/case_74.csv | Bin 0 -> 236 bytes .../afl/20250211_csv_fuzz_crash/case_75.csv | Bin 0 -> 231 bytes .../afl/20250211_csv_fuzz_crash/case_76.csv | Bin 0 -> 232 bytes .../afl/20250211_csv_fuzz_crash/case_77.csv | Bin 0 -> 87 bytes .../afl/20250211_csv_fuzz_crash/case_78.csv | Bin 0 -> 92 bytes .../afl/20250211_csv_fuzz_crash/case_79.csv | Bin 0 -> 106 bytes .../afl/20250211_csv_fuzz_crash/case_8.csv | Bin 0 -> 175 bytes .../afl/20250211_csv_fuzz_crash/case_80.csv | Bin 0 -> 85 bytes .../afl/20250211_csv_fuzz_crash/case_81.csv | Bin 0 -> 101 bytes .../afl/20250211_csv_fuzz_crash/case_82.csv | Bin 0 -> 90 bytes .../afl/20250211_csv_fuzz_crash/case_83.csv | Bin 0 -> 101 bytes .../afl/20250211_csv_fuzz_crash/case_84.csv | Bin 0 -> 455 bytes .../afl/20250211_csv_fuzz_crash/case_85.csv | Bin 0 -> 449 bytes .../afl/20250211_csv_fuzz_crash/case_86.csv | Bin 0 -> 405 bytes .../afl/20250211_csv_fuzz_crash/case_87.csv | Bin 0 -> 397 bytes .../afl/20250211_csv_fuzz_crash/case_88.csv | Bin 0 -> 412 bytes .../afl/20250211_csv_fuzz_crash/case_89.csv | Bin 0 -> 393 bytes .../afl/20250211_csv_fuzz_crash/case_9.csv | Bin 0 -> 200 bytes .../afl/20250211_csv_fuzz_crash/case_90.csv | Bin 0 -> 421 bytes .../afl/20250211_csv_fuzz_crash/case_91.csv | Bin 0 -> 446 bytes .../afl/20250211_csv_fuzz_crash/case_92.csv | Bin 0 -> 397 bytes .../afl/20250211_csv_fuzz_crash/case_93.csv | Bin 0 -> 442 bytes .../afl/20250211_csv_fuzz_crash/case_94.csv | Bin 0 -> 444 bytes .../afl/20250211_csv_fuzz_crash/case_95.csv | Bin 0 -> 264 bytes .../afl/20250211_csv_fuzz_crash/case_96.csv | Bin 0 -> 397 bytes .../afl/20250211_csv_fuzz_crash/case_97.csv | Bin 0 -> 373 bytes .../afl/20250211_csv_fuzz_crash/case_98.csv | Bin 0 -> 270 bytes .../afl/20250211_csv_fuzz_crash/case_99.csv | Bin 0 -> 322 bytes .../scanner/string_value_scanner.cpp | 26 +- .../sql/copy/csv/afl/fuzz_20250211_crash.test | 539 ++++++++++++++++++ 134 files changed, 864 insertions(+), 10 deletions(-) create 
mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_1.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_10.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_100.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_101.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_102.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_103.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_104.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_105.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_106.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_107.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_108.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_109.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_11.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_110.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_111.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_112.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_113.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_114.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_115.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_116.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_117.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_118.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_119.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_12.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_120.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_121.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_122.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_123.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_124.csv create 
mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_125.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_126.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_127.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_128.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_129.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_13.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_130.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_131.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_132.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_14.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_15.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_16.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_17.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_18.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_19.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_2.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_20.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_21.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_22.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_23.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_24.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_25.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_26.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_27.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_28.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_29.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_3.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_30.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_31.csv create mode 100644 
data/csv/afl/20250211_csv_fuzz_crash/case_32.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_33.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_34.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_35.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_36.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_37.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_38.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_39.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_4.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_40.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_41.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_42.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_43.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_44.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_45.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_46.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_47.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_48.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_49.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_5.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_50.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_51.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_52.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_53.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_54.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_55.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_56.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_57.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_58.csv create mode 100644 
data/csv/afl/20250211_csv_fuzz_crash/case_59.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_6.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_60.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_61.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_62.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_63.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_64.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_65.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_66.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_67.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_68.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_69.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_7.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_70.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_71.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_72.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_73.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_74.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_75.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_76.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_77.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_78.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_79.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_8.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_80.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_81.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_82.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_83.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_84.csv create mode 100644 
data/csv/afl/20250211_csv_fuzz_crash/case_85.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_86.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_87.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_88.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_89.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_9.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_90.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_91.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_92.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_93.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_94.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_95.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_96.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_97.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_98.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_99.csv create mode 100644 test/sql/copy/csv/afl/fuzz_20250211_crash.test diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_1.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..2dff96bfb5f9b182d598d7cfb9ed20a53a95b5be GIT binary patch literal 171 zcmXpsGUhUb;{3c+Ae)h&Sv^IaQQldagNcEGg^AIC#jcu5JyG3CLEY1@-qqbTNL@!g zN!_Yiy(}>YC@K&a83zS!VA==5Q!jJNQMa#E;!=XCL}n`>bh#tx;xaT)GBhv%QU(GH M28=+;KnH3#07#T7&;S4c literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_10.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_10.csv new file mode 100644 index 0000000000000000000000000000000000000000..1e43526015ac6450d9ea8a7f25505492a6628370 GIT binary patch literal 209 zcmeZK<<#ZYWz=U@Pf=%-XOw2J)#U^O1_nk32n&T&FH6i(w^C4ui;RoQj{^aBFzEv( zmAI6up(?>_2F3=suDG}eu&z>M0MZRMA~6SIVmyQ!w-913m=s`OcopL6 Y9%2OsjJ&J{hCs+=XrNdE;z>Q)Ns!66u?1cf?> zXp|-9D1gq;08fT;u8* 
e2hyWn3DnP(&BBo7;h~od(+w0=0IOF984Li$;)NM g%f-vfrJxWO83zUKVA==9)d3lg!YN7yS)>l70Hy3CLI3~& literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_103.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_103.csv new file mode 100644 index 0000000000000000000000000000000000000000..627236618725faeee0f2a66fdc8dfaf2ebff8b19 GIT binary patch literal 86 zcmY#Za8yXnNGwt}Qcy2TOq1u(;^bstV5sMWgG41OLmj2${2V40o-Fk^D9`~ZFG^;J L^FbE@>Q)B;4d)YR literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_104.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_104.csv new file mode 100644 index 0000000000000000000000000000000000000000..3ffe8f761076d228091c82edc09de307cc15f5ff GIT binary patch literal 86 zcmY#Za8yXnNGwt}Qcy2TOq1u(Vt{}+AkYDcl`HUaS@LpO@pAF%M#P%IkXrcKpCRGD79RH rm&=lu%ZitaSBXnOAuciw3fv(ygz_nbit2#ODN6Q%seSP=?B(FP9ZB7q1eRfM5wfq*-h^Z}D`>Sb;@>h`rtTuRkYm0&glBO?P$S6o~KSXU`B0O{1ObReCyb5u353vFRMqX9}Lm=cbG|)L`V88_h05se=hyVZp literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_110.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_110.csv new file mode 100644 index 0000000000000000000000000000000000000000..48f32730da7f3dc915d56e92b8784e748387b441 GIT binary patch literal 116 zcmZ=%NX|$sQr9uGG`D1Q;FRSA10x0Xvcxob4lM>SP*!jRD@!d`;N{Tb<+9@C;#J~O aP>73+g93Lj?E~ZLfb=tBfTCoOaq0jy9u1bwdLn;L-t81_lbr8Hq*eMhfa>iD~j2TD(eJ3JP(NV3k0^9nA27adkk- N7%@OmG6N8(0|1OQ6t@5X literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_112.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_112.csv new file mode 100644 index 0000000000000000000000000000000000000000..73296b846c3bb9a5983b32a841e35a29f892f0c0 GIT binary patch literal 134 zcmY#Za8yXnNGwt}Qcy2TOq1s@Ff>v%umS-~FbO4iS%DJTR#sXJV4w`umRhdB%Vo*S jWyQD literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_113.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_113.csv new file mode 100644 index 
0000000000000000000000000000000000000000..91f981ac0cc27ecc6756b472a295f0772c608e51 GIT binary patch literal 128 zcmZQ%1cDcg43?Ib%-?_n8sO#CRrzmeSp@`^mfBFsYW1?j9Ca&&S|u(eOG{p`s2-X| aOG_(-irN$gC79w0uyQC*y%MAxWHQ)N1N?b~YhK5`U3UQHf Saj|i6>Xw#z2t7zFkahqQA{E#G literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_115.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_115.csv new file mode 100644 index 0000000000000000000000000000000000000000..e58f1e9f64aa92c00ad58df71c909366c8275960 GIT binary patch literal 113 zcmZQ%WMnXiQ!ldw0+s)kmQ_^(RhE{#Ag-kzm_#Oc83Z7Ts(^~Lp)zhc>h`rtTuSaC fLFx<*Zy6XE1U^HgK*qR2jQL;Bz+lPCtE&P4BGDVB literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_116.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_116.csv new file mode 100644 index 0000000000000000000000000000000000000000..f30120d3aa53c5e68cd1bbc85b14f504650971eb GIT binary patch literal 159 zcmXriDJd#V$mdNe83IL-=FfIT9 literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_117.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_117.csv new file mode 100644 index 0000000000000000000000000000000000000000..aa7e242fad58bbc5cfb7c8d9f456b8c407383cd9 GIT binary patch literal 405 zcmZQ%!~+NBgSs58nlN;BB%a&mE^gE%1Y0TIZCFfcGQFml0w6(a)!3lj!lG*DM( z`CreZo~RzKpzi5c@9Jg=2XT>cartp^c5!j;alE{`P;o6@US2R$$I>z`4rmJl$mX~> z^)j~{b^BT+E~RRXGS_BxtNTDU5&-+b2IPnP)#_!5IqFsl3Q!w>y4^uW05w=utBYX; z%3@+=3=QfKA2K{e_!Q)pxHuV*O@FvDK*rR72rxjf;y{K90dd*?1~g?7jE0Ivh6Y@Q RhK9f(s|Qj*z{M382LNjHN|OKp literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_118.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_118.csv new file mode 100644 index 0000000000000000000000000000000000000000..12c1eefed8aeda7b8b899c4fcc22e88442b2ba70 GIT binary patch literal 134 zcmY#Za8yXnNGwt}Qc#a$U|@)gi}Q(7h6<*ZEAVnz@^V@6a`7s0DJaC{s9PzhSE?)2 nDsd@QtCuAL`Rb8CEnwgdCVim1I2{F`0{oyVu_zg6CxbcwCtW4~ literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_119.csv 
b/data/csv/afl/20250211_csv_fuzz_crash/case_119.csv new file mode 100644 index 0000000000000000000000000000000000000000..79fea4b0758b2a85f0077dc5ba92526b45a2a78b GIT binary patch literal 93 zcmY#Za8yXnNGwt}Qcy2TOq1u(VgLeOB`yVpxX3sl0P>W9YCz05pSU_gE|0|!V>WS literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_12.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_12.csv new file mode 100644 index 0000000000000000000000000000000000000000..4c1a8352beb22246e53a5cd2b5c696744fca8535 GIT binary patch literal 241 zcmXpsGUhUb;{3c+AUhb&H)QhiL#)7nk(brL5D2*p4Rj0*47i{G02tahzyJUM literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_120.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_120.csv new file mode 100644 index 0000000000000000000000000000000000000000..dc91aa419dda8f81202f056122a3368eb813209c GIT binary patch literal 148 zcmY#Za8yXnNGwt}Qcy2TOq1u(Vqi!tN{tHvaf`}X1z5p=*T4b7P~hdVCcu pN{!P2DpM>fM~EtMDJaB6#zBEQjDd^xiDQIFfecD4N@f58bpVW-Bq{&^ literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_121.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_121.csv new file mode 100644 index 0000000000000000000000000000000000000000..416d4de5e3f6fa6a8f82606e27787d3778dc14a1 GIT binary patch literal 228 zcmX|)y$ixH5XJ8>OQJ(}2TMS13O1V*#6h%6$2zzih#d=VS*qwCE@&>1{@~tyJa~tg z`C_})Ztie1qn>buOV~NBUpk|-MgT}l?4XMA?@+i7RF@@RnC~n!BDYvZc(6UB!9Z_5 zMm*+t6cDU6cBJs0B9eVh5!|LFOA()T$EK?jpa6X3E8`bJ&YzS#GlkPY+x=7C`~a1B BD$M`@ literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_122.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_122.csv new file mode 100644 index 0000000000000000000000000000000000000000..29da7018bbd835f6eb234d0ea9d5faf4807dded3 GIT binary patch literal 220 zcmZQ%gn<8yKB}C$+`5eV%<3uXjPi`q47R$Qob{X#zyhLlpd1LLUY3}nZl$0Q7a13q z9|r>NU=m1hfq)P;P!p#F16)eg>SYZuBkF+KV&g&~M)<%C0@4Bu3=AQz?jcrSz{tz$ eZ)jj>z-7n-0$_^}gt}o|oO+pCj=Fs<$QA&eh9_PC literal 0 HcmV?d00001 diff --git 
a/data/csv/afl/20250211_csv_fuzz_crash/case_123.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_123.csv new file mode 100644 index 0000000000000000000000000000000000000000..3eebaa16b6aa47c88857f1169f10054a423d13d9 GIT binary patch literal 264 zcmZQ%1Oo;zc*DS82?9(kJk{!n>Q?Gr!Tx^gwE~QW)k^jGIq``~Rt7ps$v{>TkX6eS z?Fj=eKnA)POb(UK%P&z#RLCmM&x_?s&Ig$VG}RDps!=UhHP{A_C?ktpwK`CVx|M=` za7a+7bBIP+Vva(XW013l<62GiT7_zLpn$qntrC|KlnJ3w?S%5I6e^$!85)2_Rwy88 F0|361M_vE` literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_124.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_124.csv new file mode 100644 index 0000000000000000000000000000000000000000..955e7021fe1e00e1fab21209acb1864d3fc2f49c GIT binary patch literal 415 zcmZQ%1cU!T0AYbihR+b{DO}7_3<@A3mM{s0xX3sta8Hhli;c4Y@nEuWnhOT37#SE? zm@ojNfx0@&|9U3%M0G0#bx*%~S2s&IfLdV}7v~hi1&KL83&Ebq#{^*W zz{WzY4GI<$({c9j^l{Yz1&0|lOvK6<7#IY=o@GD;86(I8adBK+adDLpxA}k#1iH{Q i7w$r!0~ifO84VSU3=Oyp4GkF>8tQ=*5O8tbjROE@LR3Bg literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_125.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_125.csv new file mode 100644 index 0000000000000000000000000000000000000000..9c24c33b7bc7f8f769d417ddff855a26ad2b6d99 GIT binary patch literal 418 zcmZQ%#0D5XgGCvh!X+)mpa3Fb36oHWi;RN;_hb;m2SmVR;WQTvSTQm%urOf&Mgw(q zmjCrk>WS)B3hJJI^{#G~Z~(Q!E-ubJj+a*#DDLVS9AXZnwT)}Q%=jvxrS2e`_LVx%D!iGb$1O?*HKSWx2jezOU#LjtLB6Q^|btakew-T5gORmAE-p@()AgCvQ`8yd8KoKSSF0-k%>imtU|gnOtzMFuo2qUFrix1va|=qU)OCO+ zrWPj`WfqiV=H~$=%My!{GZKr`Ye9;2kmR_G42%qP4Ge)O#K6D`jC67{Q}Qy?GfEV4 F@&VzKR_y=) literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_126.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_126.csv new file mode 100644 index 000000000000..b0e23e1f3f15 --- /dev/null +++ b/data/csv/afl/20250211_csv_fuzz_crash/case_126.csv @@ -0,0 +1,30 @@ +199;1000;"a "" +;a "" +10" +199;1000;"a "" +;a "" +10" +199;1000;"a "" +;a "" +10" +199;1000;"a "" +;a "" +10" +199;1000;"a "" +;a "" +10" +199;1000;"a "" 
+;a "" +10" +199;1000;"a "" +;a "" +10" +1 "" +10" +199;1 +;a "" +10" +199;1000;"a "" +;a99;1000;"a ""000;"a "" +;a "" +10 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_127.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_127.csv new file mode 100644 index 000000000000..7c1e2505553f --- /dev/null +++ b/data/csv/afl/20250211_csv_fuzz_crash/case_127.csv @@ -0,0 +1,28 @@ +199;1000;"a "" +;a "" +10" +199;1000;"a "" +;a "" +10" +199;1000;"a "" +;a "" +10" +199;1000;"a "" +;a "" +10" +199;1000;"a "" +;a "" +10" +199;1000;"a "" +;a "" +10" +199;1000;"a "" +;a "" +10" +1 ""#10" +199;1 +;a "" +10" +199;1000;"a "" +;a99;1000;"a ""000;"a ""a "" +;a99;1 \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_128.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_128.csv new file mode 100644 index 0000000000000000000000000000000000000000..ae9fd5dd6f8f0d3ff80db4a464ad756378f2ef53 GIT binary patch literal 279 zcmXrCw6r!fFfg!IN>or%;<5&lh6YMphJ#ZA46$ literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_129.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_129.csv new file mode 100644 index 000000000000..475ffa66fb2c --- /dev/null +++ b/data/csv/afl/20250211_csv_fuzz_crash/case_129.csv @@ -0,0 +1,30 @@ +199;1000;"a "" +;a "" +10" +199;1000;"a "" +;a "" +10" +199;1000;"a "" +;a "" +10" +199;1000;"a "" +;a "" +10" +199;1000;"a "" +;a "" +10" +199;1000;"a "" +;a "" +10" +199;1000;"a "" +;a "" +10" +1 "" +10" +199;1 +;a "" +10" +199;1000;"a "" +;a99;1000;"a ""000;"a "" +;{{{{{{{{{{{{{a "" +10" diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_13.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_13.csv new file mode 100644 index 0000000000000000000000000000000000000000..738932afa8f621cb8c6703b3a12350cb65c74fe5 GIT binary patch literal 186 zcmXpsGUhS_;}49eK%ZGXMV-M92pAa{locEm(o)NTD*mIYLguO03NRX0D<$XW#3w3Q z8R&pnNkCRDS9D%}i9(`6R&jn_EEimfAxMdi0$5$L5+lgQI5;SUu)r3?sT&#q0hf-B OdXjofwR%}%jyeFSPbPf; literal 0 HcmV?d00001 diff --git 
a/data/csv/afl/20250211_csv_fuzz_crash/case_130.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_130.csv new file mode 100644 index 0000000000000000000000000000000000000000..d476d82e98e5d29ceaf19bb50b4dfd79419e5127 GIT binary patch literal 759 zcmcJK*-jKu5Qb&BAS4=Jz@%x^$U`{chj;P>i_G z=gvo|)n{hs8cFAy3yT*nUTR%lx^nf}^2+raH*eiuy|Z?=eQ*8##)F5C9(OjMY&|tE zx4XTaXU|`}?7w1fM7cz{M7cz{M7cz{M7fkYRr=1C za*1+j87P;!MNuh&A}E3)D1x$$vW;4mSYDa0%va|B{dew}uPm?3SC&_N!#7#24Bv<| z;*2;W&WJPOj5wp8;Tyi;8@}N?zLW3xj_>%6@A!`I_>S*e_V3Su!2?7m392Xbo4#aVBK43DAsh)wwu9{0d zQQb;G-P5n$)!j8nT}M4h-KrXBVh&JLAubZ46iugknOly!eXSCg5=Y*WQJ+~o oMV(QeQJRB^mw^RrlmO6KA+GKrLFx<*Zy60m84V2$3=Oyp0sLDqbpQYW literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_132.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_132.csv new file mode 100644 index 0000000000000000000000000000000000000000..2bfd1d477eab24c69f8fd82aa60f0548a9100d66 GIT binary patch literal 291 zcmXrCw6r!fFfg!IN>or%;<5&lh6YMph7?IK>NBgSs58nlN;4}8R;#Dw=c`*OsHfzY zCgr56C$KZB>!?d}FhBuNmdnsc$CS&^Ku^cOfXh+O2n1Y!n2Cj_T0K$SO5H1%(LkO< z3#x-f2O{d}7vk#f8l7C_}wgfYGp8DLFqU zK2gcaKnKi90h2n( zuA`o$ZdI*bmY4$+6^M(Bg93Lj?E~Sdm$~Jr+t(^_DZx}CvlX!G;xaT)GBhv%QU(GH M28=+;KnH3#03I_chX4Qo literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_20.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_20.csv new file mode 100644 index 000000000000..ac53beaa9544 --- /dev/null +++ b/data/csv/afl/20250211_csv_fuzz_crash/case_20.csv @@ -0,0 +1,6 @@ +123 +123 +12ð'}1{"col_a":0,"col_b":0} +[h`r', 'yarchar', 'varchanot aþjson] +{"col_a":1,"co, "col_c"/'d^^^^^^^^^^u^^^^^^^'10000 +,,'b'\{'val' \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_21.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_21.csv new file mode 100644 index 000000000000..7a4be8dbbd6f --- /dev/null +++ b/data/csv/afl/20250211_csv_fuzz_crash/case_21.csv @@ -0,0 +1,6 @@ +123 +129*99999999999999 - nda999-93 +12ð'}1{"col_a":0,"col_b":0} +[not a json] 
+{"col_a":1,"co, "col_c"/'d^^^^^^^^^^u^^^^^^^'10000 +,,'b'\{'val' \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_22.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_22.csv new file mode 100644 index 0000000000000000000000000000000000000000..2ddf7b2c3297a4eb76023ad4664a9f57e0ac9df7 GIT binary patch literal 176 zcmXryP%kY>(={|xm+$7_xL{<=WeCO}7>NbywE~QW)k?|vIq``~Rt7p?RuYg^%N3oM zU!stxkX4+Y7s~}#VhB>AqX1Tyti-6#tez4F2c-}e*z!1aLjxe-($P^*Vu-0$FH6i( F2LJ)5B@_Sv literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_23.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_23.csv new file mode 100644 index 000000000000..9e40b5220838 --- /dev/null +++ b/data/csv/afl/20250211_csv_fuzz_crash/case_23.csv @@ -0,0 +1,6 @@ +123 +123 +12ð'}1{"col_a":0,"col_b":0} +[not a json] +{"col_a":1,co, "col_c"/'d^^^^^^^^^^u^^^^^^^'10000 +,,'b'\{'val' \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_24.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_24.csv new file mode 100644 index 000000000000..5a0c0c194278 --- /dev/null +++ b/data/csv/afl/20250211_csv_fuzz_crash/case_24.csv @@ -0,0 +1,7 @@ +123 +123 +12ð'}1{"coolÿÿÿ2} +;STRUCTl_a":0,"col_b":0} +[not a json] +{"col_a":1,"co, "col_c"/'d^^^^^^^^^^u^^^^^^^'10000 +,,'b'\{'val' \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_25.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_25.csv new file mode 100644 index 000000000000..99e0a7375746 --- /dev/null +++ b/data/csv/afl/20250211_csv_fuzz_crash/case_25.csv @@ -0,0 +1,7 @@ +123 +123 +12ðn] +{"col_a":1,"co, "co'}1{"col_a":0,"col_b":0} +[not a json] +{"col_a":1,"co, "col_c"/'d^^^^^^^^^^u^^^^^^^'10000 +,,'b'\{'val' \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_26.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_26.csv new file mode 100644 index 000000000000..48473ccb3069 --- /dev/null +++ b/data/csv/afl/20250211_csv_fuzz_crash/case_26.csv @@ -0,0 +1,7 
@@ +123 +123 +12ð'}1{"col_a":0,"col_b":0} +[not a json] +{"col_a":1,"co, "col_son] +{"col_a":1,"co, "col_cc"/'d^^^^^^^^^^u^^^^^^^'10000 +,,'b'\{'val' \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_27.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_27.csv new file mode 100644 index 000000000000..fde479ec47d4 --- /dev/null +++ b/data/csv/afl/20250211_csv_fuzz_crash/case_27.csv @@ -0,0 +1,8 @@ +123 +123 +12ð'}1{"col_a":0,"col_b":0} +[not a json] +{"col_a":1,"co, "col_c"/'d^^^^^^^^^^u^^^^^^^'1~000 +,,'b'\{':0,"col_b":0} +[not a json] +{"cval' \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_28.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_28.csv new file mode 100644 index 0000000000000000000000000000000000000000..99c23520e19a116ab8352b598cb944bd7e684589 GIT binary patch literal 175 zcmXpsGUhS_;}49)0`*z}M#E~QPhM`)eH;_0_p(0zaYi{ literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_29.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_29.csv new file mode 100644 index 000000000000..a4787dbb0d68 --- /dev/null +++ b/data/csv/afl/20250211_csv_fuzz_crash/case_29.csv @@ -0,0 +1,6 @@ +123 +  0123 +12ð'}1{"col_a":0,"col_b":0} +[not a json] +{"col_a":1,"co, "col_c"/'d^^^^^^^^^^u^^^^^^^'10000 +,,'b'\{'val' \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_3.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..f7dbe5492a56cc7bb3e271f4c85514b3a005f2f8 GIT binary patch literal 160 zcmXpsGUhS_VtpXa&r1ce8TpyjQ`8yd8LPR}6VfbDFYp-;Q;i# BC}RKs literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_30.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_30.csv new file mode 100644 index 000000000000..733a0aa58558 --- /dev/null +++ b/data/csv/afl/20250211_csv_fuzz_crash/case_30.csv @@ -0,0 +1,6 @@ +123 +123 +12ð'}1{"col_a":0,"col_b":0} +[not a json] +{"0,"cocol_a":1,"co, 
"col_c"/'d^^^^^^^^^^u^^^^^^^'10000 +,,'b'\{'val' \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_31.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_31.csv new file mode 100644 index 0000000000000000000000000000000000000000..ac1dfd8e50a13afb2c9069b16da0c24e46325fce GIT binary patch literal 189 zcmZQ#G?3@uW`F<|fokNGwpV6<{>1R!Yt{00J%@LjwaH z!}=s8D}!3D=)C+Ag+zs{;{3c=E|8L(_(UZuLmePXM*+-DR$|m=R!@n8gHi|!Y`ErT+w;?C0sg&1_nBY z^#%r9hCtc?1{4w%vWoNbV!7a24MAFU6u|nEl^FGz)l=f&pcKLaTN7{EiGhKIiP3EHMWtst^|$2LOW@FP;1=M&Cq*;K0fg!}z ZJtRn-q2Vo~p(vxFp@E?Rm!Sbr4gl0pE!6-3 literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_4.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..ccc7d2986d19d813a245c83b24f4e6518e7d6fdd GIT binary patch literal 171 zcmXpsGUhUb;{3c+Ae)h&Sv^IaQJztogNcEGg^AIC#jcu5JyG3CL0!wQ-qqbTNL@!g zN!_Yiy(}>YC@K&a83zUKVA==5Q!jJNQMa#E;!=XBR8&B)trQfn>*6vrP%<r(Wik!)srw#HCbS3giO88kkZB0R{#TTa1x0k%5JQK`A*uCq7BZ z%BYE}niIr{OH{Hl)B&<|6u@%HN{sr<>M1}&b<~s8t#Tmd#l=AlhM5befd&JO&QS*d DACep} literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_48.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_48.csv new file mode 100644 index 0000000000000000000000000000000000000000..947747bb1c464de190e13f0f291157e8c0099300 GIT binary patch literal 241 zcmZQ%LL6|k1Bhf~lwnk_ z6<{>1R!Yv#iBD9rGSL1HVkH4twOrA8`6czP?ygDdI_gR4R@Lfdi8<<=K)}GrPy?b7 z04&15ARuX_prDwjkX4+Y7s~}V)ldg$zK#Oel4K=DeP;FAxVY>%2q=Y6VCTSyJOv&e HeUNPcA@evD literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_49.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_49.csv new file mode 100644 index 000000000000..199641f8f80d --- /dev/null +++ b/data/csv/afl/20250211_csv_fuzz_crash/case_49.csv @@ -0,0 +1,3 @@ +999+9899999999 not a json] +{"colËËËËËËËËËËËËËËËËËËËËËË 01_a":1,"co+ "col_c"/'d^^^^^^^^99@9999999^^u^^^^0苹æž0 +,,'b'\{00^^^'1'val' \ No newline at end of file diff --git 
a/data/csv/afl/20250211_csv_fuzz_crash/case_5.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..c754534b4502b986fb0e49b43982f99450eb5dd0 GIT binary patch literal 171 zcmXpozyj1$)EVU&r8$@w7+9DX4Or}|xzrQYtrXNf{pwxaU4zth)RWY$s@2O9bAX}( zaglLQ;LacbWcfgN>Sb;@>h`rtTuNY-|Ev_?Y@jX$BwcwR4G>*ih6YN81_nUNK!Cx3 M5l9*6K#Yq608SGt7ytkO literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_50.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_50.csv new file mode 100644 index 0000000000000000000000000000000000000000..0cb4a24515a27da97c355caf071ae0780e6f5dd7 GIT binary patch literal 157 zcmZQ%!~_1lL6>A;VKl5(O3u%TPgJrp(Ebl%B>`ErT+w;?B?^fOS;hHzv1_q_WneKb iB~CEl0+Xe2U;wrxPTkS~2)J}~)RWX>s@2O9bJPLp{wLo6 literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_51.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_51.csv new file mode 100644 index 0000000000000000000000000000000000000000..144ac1887d73bb137a8569634ff53718ceaba3ef GIT binary patch literal 201 zcmZQ%WMp{3NDQ#FL{_B==DubCi>cQNFd9}XCFkeFe+Gg?B`ZUnIm!7tV5Wige~?6y zl9fR%S9D%}yc+~4Br0ST=jX+8K{SFj0gcd60GpGn#Hi1#o)QNKr4SawPoRx)>V^hD Sz@?+3o}?aAtzMRxqYeNM0WIPH literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_52.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_52.csv new file mode 100644 index 000000000000..d702dcc8f36e --- /dev/null +++ b/data/csv/afl/20250211_csv_fuzz_crash/case_52.csv @@ -0,0 +1,5 @@ +a json] +{"col_a":1,"c'}1{"col_a":0+ÿcol_b":0} +[not a json] +{"col_a":1,"co, "col_c"/'d^^^^^^^^^^u^^^^^^^'10000 +,,'b'\{'val' \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_53.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_53.csv new file mode 100644 index 0000000000000000000000000000000000000000..869b03087ab2396a74eeba7aba188141e5a73493 GIT binary patch literal 88 zcmY#Za8yW3Emz>>vgB1@U|_K3<+9=BvgPHnd!<#OQVa#Yt*FH_f1P%leNEK1Hu 
qEGkN7$Vo}m2`)@5D$t44G0`zJ(lN6%GSvwV4f6DJ^mPRqqYeP{7ZTL~ literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_54.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_54.csv new file mode 100644 index 0000000000000000000000000000000000000000..b57920716e6cb5c013518bbe15cf08a20b51a6ba GIT binary patch literal 234 zcmZQ%1OtYr5E@8YLVzKV;^o!l%}Ytt3(thhLKzI`0*val0*r>$O3C>-@rg=S2HO9V zfUG1StClM|FTX?~Q6Z~1KQERGs6>bHKckN-r!KcHqdv2GiaMiwzchobE+-hgV202j zvVj54RxeA;Q4dm3h>MJit5xDs%8vs9cQEM#CM_-9-3{GqGF6RKT}mu1K*g ztF}Vra;YbR6ngsAySlpusq3gGsqX@+NX&s~h>OdQ0|ED`Akrr;2t=uux#g(a*D7%- z0adyJRo;iVNe4tRI6ycrLGI(_wX_5xowy)%hK9F{hN6syhDL@4T!zSiiz_Y;0Qhz@ A9RL6T literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_56.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_56.csv new file mode 100644 index 0000000000000000000000000000000000000000..fd74276fc29f8a24dbe37c29362abbbfad62c68b GIT binary patch literal 288 zcmZQ%1OtYr5E@8Yf&c>p^9#5LV~RSXJfk!NBLf2q1EYbuI*VO3mwKX_m4dpbU%jil zYmmB*dbzrJwR%}%PF!4^dIJLk1CWk$2N7{TaX}zTz055~-M&_d3kZQKU31i}?gM#N zKsG~zx|M=L9J8WewR&29K9HlHl3$vXld4|J%?vbKn(;rQk1D4ww=SbTvpU!{KOfub zav}qTxX8G;2_TCYB{_gDWdKpi3|qHug*pT14lqEt1L&lDun^3h2>HYub!!EXTLHIn BLOuWh literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_57.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_57.csv new file mode 100644 index 0000000000000000000000000000000000000000..22a39d8efc129a9083c90c825b443414e65c3e7c GIT binary patch literal 288 zcmZQ%1OtYr5E@8Yf&c>p^9#5LV~RSXJfk!NBLf2q1EYbuI*VO3mwKX_m4dpbU%jil zYmmB*dXl<%wR%}%j=GhCLR@4V7`TH;pSU0ptzPDqqi$cT#07*vm99XQ_klbsAe*5< z-6}yLj#*K#T0Jd4A83SnN`7flPO5q>H#5*|X~zGIKB}C$+`5eV%<5p*{CsSy%ZUuY wmga*jVwB|g$H2ep^9#5LV~RSXJfk!NBLf2q1EYbuI*VO3mwKX_m4dpbU%jil zYmmB*dXl<%wR%}%j=GhCLR@4V7`TH;pSU0p4U@oV#ytAkzh^RcZiCo%wA3O1XNA0#<| z9${bvQpyZlw{DGtxH8TM?5;TVGPfLcd!WOVfJ}&6Vcv*40`dk(9_USL1&|s5DBnh? 
literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_59.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_59.csv new file mode 100644 index 0000000000000000000000000000000000000000..c04e3fedf310e26e11fc8f149a2ff7e12a69e416 GIT binary patch literal 239 zcmZQ%WMpIjqNhL_0xUs*fs6SCOn{LwMV(QeQJR5~fq{jA(Li0D#jcu5JyFd{LEY1@ z-qqbTNL@$0MBThvy(}?D-AX|rj`2UEFDs)yvpSMaCPo9GKA;&4>WM%_NLrKBt*UiF zT77|9Bje)o<3PY2O!|PyIQ25O9CiCzB`zhHB@i}Hr#crGCl?U#fK_Hdb=8CDN)RCc Pv@yihJtRn-q2Vn6i}W)# literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_6.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_6.csv new file mode 100644 index 0000000000000000000000000000000000000000..bcb439094c5584d2b9141561c8c54a8ad18e8ae3 GIT binary patch literal 171 zcmXpsGUhUb;{3c+Ae)h&Sv^IaQJztngNcEGg^AIC#jcu5JyG3CLEY1@-qqbTNL@!g zN!_Yiy(}>YC@K&a83zUKVA==5Q!jJNQMa#E;!=XCgs^MCx)iYM;xaT)GBhv%QU(GH M28=+;KnH3#03_ckl>h($ literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_60.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_60.csv new file mode 100644 index 0000000000000000000000000000000000000000..3d726a869239ab47a0376d4c77df4eff5ab3b781 GIT binary patch literal 236 zcmZQ%1OtYr5E@8Yf&c>p^9#5LV~RSXJfk!NBLf2q1EYbuI*VO3mwKX_m4dpbU%jil zYmmB*dXl<%wR%}%j=GhCLLB3NMqgG&eP(qeolJ}dKz)qri9i)7`mCyTLHc}w`Xb}v z^5a0j9ZdRw$vE{gw;Xl*S|u(em<YqJmtr)~)}rK$#3FSa1q`MD0|P^dt9wY0 IIzz)-0CDLv!2kdN literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_61.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_61.csv new file mode 100644 index 0000000000000000000000000000000000000000..05ba66ba7e8034bb3e169315903736ccd32a3013 GIT binary patch literal 236 zcmZQ%1OtYr5E@8Yf&c>p^9#5LV~RSXJfk!NBLf2q1EYbuI*VO3mwKX_m4dpfU%jil zYmmB*dXl<%wR%}%j=GhCLLB3NMqgG&eP(qeolJ}dEOuFp>WM%V>Ygxt_5anas&zs7 ze1ZBRpX4*p literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_62.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_62.csv new file mode 100644 index 
0000000000000000000000000000000000000000..b08e116c19e39d67f543610e3390bf8063c85a78 GIT binary patch literal 2371 zcmd5--EZ4A5SJ1Uh>Qhj-v;bqtR6-Tusu??V_U#lWSN_GXp%McigXxk(-Li~kVQ|T z>}5{S{U!VF1`OEi9tU(sO8$sztk_F2!l0!4_}%Z0ceH8hW&EB!VQ;Br82f*$B&Lte z&&*E^(=ZB#NiZ%vTof_;+(r25`J47>`x2M%05=wR#zO2O)HUClXCIkg?&A&5J(GZ+ zi5Xbj@pBht<#z^tuex2)RqIsm&-C8i46-)Hv&SKRbZZy)3Lh^ND*pDx#W}vcXVQi3 ziLl3Pmz4C|z<#T6S#d+M{WU-)}bLw*(8@gJ?~^PKZrRvOrtE*jvPf0Zb6eS1P zjOjDUmD6!2cL8h>``qunqmgWsR5pDT-z~b@WXX9WBFMpZ#$s>GVqCmEDBvF@{$wBb zw<=)C%-hPd6p@u>X03CvHc2R&)*aH;4x+g%$5sYKLQk*0ZnsX(Ue+(k^IE5U^+WCI z@zrJkD!C99T6txS#(J#tCIb&mQM{k~K3`>>qS zyLbz`_gbWJ#KPbg-iyG&~xyQ3lV@NSProm<1u4v)C>n zd-IGYOnPHzLX|Rgsr_?^A%CUh%3c#AU!CgFac>D#bhwBrl8MaO23=vfY+zJ_;fGLVl{_lZsCD z_B}&h&ZoRa|82_cA|)p!`i2mADXCk-fd8tUx5j^-a`u6z9D3V?sfS@@Vfa6u@)0yA z+*qoYyfl$w!go!1!^sXOmbeXj#&wsJo@Rq1qypFtI0zE5{S; zNy3t>+DG7;OC)Z16nG9KcmyP*a%SvoehBoETJhLE=R5yDF-?6LzhzH2o9Z%*z2C1X zW`NDx=DUVz7zM*57#BXA7cqO^L-@hz^VV_e43}^Z*XMZ3BJ3g5F<+V|H_f|ycun$# zNkGoT3=kV~-n^{jPC)Kyrz1M*I@aeyeRkH3tiX7BKf)hf*~J@$H|I(#{_OnGDZaX8 z(z)%6sLO1Rl=NHAeyO8seo)Gu*XMpL)8d`wr{)^B)MsI@@YlSE5S9sxQ|8N1#N9AZ zzCSrUJNonxV%BWVMmhb#X*;6@VZuRR9A}Z{R5lWmSc03MJo@6r!&U}mzOSfS10*Gf zuGWu>aO5%6+ z@Yh=zuw*7}z~1=CoY50u%{7zWBJ+JUV$&J0p+XcI)Cx z_u~G==a0{>jc zKmGCN-Cq~K{CNKJq4B~{H=)jZaIaaL+udhklEf?uzvJCx5^>u@b{|M6g>rDj;sB+5 z%42lNS&||)5J++ROb$7G_YyG~3?=G|L=MqpEMg>wVTyba%P{7#L=44zhGM2VO_6;b zPI-XB7)4=^C-PF;fWn79WHdlhfCi4ni5QDC)XlhTJTwA77W_I4(eMFv|<8RF`tQEPTYXT zi)x)N>@l_XDOotwUAWrZB<0Dql6u213jx)SzwzeeGmTXdz-qR$C|my+5j2K TP6*vs0{_q8zooas`r!W!s64qn literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_64.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_64.csv new file mode 100644 index 0000000000000000000000000000000000000000..9050f01683ebf4f2e70e34fa4c5eeed7760abcc3 GIT binary patch literal 261 zcmZQ%1OtYrKNVA2On#;KRN<*3`&Dsd@QGa5Q76h-PdaT)3u85o)98W`)UTW}e{IR=*c T5J!Pk!yO1T7XpChI>rG2cUC$Z literal 0 HcmV?d00001 diff --git 
a/data/csv/afl/20250211_csv_fuzz_crash/case_65.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_65.csv new file mode 100644 index 000000000000..a98344d21bb3 --- /dev/null +++ b/data/csv/afl/20250211_csv_fuzz_crash/case_65.csv @@ -0,0 +1,27 @@ +199;1000;"a "" +;a "" +10" +199;1000;"a "" +;a "" +10" +199;1000;"a "" +;a "" +11" +199;1000;"a "" +;a ""y^^z012 0^^^^^'0000 +,10C,1 +199;1000;"a "" +;a "" +10" +199;1000;"a "" +;a "" +10" +199;1000;"a "" +;a "" +10" +199;1000;"a "" +;a "" +10" +199;1000;"a "" +;a "" +10" diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_66.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_66.csv new file mode 100644 index 0000000000000000000000000000000000000000..b7408c92520aa15dce7c3190e16ca05b96e749d2 GIT binary patch literal 268 zcmXrCw6r!fFfg!IN>or%;<5&lh6YMphP07j)Mr*tQD>BAk!E0IU=U+sG+?na;4(DO zF*I-w2~uZhc*|%g%4le4U>ZxQz%oW_%z2A+&K BL3jWF literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_68.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_68.csv new file mode 100644 index 0000000000000000000000000000000000000000..37a9ce79fbb141f91d187765647d099bb249da84 GIT binary patch literal 252 zcmXrCw6r!fFfg!IN>or%;<5&lh6YMphQvxR{%7=MWz=U@Pf=%-XOw1OWME)X0D}Ko zmb{iVuCBo$=1hzREOyn5>WS(p4CSc*J>b?pJaglLx`EelN z4kmrTWSn}LTaLPYtrC|KM5UEN?RyYg-Rgfm7Z)cN5b%IiWM0;T#5e(1!&^oG%S$`y literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_69.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_69.csv new file mode 100644 index 0000000000000000000000000000000000000000..bee400e35ba1bc4ed9609a9ecf2ac4c450e19af0 GIT binary patch literal 233 zcmXpsGUhS_;}48jfT5uQx~M+0dWt%uJfrky#(IVZ28Q~5V4!ZLpzi5c?`l=8UY3{x zGvD0|9q1=>sO?)XUs*)a`4PxRk0H4ZU=nxD0iS42(>44UBaSEVzt-oM51> zh6YBeT!scZhGqs_FaR<%4rnL@01b7Fi}PWH@qx4e0|P^dt9!^@b%utwjE15hr^5^a E07D%zw*UYD literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_7.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_7.csv new file mode 100644 index 
0000000000000000000000000000000000000000..e6b78bdd5fe06abf9b9d2552b2037835db408356 GIT binary patch literal 171 zcmXpsGUhUb;{3c+Ae)h&Sv^IaQJztngNcEGg^AIC#jcu5JyG3CLEY1@-qqbTNL@!g zN!_Yiy(}>YC@K&a83zUKVA==5Q!jJNQMa#E;!=XCoX%(fvkOF{*$39eWoV#eXkY-O P32*7ISfDv1%l434p1y6N#F?Fc&T98)W y3I!z1>Xn9t2&2?1)e(|8Ame}{VATxuAiW?iLV>!KIz$rUCUvM357>nY5IF#8Wj`1I literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_71.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_71.csv new file mode 100644 index 0000000000000000000000000000000000000000..1f44de30dc3ca3c073e951ef0787b4d958261889 GIT binary patch literal 251 zcmXpsGUhTgGOktPQmR%jOUzLZQbz-f7(l&NfYGp8DLFrfkwvar9Z0KNDX0gB1cf?> zXaLPn2y+Z__HYc+RIh~^YE^--z>0wZ%3%QMgUZwbt+wK=P(V_rUTKKJQ%6YVfNiV< jnh$jLk5Z6c1@%fK1?pDn5J|9~fEBf0P(W7Tt!||bk%U;U4%N#8RSK2^0OW}{CjbBd literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_73.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_73.csv new file mode 100644 index 0000000000000000000000000000000000000000..6e8fbb5507c2d718375126f1a053c1692f402353 GIT binary patch literal 810 zcmb_aO-tNh5PpIeeMRUW5JFILFrk}Wzi%$0Qg=OcPgPX1Uw*|z_QQj+^mp~xqko|H z9;B6}P$-ce#Q4m_^{NL4-ppj?d1s!Pd4pof1bY5jZxwg&!3wUOMiL!{&T7F^t4?cR z_Tr>xhxTbViTBM5BoK&aE3V;KAIIG0!He{Bi^bi`FmhMzf#V0Rm06Y934hYJteJ7Q z-)sB9qMJR&2yu=UJnAFrDHwc0f-df0<2xc;GSXq>ikK1#im2j&k%-<7y+tgjcY}W+ zV*KL$6K1fn{#r5L|KX?U1Z9pVP19kTPQNPy^l?ExF?wV=|IDg(dwfjx=H1LPoMg{*^cQmzSMciOgrKSrq z-wXv5;}{dMa#JesZk4u7JS2CSc}Z_?;TC7gPqLpV>pD>#vXLF+HOlZ+sB-FZ>oV#ytEZ?l%J)k%*y?g(fCdJbPy<6* zVvc%{fGqX$D(ePBg&7*}wo7QZGx)Q4dm3 zh>MJi%Z~#AcQEM#Cgar0+;Y_IYn8Z^s=+F)6rk+J9Ca&&|CT5KYzRo-{|0131Q-|? 
fLR{TLtiXU#mes(}z)+sc&_Kt~z(7e!i3<(@`O7VJ literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_76.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_76.csv new file mode 100644 index 0000000000000000000000000000000000000000..db289c458b9c336b9a248463a9c19135c02de26b GIT binary patch literal 232 zcmY+5F$;oF6ov0i8@e>KH}r7mEJbdiL$#O$*&r@04~&Kep-qTK*q;qr`VHk1i#o%1 z50~>*+=lL-#(@oy^Z0)F2p;i}n8+pe0e4{p`8ZVA?5Ze^QcQ5b+@Q=0gi2|Canj|P zgCDcQe(f=GTK#*WnOZNYy}j0FZ4)=#HqSk#3FSa1@$rmxP$>1Bpc+!Cn{MP>L?}W>nJcN XFt7+z8yGO+0Fw0v`H4kkAS=`Xx~vdA literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_8.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_8.csv new file mode 100644 index 0000000000000000000000000000000000000000..2ee586557b168beb83a0b72dc5c2dd9524fee86b GIT binary patch literal 175 zcmXpsGUhUb;{3c+Ae)h&Sv^IaQJztngNcEGg^AIC#jcu5JyG3CLEY1@-qqbTNL@!g zN!_Yiy(}>YC@K&a83zUKVA==5Q!jJNQMa#E;!>(s2dRLlhVUV51?)Px3=NbF4Ge&k PfdGR6BakxCfm#3n>K`lu literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_80.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_80.csv new file mode 100644 index 0000000000000000000000000000000000000000..ca8c67c89c03d488aab455b4f0c004ac5d89b3ce GIT binary patch literal 85 zcmdOrVE_R^Wd(a)E(cyNM|GY0vc#g~jKnH+9R>9=0|-dY&#^=Tx|RwI3Jfd))gal# MqB5Wf2o{4n0M7Ce0ssI2 literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_81.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_81.csv new file mode 100644 index 0000000000000000000000000000000000000000..7651212a6534a6397efc9b23f35795754ce6a0c2 GIT binary patch literal 101 zcmdO500Ti~1^c614!m3`KyF!LQF2CNk-Cn8dYJ))GysF-{G9kiB`ZT6rQCcS1qKBM U7Ig7ssJK8iNG7qU3}m`G02k5}>Hq)$ literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_82.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_82.csv new file mode 100644 index 0000000000000000000000000000000000000000..9b792cfb3b325c840c4ef2387a1242f9fd6acf6f GIT binary patch literal 90 
zcmdO500Ti~1$$mD2VO2mb)EXMKm(pA(;`WM!zMl$@`lz@Wgu nB2aB$U}&JkWoT(>Z3t9kt(2&sq{L+nCczR028l&wAk)05lLN85g%54FDB` zfqI!+j=FuV5|>i7dYNmpy48J%aX?9k3b2dg;t=W-z@7pd3Q+>oY*np}2n;k9C~P1; zMG8Ddkn7@n@PasSsQlr|06Vk>M8<-drKP3rAwlX44R09@MHvkhjSLOA3=It#7#iw< L6cBK6#l-;tSX67Y literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_85.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_85.csv new file mode 100644 index 0000000000000000000000000000000000000000..c6030efb1e9a477b11fbc9f696eead2dcdb95796 GIT binary patch literal 449 zcmZQ%!~+MT|Y zwMtw{)#_!5IqFsl$R>l#R7%dziBD9rGSo3NG*VSc7RZTDQnLDA&!nCRRITpmSMTa( z2?ro&<;TU@#l^YD@$%}z#lZ}yB+yWIkjZgz>Sb;@>h@6EU7OXd?gQCizzrt#nblL& z8RZ$J8SbMv4`>4rfDHj^u&P!^1QLR!24W(@3@K0daFekcmbKOlkb*cS=m z5lydOZ+ifLQ1>!9=~Toj#Ujz?4d(<~W&mB_fl{r;H(LLSN)5P(tyj{(5WPMwmdV_o zlw=Y|<|%W)mNhKn#>tbiL-5%HWB-7&i>>Vw#`7zBH}PwXnz%#gL08?RAUuemIYJbQ zA=2QTusT@0cI{Q&U;7})wqZ6xPrWz8&c+%OMNzO`p-LY>ZJ@!JTunqur3sPkIF9W) Jt^`PH<~OUsNIU=l literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_87.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_87.csv new file mode 100644 index 0000000000000000000000000000000000000000..7ab38f0bf7bc3bff1f0711631fcaf5476681efc4 GIT binary patch literal 397 zcmZvYu?oUK42JKtivdSn9IRb3Nj*0!=%hsu2XS!;6dfD{r*=_k!3XW+DEI(^=3=db zenS#2cmJQ9)_*x-GPz)opFIGuHwMiwiOTTv%B3|Ggd1DTZTNJ{%qnjJt&~&wLH7`y zXIjAs0~cqDbTS>3Sc?PEh$y6by<473Q8d{!w9Z7BO!qkE!M(S;x=IGP*JK30id)oe z-BLMk8ST+`WeIj*MC9xZ|4@FwcmG0L4D2*S&y-CbuoJP9LK Nz!;-cmU3bMh+ZVgO`!k) literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_88.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_88.csv new file mode 100644 index 0000000000000000000000000000000000000000..df6772439a27db84637baff573059fb93c79eb8a GIT binary patch literal 412 zcmZQ%!~+nOfj71f&nW=1_l-;48UlhuFmqm zo=H7X-AX~-)34su%@PhkHs;60*~P`V$MN#&g2k1%l&aOs5_5pckzC{sv|GK*El1tH z7OWu7wOQTjK9G$B^qJLD)EVU&r5Wy{n2;a05~v$wLL5+oRkb?8|7a{ka3F;jBgk2C 
zabS!7aAkmuiK_vVV4}3N)IB6fouT0^qoF9Hp`wwY0hghnAp=80J&*zdF0QyZ0AIXL AWB>pF literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_89.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_89.csv new file mode 100644 index 0000000000000000000000000000000000000000..1d9e2996dc5bb6a504e06eb73aefc34b2cd6325a GIT binary patch literal 393 zcmZQ%!~+di7_hx literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_90.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_90.csv new file mode 100644 index 0000000000000000000000000000000000000000..09ac5761f6569eee79063051e37ebd8982cd64b5 GIT binary patch literal 421 zcmZQ%!~+cG^oQgD8xm^L4kWRh~WbwV6t$U3kIwhaT%)4 z^1q%*JyG3CLEY1@-qp<#4nS7q$Hm#j#kt4v^6J9H!3?M*&{B7h&2e$+Wo|j@_O(h} zO4aIRuFXmREiHNBTC|O8TwQ}h%z*;h#`l2=5_5nSf;|DW0SLf`05w}xt0Mveja3Hs zKRnHXJ^AA@B*svNf2_{NQOWi|)fFbvm(NL7pP|?WHfXmR(kb$A09!LQJ I7gt;y0EBQ(Gynhq literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_91.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_91.csv new file mode 100644 index 0000000000000000000000000000000000000000..c56c566235daff6282e7edaf8460a7a91e20b024 GIT binary patch literal 446 zcmZ{gu?oU45I}S46zr&rgWBaLv2klbowNwzATBNkMF)q1Q@g0N;J5i5PLBS8pqJLz zqWFf~kxMQw2k?(NfVh2%ZnGd3DVjou*7U~TaB{#n2+`|g`+K6fE}FxiC=C}7qE`6< z9K_l)sbIjZaJEP$Q}>N`KsI?6Hci(ZvID<0BHJg+>Y1Z?)Ph`O{A?wRry-6{wzOGx zaz^jUd5L}%w+L-$iR<@oOcTD4dz3UvX)&PE1VQT%We+{%E^|5Zsx9=$5ObwmlA9aIBUcc2`ploBFMI51KYy#UdDRX6|u literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_92.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_92.csv new file mode 100644 index 0000000000000000000000000000000000000000..5627769cf6a13456eeb7299ca9274a7dceda4f4c GIT binary patch literal 397 zcmZQ%L|TGbDe* zn23;JcnbF+Bgp!=IIuteaAkmuX<%Sr0MQ^&T3YHJ5~R-1@Rrd~l+jSp$k2ex(9n>9 Op`ji~0Rb0RTpR!wTuKrE literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_93.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_93.csv new file mode 100644 
index 0000000000000000000000000000000000000000..c343ed6dcf8cf074a4667dadc9ffe705853a92c9 GIT binary patch literal 442 zcmaFOhzBq{g-cqBAwU^JgE~w?AuckG9N?Y|(&YmpU`D}dE-qm%FtB1|U|?aw0E`Ce z>MZ~3nbZ^2trXNf{pwxaEa3p;p8U8tySO;_I9^^|xHy;rl>}Ovm;=;XtHh;LtzHIZ zBYDIdWt%uJfk$jeH4X28-M_)FfI;gh*h;ZG<4K(0!Bup&}0O;25QS6 wt_+YdaW!BPOq7$qG!$htR5UU);4(BcWMF8h2U0-5#T6F^0Mb}o761SM literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_94.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_94.csv new file mode 100644 index 0000000000000000000000000000000000000000..fca0d7564746f6f3f2f7f86ec759c38b867ed89b GIT binary patch literal 444 zcmZQ%!~+ ztF!#CXHrj8w^C5|^s9GuvxEbX75Q;-c5!j;alE{`mX>;8)0DWl7?PB%jC2%~bOqsx zfefg6pgY__j){v?FLTRLx35*=QmR%jb8S|)x({RnK?BrP`poJn>Ws3C(hT>33KDaG zZUFlWXekhY4DkkPu&P!^gn<~U7f{`gqzuS}1Q#eIfZ>T@KF|q&xH7;N)qqJbQCeE+ j9ulO^(D0VgP?XV7(a6w%%h1q}fuW%uNC5#CS6mzba+Ol9 literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_95.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_95.csv new file mode 100644 index 0000000000000000000000000000000000000000..f651e896146e2d1865c3cd1cd92aed63bfc2a7c9 GIT binary patch literal 264 zcmZQ%!~+6IK@o3Cm4Ja1sVLASTA$W%4gf@Cp=+CcdHX z+SlH@S0VlYo`aaqMr2T-7A!hakLaw5y;`(7u+;DcfHC=ja1f2AoM4Q;g}Xx;gx<#M zp)69OEm1_KxzxuUpE{9EBjrqlS(r2pfV2zDf(QEb-PZ1kS5v&8Sco~)d?5xNs13mm zjJ63s@CzIRe^q1tj}H=R9HQ3gC9iF>TaMgH%@HRmSB{q6uU$vFt_x6=ObjWB-cgrG AAOHXW literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_97.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_97.csv new file mode 100644 index 0000000000000000000000000000000000000000..777ca34bdcff0af731fa9384ca0b517fe5401b8c GIT binary patch literal 373 zcmZQ%!~+WuP?(hT<@_E;%^-32rX z2;4yg&;XztU_QX05nh4^0vjVE$W3u^Knu9I{%~c0jESoOlVDN zaSRL$|9!yhIQ25O9Cd|SB`&3EbtIJ_NuVycI7n9Yf+^5LW?7pspf@WQMm4i3~_KiZTiq8W Date: Wed, 12 Feb 2025 09:02:36 -0300 Subject: [PATCH 132/142] Add hangs --- .../afl/20250211_csv_fuzz_hangs/case_0.csv | Bin 0 -> 42 bytes 
.../afl/20250211_csv_fuzz_hangs/case_1.csv | Bin 0 -> 42 bytes .../afl/20250211_csv_fuzz_hangs/case_10.csv | Bin 0 -> 4922 bytes .../afl/20250211_csv_fuzz_hangs/case_11.csv | Bin 0 -> 4952 bytes .../afl/20250211_csv_fuzz_hangs/case_12.csv | Bin 0 -> 4922 bytes .../afl/20250211_csv_fuzz_hangs/case_13.csv | Bin 0 -> 4915 bytes .../afl/20250211_csv_fuzz_hangs/case_14.csv | Bin 0 -> 4918 bytes .../afl/20250211_csv_fuzz_hangs/case_15.csv | Bin 0 -> 4933 bytes .../afl/20250211_csv_fuzz_hangs/case_16.csv | Bin 0 -> 4911 bytes .../afl/20250211_csv_fuzz_hangs/case_2.csv | Bin 0 -> 42 bytes .../afl/20250211_csv_fuzz_hangs/case_3.csv | Bin 0 -> 48 bytes .../afl/20250211_csv_fuzz_hangs/case_4.csv | Bin 0 -> 58 bytes .../afl/20250211_csv_fuzz_hangs/case_5.csv | Bin 0 -> 42 bytes .../afl/20250211_csv_fuzz_hangs/case_6.csv | Bin 0 -> 63 bytes .../afl/20250211_csv_fuzz_hangs/case_7.csv | Bin 0 -> 4909 bytes .../afl/20250211_csv_fuzz_hangs/case_8.csv | Bin 0 -> 4909 bytes .../afl/20250211_csv_fuzz_hangs/case_9.csv | Bin 0 -> 4920 bytes .../sql/copy/csv/afl/fuzz_20250211_hangs.test | 82 ++++++++++++++++++ 18 files changed, 82 insertions(+) create mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_0.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_1.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_10.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_11.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_12.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_13.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_14.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_15.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_16.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_2.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_3.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_4.csv create mode 100644 
data/csv/afl/20250211_csv_fuzz_hangs/case_5.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_6.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_7.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_8.csv create mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_9.csv create mode 100644 test/sql/copy/csv/afl/fuzz_20250211_hangs.test diff --git a/data/csv/afl/20250211_csv_fuzz_hangs/case_0.csv b/data/csv/afl/20250211_csv_fuzz_hangs/case_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..65d6c1a8136e83309dc31c555a4293b1ee0b1b8b GIT binary patch literal 42 xcmb2|=HPgEi9dlQm4TO4fPpEuK<~dmR+3J#E|;N>k&ZDhmywRHjtMW<7XafL3K0MR literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_hangs/case_1.csv b/data/csv/afl/20250211_csv_fuzz_hangs/case_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..f685be9af1f97706e3dd0225bdce240b3aa935c4 GIT binary patch literal 42 wcmb2|=HPgEi9dlQm4TO4ASFMyK<|H2R+3JoE|;N>k&ZDh7m&~~;pO@Q00LACPyhe` literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_hangs/case_10.csv b/data/csv/afl/20250211_csv_fuzz_hangs/case_10.csv new file mode 100644 index 0000000000000000000000000000000000000000..fbd977e626f66e1b4358e2d45ba0db4ec129386e GIT binary patch literal 4922 zcmY%8)Mc>M<)kKHU}Ru`nx|ft$oQYpNA-m+H&ld?kI^86YZQ)#zz7ZjM-b4lG&MH< z|DRzrGmM77Xb6mkz-R~z;}9?##(o>Y75Qkaj)uT!2n^v6h>J_k7%jg>Ltr!n=oSL} KbhC-JM<)kKHU}Ru`nx|ft$oQYpNA-m+H&ld?kI^86Yv{m)1S6fHrW`02Y^mPyhe` literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_hangs/case_12.csv b/data/csv/afl/20250211_csv_fuzz_hangs/case_12.csv new file mode 100644 index 0000000000000000000000000000000000000000..80bfdbe5ee3434edd31feccc610c2b9b75f64a70 GIT binary patch literal 4922 zcmY%8)Mc>M<)kKHU}Ru`nx|ft$oQYpNA-m+H&ld?kI^86YZQ)#z-S1JhQMeDjE2By z2#kinXb6mkz-S1JhQMeDjE2B~h5)a!0h1tyYmmB*y0!6WH+3`wMnhmU1h9ty086WS A4FCWD literal 0 HcmV?d00001 diff --git 
a/data/csv/afl/20250211_csv_fuzz_hangs/case_13.csv b/data/csv/afl/20250211_csv_fuzz_hangs/case_13.csv new file mode 100644 index 0000000000000000000000000000000000000000..de4bb8a6bff7dabefc48c6d5a96f206c04ceeb11 GIT binary patch literal 4915 zcmY%8)Mc>M<)kKHU}Ru`nx|ft$oQYpNA-m+H&ld?kI^86YZOu?1QK)9trQeSLvu6) zMneE!2z;eVZo_B8sN84>jE2By2#kinXb6mk05Al?M?+yW1V%$(Gz3ONU^E2i5CWNW JumjqQ1OTRodV~M~ literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_hangs/case_14.csv b/data/csv/afl/20250211_csv_fuzz_hangs/case_14.csv new file mode 100644 index 0000000000000000000000000000000000000000..77660389cae4170b67750ccbc52c48b93feedb41 GIT binary patch literal 4918 zcmY%8)Mc>M<)kKHU}Ru`nx|ft$oQYpNA-m+H&ld?kI^86Yw*I*LF~4{8)}rg%+SDq z3ms5u=BWPB5Eu=C(GVC7fzc2c4S~@R7!3jXgaD(Vp@E?Rm;Go%dNc$^Ltr!nMnhoO Sg#fsKH#E?pPlYlFZ2AB z6$|lviK&7%spoAqGjHB5KUlvN;v{S}l9{^9Y^a$!@R_p7l)YLG6i~H(&7PN>Cdn`< zbe0T`4IbZ=T~^jcHq!ln<>`4%J#^kX_iAOqOvdM-T{pAu91hn=5wz!ew(GOHVAY1k zK86_K6&{|RGfX+T<>U?lRuPqsLwq6QNjVA7#ah|jW#y3Lh5MIG@UVjkZdm0Q)7Ptf z+ql99y*V9Qh~qfNQ5iMjQeh5gtS&N(eSFeeSIXBqIJKmrftDi#ni42J$6soiCwoFb b2nYcoAOwVf5D)@FKnMr{As_^VK%Kx3Qon$J literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_hangs/case_16.csv b/data/csv/afl/20250211_csv_fuzz_hangs/case_16.csv new file mode 100644 index 0000000000000000000000000000000000000000..8c2175321d6744c5d2956ef1a31c28737719997f GIT binary patch literal 4911 zcmeH@Jxc>Y6h-e8ikT2Q`ym*n*@c}LEdnkmk|JOVlSYeyDOOR|iR7zL1pk_UB`JbM zO2tAnFPp7|4QT!DRB!K@H_NG&LL7&se??DRs+*!Gi;n>E)oWNOdueUou5x927aA`M zVfKVEk%?rtFiLJq%5(L-`Xu&u^mud~nj`)5?cs2b1VLr1Vd^!SZfL4NqK!U=c!R?0 zEX9PH0X2^ZaDXTy&hdqmPC`w9Iu5h1Et(uW=5+s-HYl87j0c)LM#MF8?i6?UAl4Ic zf;f&nI@zirU0T5bf%!wa+{Guczmw44IykW_qGE^KumLu}1~wU(PB*pOu>m%~2G{@_ TU;}J`4X^<=zy{dBUkxk)`c#8$ literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_hangs/case_2.csv b/data/csv/afl/20250211_csv_fuzz_hangs/case_2.csv new file mode 100644 index 
0000000000000000000000000000000000000000..3b6e26bb6ace46abb7a240b441c76e3ec9b4d698 GIT binary patch literal 42 xcmb2|=HPgEi9dlQm4TO4ASFMyK<|H2R)bE9E|-9gk&ZDhmjHvYjtMWk9z+ZxEUQ literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_hangs/case_5.csv b/data/csv/afl/20250211_csv_fuzz_hangs/case_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..2709f85fe7d427752072bd85572b411066693fe8 GIT binary patch literal 42 wcmb2|=HTE@czB5=g@Knk&ZDh7m&~~;pO@Q0MCpGCjbBd literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_hangs/case_6.csv b/data/csv/afl/20250211_csv_fuzz_hangs/case_6.csv new file mode 100644 index 0000000000000000000000000000000000000000..95986aca82e796b51a0dd27b2717e2e6a47f5b30 GIT binary patch literal 63 zcmb2|=HPh9z`zjV>K?+&`mymXqoF9Hp`n4H0hghHj-i3M<)kKHU}Ru`nx|ft$oQYpNA-m+H&ld?kI^86YZQ)#z-S1JhQMeDjE2By z2n?bS;2uP78EqxdCj>Z0<9sv(MnhmU1V%$(Gz3ONfNCM2MKzP?(+(I`wiM99Hvk-* BcuxQT literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_hangs/case_8.csv b/data/csv/afl/20250211_csv_fuzz_hangs/case_8.csv new file mode 100644 index 0000000000000000000000000000000000000000..768f41dcd7bb32a6e808b795f8f361087bbaab3b GIT binary patch literal 4909 zcmY%8)Mc>M<)kKHU}Ru`nx|ft$oQYpNA-m+H&ld?kI^86iw;nV4t7w_9}M->Gi%hO z(GVC7fzc2c4S~@R7!85Z5Eu=C(GVC70dhj1fq@)dqk2X|U^E0qLx3tF5K9$PhSXTd H;Po*8A3S(( literal 0 HcmV?d00001 diff --git a/data/csv/afl/20250211_csv_fuzz_hangs/case_9.csv b/data/csv/afl/20250211_csv_fuzz_hangs/case_9.csv new file mode 100644 index 0000000000000000000000000000000000000000..2897497b6cfff1bc9d4a7d5fb1055c2c4ec01fe7 GIT binary patch literal 4920 zcmY%8)Mc>M<)kKHU}Ru`nx|ft$oQYpNA-m+H&ld?kI^86YZQ)#z-S1JhQMeDjE2BS l3W3pzZ!`o(Ltr!nMnhmU1V%$(Gz3ONU^E0qLtrR|0037gdGi1O literal 0 HcmV?d00001 diff --git a/test/sql/copy/csv/afl/fuzz_20250211_hangs.test b/test/sql/copy/csv/afl/fuzz_20250211_hangs.test new file mode 100644 index 000000000000..77e165ebe972 --- /dev/null +++ 
b/test/sql/copy/csv/afl/fuzz_20250211_hangs.test @@ -0,0 +1,82 @@ +# name: test/sql/copy/csv/afl/fuzz_20250211_hangs.test +# description: fuzzer generated csv files - should not raise internal exception (by failed assertion). +# group: [csv] + +# This test takes a very long time to run, ~ 6 minutes on a Mac M1 Max +mode skip + +statement ok +PRAGMA enable_verification + +query I +select count(file) from glob('./data/csv/afl/20250211_csv_fuzz_hangs/*'); +---- +17 + +statement maybe +FROM read_csv('data/csv/afl/20250211_csv_fuzz_hangs/case_0.csv', compression='gzip'); +---- + +statement maybe +FROM read_csv('data/csv/afl/20250211_csv_fuzz_hangs/case_1.csv', compression='gzip'); +---- + +statement maybe +FROM read_csv('data/csv/afl/20250211_csv_fuzz_hangs/case_2.csv', compression='gzip'); +---- + +statement maybe +FROM read_csv('data/csv/afl/20250211_csv_fuzz_hangs/case_3.csv', compression='gzip'); +---- + +statement maybe +FROM read_csv('data/csv/afl/20250211_csv_fuzz_hangs/case_4.csv', compression='gzip'); +---- + +statement maybe +FROM read_csv('data/csv/afl/20250211_csv_fuzz_hangs/case_5.csv', compression='gzip'); +---- + +statement maybe +FROM read_csv('data/csv/afl/20250211_csv_fuzz_hangs/case_6.csv', compression='gzip'); +---- + +statement maybe +FROM read_csv('data/csv/afl/20250211_csv_fuzz_hangs/case_7.csv', rejects_table=L); +---- + +statement maybe +FROM read_csv('data/csv/afl/20250211_csv_fuzz_hangs/case_8.csv', rejects_table=L); +---- + +statement maybe +FROM read_csv('data/csv/afl/20250211_csv_fuzz_hangs/case_9.csv', rejects_table=L); +---- + +statement maybe +FROM read_csv('data/csv/afl/20250211_csv_fuzz_hangs/case_10.csv', rejects_table=L); +---- + +statement maybe +FROM read_csv('data/csv/afl/20250211_csv_fuzz_hangs/case_11.csv', rejects_table=L); +---- + +statement maybe +FROM read_csv('data/csv/afl/20250211_csv_fuzz_hangs/case_12.csv', rejects_table=L); +---- + +statement maybe +FROM read_csv('data/csv/afl/20250211_csv_fuzz_hangs/case_13.csv', 
rejects_table=L); +---- + +statement maybe +FROM read_csv('data/csv/afl/20250211_csv_fuzz_hangs/case_14.csv', rejects_table=L); +---- + +statement maybe +FROM read_csv('data/csv/afl/20250211_csv_fuzz_hangs/case_15.csv', rejects_table=L); +---- + +statement maybe +FROM read_csv('data/csv/afl/20250211_csv_fuzz_hangs/case_16.csv', rejects_table=L); +---- From 0110b2467ba4bab35c3e47dd8d464c9003a2e95f Mon Sep 17 00:00:00 2001 From: pdet Date: Wed, 12 Feb 2025 09:27:23 -0300 Subject: [PATCH 133/142] This needs to be slightly bigger for windows --- test/sql/copy/csv/parallel/csv_parallel_buffer_size.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/sql/copy/csv/parallel/csv_parallel_buffer_size.test b/test/sql/copy/csv/parallel/csv_parallel_buffer_size.test index e3b0531499a6..04f78983c3cb 100644 --- a/test/sql/copy/csv/parallel/csv_parallel_buffer_size.test +++ b/test/sql/copy/csv/parallel/csv_parallel_buffer_size.test @@ -75,6 +75,6 @@ SELECT sum(a) FROM read_csv('data/csv/test/new_line_string.csv', COLUMNS=STRUCT 111 query I -SELECT sum(a) FROM read_csv('data/csv/test/new_line_string.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|', buffer_size=80) +SELECT sum(a) FROM read_csv('data/csv/test/new_line_string.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), quote ='"', escape ='"', comment = '', auto_detect='true', delim = '|', buffer_size=100, new_line = '\r\n') ---- 111 From 28fd731a9c5de9ea8468fb2d2a7ab24d6f8c463e Mon Sep 17 00:00:00 2001 From: pdet Date: Wed, 12 Feb 2025 10:27:20 -0300 Subject: [PATCH 134/142] Format --- test/sql/copy/csv/afl/fuzz_20250211_hangs.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/sql/copy/csv/afl/fuzz_20250211_hangs.test b/test/sql/copy/csv/afl/fuzz_20250211_hangs.test index 77e165ebe972..456d8966edc0 100644 --- a/test/sql/copy/csv/afl/fuzz_20250211_hangs.test +++ 
b/test/sql/copy/csv/afl/fuzz_20250211_hangs.test @@ -1,6 +1,6 @@ # name: test/sql/copy/csv/afl/fuzz_20250211_hangs.test # description: fuzzer generated csv files - should not raise internal exception (by failed assertion). -# group: [csv] +# group: [afl] # This test takes a very long time to run, ~ 6 minutes on a Mac M1 Max mode skip From a04d9065fdc697b4eda3b5f86efc0f8ef6ec5fe8 Mon Sep 17 00:00:00 2001 From: pdet Date: Wed, 12 Feb 2025 17:34:52 -0300 Subject: [PATCH 135/142] Verify that the table names are valid --- .../operator/csv_scanner/util/csv_reader_options.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/execution/operator/csv_scanner/util/csv_reader_options.cpp b/src/execution/operator/csv_scanner/util/csv_reader_options.cpp index 5c91a5523eef..4c26d4f02e51 100644 --- a/src/execution/operator/csv_scanner/util/csv_reader_options.cpp +++ b/src/execution/operator/csv_scanner/util/csv_reader_options.cpp @@ -308,6 +308,9 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value, if (table_name.empty()) { throw BinderException("REJECTS_TABLE option cannot be empty"); } + if (KeywordHelper::RequiresQuotes(table_name)) { + throw BinderException("rejects_scan option: %s requires quotes to be used as an identifier", table_name); + } rejects_table_name.Set(table_name); } else if (loption == "rejects_scan") { // skip, handled in SetRejectsOptions @@ -315,6 +318,9 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value, if (table_name.empty()) { throw BinderException("rejects_scan option cannot be empty"); } + if (KeywordHelper::RequiresQuotes(table_name)) { + throw BinderException("rejects_scan option: %s requires quotes to be used as an identifier", table_name); + } rejects_scan_name.Set(table_name); } else if (loption == "rejects_limit") { auto limit = ParseInteger(value, loption); From 36538e672aa818519c3cb4c5bf093b1362352406 Mon Sep 17 00:00:00 2001 From: pdet Date: Mon, 17 Feb 2025 15:47:56 -0300 
Subject: [PATCH 136/142] Drastrically minimize the number of tests --- .../afl/20250211_csv_fuzz_crash/case_1.csv | Bin 171 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_10.csv | Bin 209 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_100.csv | 3 - .../afl/20250211_csv_fuzz_crash/case_101.csv | Bin 204 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_102.csv | Bin 136 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_103.csv | Bin 86 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_104.csv | Bin 86 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_105.csv | Bin 132 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_106.csv | Bin 86 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_107.csv | Bin 137 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_108.csv | Bin 137 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_109.csv | Bin 122 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_11.csv | Bin 241 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_110.csv | Bin 116 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_111.csv | Bin 98 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_112.csv | Bin 134 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_113.csv | Bin 128 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_114.csv | Bin 98 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_115.csv | Bin 113 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_116.csv | Bin 159 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_117.csv | Bin 405 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_118.csv | Bin 134 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_119.csv | Bin 93 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_12.csv | Bin 241 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_120.csv | Bin 148 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_121.csv | Bin 228 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_122.csv | Bin 220 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_123.csv | Bin 264 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_124.csv | Bin 415 -> 0 bytes 
.../afl/20250211_csv_fuzz_crash/case_125.csv | Bin 418 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_126.csv | 30 - .../afl/20250211_csv_fuzz_crash/case_127.csv | 28 - .../afl/20250211_csv_fuzz_crash/case_128.csv | Bin 279 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_129.csv | 30 - .../afl/20250211_csv_fuzz_crash/case_13.csv | Bin 186 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_130.csv | Bin 759 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_131.csv | Bin 199 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_132.csv | Bin 291 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_14.csv | Bin 175 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_15.csv | 20 - .../afl/20250211_csv_fuzz_crash/case_16.csv | Bin 177 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_17.csv | 6 - .../afl/20250211_csv_fuzz_crash/case_18.csv | 8 - .../afl/20250211_csv_fuzz_crash/case_19.csv | 5 - .../afl/20250211_csv_fuzz_crash/case_2.csv | Bin 171 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_20.csv | 6 - .../afl/20250211_csv_fuzz_crash/case_21.csv | 6 - .../afl/20250211_csv_fuzz_crash/case_22.csv | Bin 176 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_23.csv | 6 - .../afl/20250211_csv_fuzz_crash/case_24.csv | 7 - .../afl/20250211_csv_fuzz_crash/case_25.csv | 7 - .../afl/20250211_csv_fuzz_crash/case_26.csv | 7 - .../afl/20250211_csv_fuzz_crash/case_27.csv | 8 - .../afl/20250211_csv_fuzz_crash/case_28.csv | Bin 175 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_29.csv | 6 - .../afl/20250211_csv_fuzz_crash/case_3.csv | Bin 160 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_30.csv | 6 - .../afl/20250211_csv_fuzz_crash/case_31.csv | Bin 189 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_32.csv | 6 - .../afl/20250211_csv_fuzz_crash/case_33.csv | Bin 186 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_34.csv | 6 - .../afl/20250211_csv_fuzz_crash/case_35.csv | 27 - .../afl/20250211_csv_fuzz_crash/case_36.csv | 6 - .../afl/20250211_csv_fuzz_crash/case_37.csv | 10 - 
.../afl/20250211_csv_fuzz_crash/case_38.csv | 6 - .../afl/20250211_csv_fuzz_crash/case_39.csv | Bin 181 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_4.csv | Bin 171 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_40.csv | 4 - .../afl/20250211_csv_fuzz_crash/case_41.csv | 3 - .../afl/20250211_csv_fuzz_crash/case_42.csv | 4 - .../afl/20250211_csv_fuzz_crash/case_43.csv | 4 - .../afl/20250211_csv_fuzz_crash/case_44.csv | Bin 209 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_45.csv | 4 - .../afl/20250211_csv_fuzz_crash/case_46.csv | 5 - .../afl/20250211_csv_fuzz_crash/case_47.csv | Bin 204 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_48.csv | Bin 241 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_49.csv | 3 - .../afl/20250211_csv_fuzz_crash/case_5.csv | Bin 171 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_50.csv | Bin 157 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_51.csv | Bin 201 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_52.csv | 5 - .../afl/20250211_csv_fuzz_crash/case_54.csv | Bin 234 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_55.csv | Bin 232 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_56.csv | Bin 288 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_57.csv | Bin 288 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_58.csv | Bin 319 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_59.csv | Bin 239 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_6.csv | Bin 171 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_60.csv | Bin 236 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_61.csv | Bin 236 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_62.csv | Bin 2371 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_63.csv | Bin 2399 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_64.csv | Bin 261 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_65.csv | 27 - .../afl/20250211_csv_fuzz_crash/case_66.csv | Bin 268 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_67.csv | Bin 291 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_68.csv | Bin 252 -> 0 bytes 
.../afl/20250211_csv_fuzz_crash/case_69.csv | Bin 233 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_7.csv | Bin 171 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_70.csv | Bin 272 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_71.csv | Bin 251 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_72.csv | Bin 251 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_73.csv | Bin 810 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_74.csv | Bin 236 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_75.csv | Bin 231 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_76.csv | Bin 232 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_77.csv | Bin 87 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_78.csv | Bin 92 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_79.csv | Bin 106 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_8.csv | Bin 175 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_80.csv | Bin 85 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_81.csv | Bin 101 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_82.csv | Bin 90 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_83.csv | Bin 101 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_84.csv | Bin 455 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_85.csv | Bin 449 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_86.csv | Bin 405 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_87.csv | Bin 397 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_88.csv | Bin 412 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_89.csv | Bin 393 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_9.csv | Bin 200 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_90.csv | Bin 421 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_91.csv | Bin 446 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_92.csv | Bin 397 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_93.csv | Bin 442 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_94.csv | Bin 444 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_95.csv | Bin 264 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_96.csv | Bin 397 -> 
0 bytes .../afl/20250211_csv_fuzz_crash/case_97.csv | Bin 373 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_98.csv | Bin 270 -> 0 bytes .../afl/20250211_csv_fuzz_crash/case_99.csv | Bin 322 -> 0 bytes .../afl/20250211_csv_fuzz_hangs/case_0.csv | Bin 42 -> 0 bytes .../afl/20250211_csv_fuzz_hangs/case_1.csv | Bin 42 -> 0 bytes .../afl/20250211_csv_fuzz_hangs/case_10.csv | Bin 4922 -> 0 bytes .../afl/20250211_csv_fuzz_hangs/case_11.csv | Bin 4952 -> 0 bytes .../afl/20250211_csv_fuzz_hangs/case_12.csv | Bin 4922 -> 0 bytes .../afl/20250211_csv_fuzz_hangs/case_13.csv | Bin 4915 -> 0 bytes .../afl/20250211_csv_fuzz_hangs/case_14.csv | Bin 4918 -> 0 bytes .../afl/20250211_csv_fuzz_hangs/case_15.csv | Bin 4933 -> 0 bytes .../afl/20250211_csv_fuzz_hangs/case_16.csv | Bin 4911 -> 0 bytes .../afl/20250211_csv_fuzz_hangs/case_2.csv | Bin 42 -> 0 bytes .../afl/20250211_csv_fuzz_hangs/case_3.csv | Bin 48 -> 0 bytes .../afl/20250211_csv_fuzz_hangs/case_4.csv | Bin 58 -> 0 bytes .../afl/20250211_csv_fuzz_hangs/case_5.csv | Bin 42 -> 0 bytes .../afl/20250211_csv_fuzz_hangs/case_6.csv | Bin 63 -> 0 bytes .../afl/20250211_csv_fuzz_hangs/case_7.csv | Bin 4909 -> 0 bytes .../afl/20250211_csv_fuzz_hangs/case_8.csv | Bin 4909 -> 0 bytes .../afl/20250211_csv_fuzz_hangs/case_9.csv | Bin 4920 -> 0 bytes data/csv/afl/4172/case_1.csv | Bin 2398 -> 0 bytes data/csv/afl/4172/case_2.csv | Bin 229 -> 0 bytes data/csv/afl/4172/case_3.csv | Bin 257 -> 0 bytes data/csv/afl/4172/case_5.csv | Bin 240 -> 0 bytes .../sql/copy/csv/afl/fuzz_20250211_crash.test | 529 ------------------ .../sql/copy/csv/afl/fuzz_20250211_hangs.test | 82 --- test/sql/copy/csv/afl/test_fuzz_4172.test | 22 - 155 files changed, 942 deletions(-) delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_1.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_10.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_100.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_101.csv delete mode 
100644 data/csv/afl/20250211_csv_fuzz_crash/case_102.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_103.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_104.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_105.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_106.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_107.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_108.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_109.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_11.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_110.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_111.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_112.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_113.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_114.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_115.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_116.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_117.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_118.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_119.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_12.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_120.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_121.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_122.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_123.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_124.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_125.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_126.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_127.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_128.csv delete mode 
100644 data/csv/afl/20250211_csv_fuzz_crash/case_129.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_13.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_130.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_131.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_132.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_14.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_15.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_16.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_17.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_18.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_19.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_2.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_20.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_21.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_22.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_23.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_24.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_25.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_26.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_27.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_28.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_29.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_3.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_30.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_31.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_32.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_33.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_34.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_35.csv delete mode 100644 
data/csv/afl/20250211_csv_fuzz_crash/case_36.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_37.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_38.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_39.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_4.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_40.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_41.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_42.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_43.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_44.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_45.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_46.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_47.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_48.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_49.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_5.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_50.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_51.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_52.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_54.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_55.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_56.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_57.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_58.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_59.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_6.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_60.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_61.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_62.csv delete mode 100644 
data/csv/afl/20250211_csv_fuzz_crash/case_63.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_64.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_65.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_66.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_67.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_68.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_69.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_7.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_70.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_71.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_72.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_73.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_74.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_75.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_76.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_77.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_78.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_79.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_8.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_80.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_81.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_82.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_83.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_84.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_85.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_86.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_87.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_88.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_89.csv delete mode 100644 
data/csv/afl/20250211_csv_fuzz_crash/case_9.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_90.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_91.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_92.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_93.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_94.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_95.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_96.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_97.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_98.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_crash/case_99.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_0.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_1.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_10.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_11.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_12.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_13.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_14.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_15.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_16.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_2.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_3.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_4.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_5.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_6.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_7.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_8.csv delete mode 100644 data/csv/afl/20250211_csv_fuzz_hangs/case_9.csv delete mode 100644 data/csv/afl/4172/case_1.csv delete mode 100644 data/csv/afl/4172/case_2.csv delete mode 100644 
data/csv/afl/4172/case_3.csv delete mode 100644 data/csv/afl/4172/case_5.csv delete mode 100644 test/sql/copy/csv/afl/fuzz_20250211_hangs.test diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_1.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_1.csv deleted file mode 100644 index 2dff96bfb5f9b182d598d7cfb9ed20a53a95b5be..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 171 zcmXpsGUhUb;{3c+Ae)h&Sv^IaQQldagNcEGg^AIC#jcu5JyG3CLEY1@-qqbTNL@!g zN!_Yiy(}>YC@K&a83zS!VA==5Q!jJNQMa#E;!=XCL}n`>bh#tx;xaT)GBhv%QU(GH M28=+;KnH3#07#T7&;S4c diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_10.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_10.csv deleted file mode 100644 index 1e43526015ac6450d9ea8a7f25505492a6628370..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 209 zcmeZK<<#ZYWz=U@Pf=%-XOw2J)#U^O1_nk32n&T&FH6i(w^C4ui;RoQj{^aBFzEv( zmAI6up(?>_2F3=suDG}eu&z>M0MZRMA~6SIVmyQ!w-913m=s`OcopL6 Y9%2OsjJ&J{hCs+=XrNdE;z>Q)Ns!66u?1cf?> zXp|-9D1gq;08fT;u8* e2hyWn3DnP(&BBo7;h~od(+w0=0IOF984Li$;)NM g%f-vfrJxWO83zUKVA==9)d3lg!YN7yS)>l70Hy3CLI3~& diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_103.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_103.csv deleted file mode 100644 index 627236618725faeee0f2a66fdc8dfaf2ebff8b19..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 86 zcmY#Za8yXnNGwt}Qcy2TOq1u(;^bstV5sMWgG41OLmj2${2V40o-Fk^D9`~ZFG^;J L^FbE@>Q)B;4d)YR diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_104.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_104.csv deleted file mode 100644 index 3ffe8f761076d228091c82edc09de307cc15f5ff..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 86 zcmY#Za8yXnNGwt}Qcy2TOq1u(Vt{}+AkYDcl`HUaS@LpO@pAF%M#P%IkXrcKpCRGD79RH rm&=lu%ZitaSBXnOAuciw3fv(ygz_nbit2#ODN6Q%seSP=?B(FP9ZB7q1eRfM5wfq*-h^Z}D`>Sb;@>h`rtTuRkYm0&glBO?P$S6o~KSXU`B0O{1ObReCyb5u353vFRMqX9}Lm=cbG|)L`V88_h05se=hyVZp diff 
--git a/data/csv/afl/20250211_csv_fuzz_crash/case_110.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_110.csv deleted file mode 100644 index 48f32730da7f3dc915d56e92b8784e748387b441..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 116 zcmZ=%NX|$sQr9uGG`D1Q;FRSA10x0Xvcxob4lM>SP*!jRD@!d`;N{Tb<+9@C;#J~O aP>73+g93Lj?E~ZLfb=tBfTCoOaq0jy9u1bwdLn;L-t81_lbr8Hq*eMhfa>iD~j2TD(eJ3JP(NV3k0^9nA27adkk- N7%@OmG6N8(0|1OQ6t@5X diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_112.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_112.csv deleted file mode 100644 index 73296b846c3bb9a5983b32a841e35a29f892f0c0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 134 zcmY#Za8yXnNGwt}Qcy2TOq1s@Ff>v%umS-~FbO4iS%DJTR#sXJV4w`umRhdB%Vo*S jWyQD diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_113.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_113.csv deleted file mode 100644 index 91f981ac0cc27ecc6756b472a295f0772c608e51..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 128 zcmZQ%1cDcg43?Ib%-?_n8sO#CRrzmeSp@`^mfBFsYW1?j9Ca&&S|u(eOG{p`s2-X| aOG_(-irN$gC79w0uyQC*y%MAxWHQ)N1N?b~YhK5`U3UQHf Saj|i6>Xw#z2t7zFkahqQA{E#G diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_115.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_115.csv deleted file mode 100644 index e58f1e9f64aa92c00ad58df71c909366c8275960..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 113 zcmZQ%WMnXiQ!ldw0+s)kmQ_^(RhE{#Ag-kzm_#Oc83Z7Ts(^~Lp)zhc>h`rtTuSaC fLFx<*Zy6XE1U^HgK*qR2jQL;Bz+lPCtE&P4BGDVB diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_116.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_116.csv deleted file mode 100644 index f30120d3aa53c5e68cd1bbc85b14f504650971eb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 159 zcmXriDJd#V$mdNe83IL-=FfIT9 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_117.csv 
b/data/csv/afl/20250211_csv_fuzz_crash/case_117.csv deleted file mode 100644 index aa7e242fad58bbc5cfb7c8d9f456b8c407383cd9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 405 zcmZQ%!~+NBgSs58nlN;BB%a&mE^gE%1Y0TIZCFfcGQFml0w6(a)!3lj!lG*DM( z`CreZo~RzKpzi5c@9Jg=2XT>cartp^c5!j;alE{`P;o6@US2R$$I>z`4rmJl$mX~> z^)j~{b^BT+E~RRXGS_BxtNTDU5&-+b2IPnP)#_!5IqFsl3Q!w>y4^uW05w=utBYX; z%3@+=3=QfKA2K{e_!Q)pxHuV*O@FvDK*rR72rxjf;y{K90dd*?1~g?7jE0Ivh6Y@Q RhK9f(s|Qj*z{M382LNjHN|OKp diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_118.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_118.csv deleted file mode 100644 index 12c1eefed8aeda7b8b899c4fcc22e88442b2ba70..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 134 zcmY#Za8yXnNGwt}Qc#a$U|@)gi}Q(7h6<*ZEAVnz@^V@6a`7s0DJaC{s9PzhSE?)2 nDsd@QtCuAL`Rb8CEnwgdCVim1I2{F`0{oyVu_zg6CxbcwCtW4~ diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_119.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_119.csv deleted file mode 100644 index 79fea4b0758b2a85f0077dc5ba92526b45a2a78b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 93 zcmY#Za8yXnNGwt}Qcy2TOq1u(VgLeOB`yVpxX3sl0P>W9YCz05pSU_gE|0|!V>WS diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_12.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_12.csv deleted file mode 100644 index 4c1a8352beb22246e53a5cd2b5c696744fca8535..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 241 zcmXpsGUhUb;{3c+AUhb&H)QhiL#)7nk(brL5D2*p4Rj0*47i{G02tahzyJUM diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_120.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_120.csv deleted file mode 100644 index dc91aa419dda8f81202f056122a3368eb813209c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 148 zcmY#Za8yXnNGwt}Qcy2TOq1u(Vqi!tN{tHvaf`}X1z5p=*T4b7P~hdVCcu pN{!P2DpM>fM~EtMDJaB6#zBEQjDd^xiDQIFfecD4N@f58bpVW-Bq{&^ diff 
--git a/data/csv/afl/20250211_csv_fuzz_crash/case_121.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_121.csv deleted file mode 100644 index 416d4de5e3f6fa6a8f82606e27787d3778dc14a1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 228 zcmX|)y$ixH5XJ8>OQJ(}2TMS13O1V*#6h%6$2zzih#d=VS*qwCE@&>1{@~tyJa~tg z`C_})Ztie1qn>buOV~NBUpk|-MgT}l?4XMA?@+i7RF@@RnC~n!BDYvZc(6UB!9Z_5 zMm*+t6cDU6cBJs0B9eVh5!|LFOA()T$EK?jpa6X3E8`bJ&YzS#GlkPY+x=7C`~a1B BD$M`@ diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_122.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_122.csv deleted file mode 100644 index 29da7018bbd835f6eb234d0ea9d5faf4807dded3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 220 zcmZQ%gn<8yKB}C$+`5eV%<3uXjPi`q47R$Qob{X#zyhLlpd1LLUY3}nZl$0Q7a13q z9|r>NU=m1hfq)P;P!p#F16)eg>SYZuBkF+KV&g&~M)<%C0@4Bu3=AQz?jcrSz{tz$ eZ)jj>z-7n-0$_^}gt}o|oO+pCj=Fs<$QA&eh9_PC diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_123.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_123.csv deleted file mode 100644 index 3eebaa16b6aa47c88857f1169f10054a423d13d9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 264 zcmZQ%1Oo;zc*DS82?9(kJk{!n>Q?Gr!Tx^gwE~QW)k^jGIq``~Rt7ps$v{>TkX6eS z?Fj=eKnA)POb(UK%P&z#RLCmM&x_?s&Ig$VG}RDps!=UhHP{A_C?ktpwK`CVx|M=` za7a+7bBIP+Vva(XW013l<62GiT7_zLpn$qntrC|KlnJ3w?S%5I6e^$!85)2_Rwy88 F0|361M_vE` diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_124.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_124.csv deleted file mode 100644 index 955e7021fe1e00e1fab21209acb1864d3fc2f49c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 415 zcmZQ%1cU!T0AYbihR+b{DO}7_3<@A3mM{s0xX3sta8Hhli;c4Y@nEuWnhOT37#SE? 
zm@ojNfx0@&|9U3%M0G0#bx*%~S2s&IfLdV}7v~hi1&KL83&Ebq#{^*W zz{WzY4GI<$({c9j^l{Yz1&0|lOvK6<7#IY=o@GD;86(I8adBK+adDLpxA}k#1iH{Q i7w$r!0~ifO84VSU3=Oyp4GkF>8tQ=*5O8tbjROE@LR3Bg diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_125.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_125.csv deleted file mode 100644 index 9c24c33b7bc7f8f769d417ddff855a26ad2b6d99..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 418 zcmZQ%#0D5XgGCvh!X+)mpa3Fb36oHWi;RN;_hb;m2SmVR;WQTvSTQm%urOf&Mgw(q zmjCrk>WS)B3hJJI^{#G~Z~(Q!E-ubJj+a*#DDLVS9AXZnwT)}Q%=jvxrS2e`_LVx%D!iGb$1O?*HKSWx2jezOU#LjtLB6Q^|btakew-T5gORmAE-p@()AgCvQ`8yd8KoKSSF0-k%>imtU|gnOtzMFuo2qUFrix1va|=qU)OCO+ zrWPj`WfqiV=H~$=%My!{GZKr`Ye9;2kmR_G42%qP4Ge)O#K6D`jC67{Q}Qy?GfEV4 F@&VzKR_y=) diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_126.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_126.csv deleted file mode 100644 index b0e23e1f3f15..000000000000 --- a/data/csv/afl/20250211_csv_fuzz_crash/case_126.csv +++ /dev/null @@ -1,30 +0,0 @@ -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" -1 "" -10" -199;1 -;a "" -10" -199;1000;"a "" -;a99;1000;"a ""000;"a "" -;a "" -10 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_127.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_127.csv deleted file mode 100644 index 7c1e2505553f..000000000000 --- a/data/csv/afl/20250211_csv_fuzz_crash/case_127.csv +++ /dev/null @@ -1,28 +0,0 @@ -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" -1 ""#10" -199;1 -;a "" -10" -199;1000;"a "" -;a99;1000;"a ""000;"a ""a "" -;a99;1 \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_128.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_128.csv deleted 
file mode 100644 index ae9fd5dd6f8f0d3ff80db4a464ad756378f2ef53..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 279 zcmXrCw6r!fFfg!IN>or%;<5&lh6YMphJ#ZA46$ diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_129.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_129.csv deleted file mode 100644 index 475ffa66fb2c..000000000000 --- a/data/csv/afl/20250211_csv_fuzz_crash/case_129.csv +++ /dev/null @@ -1,30 +0,0 @@ -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" -1 "" -10" -199;1 -;a "" -10" -199;1000;"a "" -;a99;1000;"a ""000;"a "" -;{{{{{{{{{{{{{a "" -10" diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_13.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_13.csv deleted file mode 100644 index 738932afa8f621cb8c6703b3a12350cb65c74fe5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 186 zcmXpsGUhS_;}49eK%ZGXMV-M92pAa{locEm(o)NTD*mIYLguO03NRX0D<$XW#3w3Q z8R&pnNkCRDS9D%}i9(`6R&jn_EEimfAxMdi0$5$L5+lgQI5;SUu)r3?sT&#q0hf-B OdXjofwR%}%jyeFSPbPf; diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_130.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_130.csv deleted file mode 100644 index d476d82e98e5d29ceaf19bb50b4dfd79419e5127..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 759 zcmcJK*-jKu5Qb&BAS4=Jz@%x^$U`{chj;P>i_G z=gvo|)n{hs8cFAy3yT*nUTR%lx^nf}^2+raH*eiuy|Z?=eQ*8##)F5C9(OjMY&|tE zx4XTaXU|`}?7w1fM7cz{M7cz{M7cz{M7fkYRr=1C za*1+j87P;!MNuh&A}E3)D1x$$vW;4mSYDa0%va|B{dew}uPm?3SC&_N!#7#24Bv<| z;*2;W&WJPOj5wp8;Tyi;8@}N?zLW3xj_>%6@A!`I_>S*e_V3Su!2?7m392Xbo4#aVBK43DAsh)wwu9{0d zQQb;G-P5n$)!j8nT}M4h-KrXBVh&JLAubZ46iugknOly!eXSCg5=Y*WQJ+~o oMV(QeQJRB^mw^RrlmO6KA+GKrLFx<*Zy60m84V2$3=Oyp0sLDqbpQYW diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_132.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_132.csv 
deleted file mode 100644 index 2bfd1d477eab24c69f8fd82aa60f0548a9100d66..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 291 zcmXrCw6r!fFfg!IN>or%;<5&lh6YMph7?IK>NBgSs58nlN;4}8R;#Dw=c`*OsHfzY zCgr56C$KZB>!?d}FhBuNmdnsc$CS&^Ku^cOfXh+O2n1Y!n2Cj_T0K$SO5H1%(LkO< z3#x-f2O{d}7vk#f8l7C_}wgfYGp8DLFqU zK2gcaKnKi90h2n( zuA`o$ZdI*bmY4$+6^M(Bg93Lj?E~Sdm$~Jr+t(^_DZx}CvlX!G;xaT)GBhv%QU(GH M28=+;KnH3#03I_chX4Qo diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_20.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_20.csv deleted file mode 100644 index ac53beaa9544..000000000000 --- a/data/csv/afl/20250211_csv_fuzz_crash/case_20.csv +++ /dev/null @@ -1,6 +0,0 @@ -123 -123 -12ð'}1{"col_a":0,"col_b":0} -[h`r', 'yarchar', 'varchanot aþjson] -{"col_a":1,"co, "col_c"/'d^^^^^^^^^^u^^^^^^^'10000 -,,'b'\{'val' \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_21.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_21.csv deleted file mode 100644 index 7a4be8dbbd6f..000000000000 --- a/data/csv/afl/20250211_csv_fuzz_crash/case_21.csv +++ /dev/null @@ -1,6 +0,0 @@ -123 -129*99999999999999 - nda999-93 -12ð'}1{"col_a":0,"col_b":0} -[not a json] -{"col_a":1,"co, "col_c"/'d^^^^^^^^^^u^^^^^^^'10000 -,,'b'\{'val' \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_22.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_22.csv deleted file mode 100644 index 2ddf7b2c3297a4eb76023ad4664a9f57e0ac9df7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 176 zcmXryP%kY>(={|xm+$7_xL{<=WeCO}7>NbywE~QW)k?|vIq``~Rt7p?RuYg^%N3oM zU!stxkX4+Y7s~}#VhB>AqX1Tyti-6#tez4F2c-}e*z!1aLjxe-($P^*Vu-0$FH6i( F2LJ)5B@_Sv diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_23.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_23.csv deleted file mode 100644 index 9e40b5220838..000000000000 --- a/data/csv/afl/20250211_csv_fuzz_crash/case_23.csv +++ /dev/null @@ -1,6 +0,0 @@ -123 -123 
-12ð'}1{"col_a":0,"col_b":0} -[not a json] -{"col_a":1,co, "col_c"/'d^^^^^^^^^^u^^^^^^^'10000 -,,'b'\{'val' \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_24.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_24.csv deleted file mode 100644 index 5a0c0c194278..000000000000 --- a/data/csv/afl/20250211_csv_fuzz_crash/case_24.csv +++ /dev/null @@ -1,7 +0,0 @@ -123 -123 -12ð'}1{"coolÿÿÿ2} -;STRUCTl_a":0,"col_b":0} -[not a json] -{"col_a":1,"co, "col_c"/'d^^^^^^^^^^u^^^^^^^'10000 -,,'b'\{'val' \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_25.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_25.csv deleted file mode 100644 index 99e0a7375746..000000000000 --- a/data/csv/afl/20250211_csv_fuzz_crash/case_25.csv +++ /dev/null @@ -1,7 +0,0 @@ -123 -123 -12ðn] -{"col_a":1,"co, "co'}1{"col_a":0,"col_b":0} -[not a json] -{"col_a":1,"co, "col_c"/'d^^^^^^^^^^u^^^^^^^'10000 -,,'b'\{'val' \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_26.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_26.csv deleted file mode 100644 index 48473ccb3069..000000000000 --- a/data/csv/afl/20250211_csv_fuzz_crash/case_26.csv +++ /dev/null @@ -1,7 +0,0 @@ -123 -123 -12ð'}1{"col_a":0,"col_b":0} -[not a json] -{"col_a":1,"co, "col_son] -{"col_a":1,"co, "col_cc"/'d^^^^^^^^^^u^^^^^^^'10000 -,,'b'\{'val' \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_27.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_27.csv deleted file mode 100644 index fde479ec47d4..000000000000 --- a/data/csv/afl/20250211_csv_fuzz_crash/case_27.csv +++ /dev/null @@ -1,8 +0,0 @@ -123 -123 -12ð'}1{"col_a":0,"col_b":0} -[not a json] -{"col_a":1,"co, "col_c"/'d^^^^^^^^^^u^^^^^^^'1~000 -,,'b'\{':0,"col_b":0} -[not a json] -{"cval' \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_28.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_28.csv deleted file mode 100644 index 
99c23520e19a116ab8352b598cb944bd7e684589..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 175 zcmXpsGUhS_;}49)0`*z}M#E~QPhM`)eH;_0_p(0zaYi{ diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_29.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_29.csv deleted file mode 100644 index a4787dbb0d68..000000000000 --- a/data/csv/afl/20250211_csv_fuzz_crash/case_29.csv +++ /dev/null @@ -1,6 +0,0 @@ -123 -  0123 -12ð'}1{"col_a":0,"col_b":0} -[not a json] -{"col_a":1,"co, "col_c"/'d^^^^^^^^^^u^^^^^^^'10000 -,,'b'\{'val' \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_3.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_3.csv deleted file mode 100644 index f7dbe5492a56cc7bb3e271f4c85514b3a005f2f8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 160 zcmXpsGUhS_VtpXa&r1ce8TpyjQ`8yd8LPR}6VfbDFYp-;Q;i# BC}RKs diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_30.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_30.csv deleted file mode 100644 index 733a0aa58558..000000000000 --- a/data/csv/afl/20250211_csv_fuzz_crash/case_30.csv +++ /dev/null @@ -1,6 +0,0 @@ -123 -123 -12ð'}1{"col_a":0,"col_b":0} -[not a json] -{"0,"cocol_a":1,"co, "col_c"/'d^^^^^^^^^^u^^^^^^^'10000 -,,'b'\{'val' \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_31.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_31.csv deleted file mode 100644 index ac1dfd8e50a13afb2c9069b16da0c24e46325fce..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 189 zcmZQ#G?3@uW`F<|fokNGwpV6<{>1R!Yt{00J%@LjwaH z!}=s8D}!3D=)C+Ag+zs{;{3c=E|8L(_(UZuLmePXM*+-DR$|m=R!@n8gHi|!Y`ErT+w;?C0sg&1_nBY z^#%r9hCtc?1{4w%vWoNbV!7a24MAFU6u|nEl^FGz)l=f&pcKLaTN7{EiGhKIiP3EHMWtst^|$2LOW@FP;1=M&Cq*;K0fg!}z ZJtRn-q2Vo~p(vxFp@E?Rm!Sbr4gl0pE!6-3 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_4.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_4.csv deleted file mode 
100644 index ccc7d2986d19d813a245c83b24f4e6518e7d6fdd..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 171 zcmXpsGUhUb;{3c+Ae)h&Sv^IaQJztogNcEGg^AIC#jcu5JyG3CL0!wQ-qqbTNL@!g zN!_Yiy(}>YC@K&a83zUKVA==5Q!jJNQMa#E;!=XBR8&B)trQfn>*6vrP%<r(Wik!)srw#HCbS3giO88kkZB0R{#TTa1x0k%5JQK`A*uCq7BZ z%BYE}niIr{OH{Hl)B&<|6u@%HN{sr<>M1}&b<~s8t#Tmd#l=AlhM5befd&JO&QS*d DACep} diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_48.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_48.csv deleted file mode 100644 index 947747bb1c464de190e13f0f291157e8c0099300..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 241 zcmZQ%LL6|k1Bhf~lwnk_ z6<{>1R!Yv#iBD9rGSL1HVkH4twOrA8`6czP?ygDdI_gR4R@Lfdi8<<=K)}GrPy?b7 z04&15ARuX_prDwjkX4+Y7s~}V)ldg$zK#Oel4K=DeP;FAxVY>%2q=Y6VCTSyJOv&e HeUNPcA@evD diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_49.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_49.csv deleted file mode 100644 index 199641f8f80d..000000000000 --- a/data/csv/afl/20250211_csv_fuzz_crash/case_49.csv +++ /dev/null @@ -1,3 +0,0 @@ -999+9899999999 not a json] -{"colËËËËËËËËËËËËËËËËËËËËËË 01_a":1,"co+ "col_c"/'d^^^^^^^^99@9999999^^u^^^^0苹æž0 -,,'b'\{00^^^'1'val' \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_5.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_5.csv deleted file mode 100644 index c754534b4502b986fb0e49b43982f99450eb5dd0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 171 zcmXpozyj1$)EVU&r8$@w7+9DX4Or}|xzrQYtrXNf{pwxaU4zth)RWY$s@2O9bAX}( zaglLQ;LacbWcfgN>Sb;@>h`rtTuNY-|Ev_?Y@jX$BwcwR4G>*ih6YN81_nUNK!Cx3 M5l9*6K#Yq608SGt7ytkO diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_50.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_50.csv deleted file mode 100644 index 0cb4a24515a27da97c355caf071ae0780e6f5dd7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 157 
zcmZQ%!~_1lL6>A;VKl5(O3u%TPgJrp(Ebl%B>`ErT+w;?B?^fOS;hHzv1_q_WneKb iB~CEl0+Xe2U;wrxPTkS~2)J}~)RWX>s@2O9bJPLp{wLo6 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_51.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_51.csv deleted file mode 100644 index 144ac1887d73bb137a8569634ff53718ceaba3ef..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 201 zcmZQ%WMp{3NDQ#FL{_B==DubCi>cQNFd9}XCFkeFe+Gg?B`ZUnIm!7tV5Wige~?6y zl9fR%S9D%}yc+~4Br0ST=jX+8K{SFj0gcd60GpGn#Hi1#o)QNKr4SawPoRx)>V^hD Sz@?+3o}?aAtzMRxqYeNM0WIPH diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_52.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_52.csv deleted file mode 100644 index d702dcc8f36e..000000000000 --- a/data/csv/afl/20250211_csv_fuzz_crash/case_52.csv +++ /dev/null @@ -1,5 +0,0 @@ -a json] -{"col_a":1,"c'}1{"col_a":0+ÿcol_b":0} -[not a json] -{"col_a":1,"co, "col_c"/'d^^^^^^^^^^u^^^^^^^'10000 -,,'b'\{'val' \ No newline at end of file diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_54.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_54.csv deleted file mode 100644 index b57920716e6cb5c013518bbe15cf08a20b51a6ba..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 234 zcmZQ%1OtYr5E@8YLVzKV;^o!l%}Ytt3(thhLKzI`0*val0*r>$O3C>-@rg=S2HO9V zfUG1StClM|FTX?~Q6Z~1KQERGs6>bHKckN-r!KcHqdv2GiaMiwzchobE+-hgV202j zvVj54RxeA;Q4dm3h>MJit5xDs%8vs9cQEM#CM_-9-3{GqGF6RKT}mu1K*g ztF}Vra;YbR6ngsAySlpusq3gGsqX@+NX&s~h>OdQ0|ED`Akrr;2t=uux#g(a*D7%- z0adyJRo;iVNe4tRI6ycrLGI(_wX_5xowy)%hK9F{hN6syhDL@4T!zSiiz_Y;0Qhz@ A9RL6T diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_56.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_56.csv deleted file mode 100644 index fd74276fc29f8a24dbe37c29362abbbfad62c68b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 288 zcmZQ%1OtYr5E@8Yf&c>p^9#5LV~RSXJfk!NBLf2q1EYbuI*VO3mwKX_m4dpbU%jil zYmmB*dbzrJwR%}%PF!4^dIJLk1CWk$2N7{TaX}zTz055~-M&_d3kZQKU31i}?gM#N 
zKsG~zx|M=L9J8WewR&29K9HlHl3$vXld4|J%?vbKn(;rQk1D4ww=SbTvpU!{KOfub zav}qTxX8G;2_TCYB{_gDWdKpi3|qHug*pT14lqEt1L&lDun^3h2>HYub!!EXTLHIn BLOuWh diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_57.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_57.csv deleted file mode 100644 index 22a39d8efc129a9083c90c825b443414e65c3e7c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 288 zcmZQ%1OtYr5E@8Yf&c>p^9#5LV~RSXJfk!NBLf2q1EYbuI*VO3mwKX_m4dpbU%jil zYmmB*dXl<%wR%}%j=GhCLR@4V7`TH;pSU0ptzPDqqi$cT#07*vm99XQ_klbsAe*5< z-6}yLj#*K#T0Jd4A83SnN`7flPO5q>H#5*|X~zGIKB}C$+`5eV%<5p*{CsSy%ZUuY wmga*jVwB|g$H2ep^9#5LV~RSXJfk!NBLf2q1EYbuI*VO3mwKX_m4dpbU%jil zYmmB*dXl<%wR%}%j=GhCLR@4V7`TH;pSU0p4U@oV#ytAkzh^RcZiCo%wA3O1XNA0#<| z9${bvQpyZlw{DGtxH8TM?5;TVGPfLcd!WOVfJ}&6Vcv*40`dk(9_USL1&|s5DBnh? diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_59.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_59.csv deleted file mode 100644 index c04e3fedf310e26e11fc8f149a2ff7e12a69e416..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 239 zcmZQ%WMpIjqNhL_0xUs*fs6SCOn{LwMV(QeQJR5~fq{jA(Li0D#jcu5JyFd{LEY1@ z-qqbTNL@$0MBThvy(}?D-AX|rj`2UEFDs)yvpSMaCPo9GKA;&4>WM%_NLrKBt*UiF zT77|9Bje)o<3PY2O!|PyIQ25O9CiCzB`zhHB@i}Hr#crGCl?U#fK_Hdb=8CDN)RCc Pv@yihJtRn-q2Vn6i}W)# diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_6.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_6.csv deleted file mode 100644 index bcb439094c5584d2b9141561c8c54a8ad18e8ae3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 171 zcmXpsGUhUb;{3c+Ae)h&Sv^IaQJztngNcEGg^AIC#jcu5JyG3CLEY1@-qqbTNL@!g zN!_Yiy(}>YC@K&a83zUKVA==5Q!jJNQMa#E;!=XCgs^MCx)iYM;xaT)GBhv%QU(GH M28=+;KnH3#03_ckl>h($ diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_60.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_60.csv deleted file mode 100644 index 3d726a869239ab47a0376d4c77df4eff5ab3b781..0000000000000000000000000000000000000000 GIT binary patch 
literal 0 HcmV?d00001 literal 236 zcmZQ%1OtYr5E@8Yf&c>p^9#5LV~RSXJfk!NBLf2q1EYbuI*VO3mwKX_m4dpbU%jil zYmmB*dXl<%wR%}%j=GhCLLB3NMqgG&eP(qeolJ}dKz)qri9i)7`mCyTLHc}w`Xb}v z^5a0j9ZdRw$vE{gw;Xl*S|u(em<YqJmtr)~)}rK$#3FSa1q`MD0|P^dt9wY0 IIzz)-0CDLv!2kdN diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_61.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_61.csv deleted file mode 100644 index 05ba66ba7e8034bb3e169315903736ccd32a3013..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 236 zcmZQ%1OtYr5E@8Yf&c>p^9#5LV~RSXJfk!NBLf2q1EYbuI*VO3mwKX_m4dpfU%jil zYmmB*dXl<%wR%}%j=GhCLLB3NMqgG&eP(qeolJ}dEOuFp>WM%V>Ygxt_5anas&zs7 ze1ZBRpX4*p diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_62.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_62.csv deleted file mode 100644 index b08e116c19e39d67f543610e3390bf8063c85a78..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2371 zcmd5--EZ4A5SJ1Uh>Qhj-v;bqtR6-Tusu??V_U#lWSN_GXp%McigXxk(-Li~kVQ|T z>}5{S{U!VF1`OEi9tU(sO8$sztk_F2!l0!4_}%Z0ceH8hW&EB!VQ;Br82f*$B&Lte z&&*E^(=ZB#NiZ%vTof_;+(r25`J47>`x2M%05=wR#zO2O)HUClXCIkg?&A&5J(GZ+ zi5Xbj@pBht<#z^tuex2)RqIsm&-C8i46-)Hv&SKRbZZy)3Lh^ND*pDx#W}vcXVQi3 ziLl3Pmz4C|z<#T6S#d+M{WU-)}bLw*(8@gJ?~^PKZrRvOrtE*jvPf0Zb6eS1P zjOjDUmD6!2cL8h>``qunqmgWsR5pDT-z~b@WXX9WBFMpZ#$s>GVqCmEDBvF@{$wBb zw<=)C%-hPd6p@u>X03CvHc2R&)*aH;4x+g%$5sYKLQk*0ZnsX(Ue+(k^IE5U^+WCI z@zrJkD!C99T6txS#(J#tCIb&mQM{k~K3`>>qS zyLbz`_gbWJ#KPbg-iyG&~xyQ3lV@NSProm<1u4v)C>n zd-IGYOnPHzLX|Rgsr_?^A%CUh%3c#AU!CgFac>D#bhwBrl8MaO23=vfY+zJ_;fGLVl{_lZsCD z_B}&h&ZoRa|82_cA|)p!`i2mADXCk-fd8tUx5j^-a`u6z9D3V?sfS@@Vfa6u@)0yA z+*qoYyfl$w!go!1!^sXOmbeXj#&wsJo@Rq1qypFtI0zE5{S; zNy3t>+DG7;OC)Z16nG9KcmyP*a%SvoehBoETJhLE=R5yDF-?6LzhzH2o9Z%*z2C1X zW`NDx=DUVz7zM*57#BXA7cqO^L-@hz^VV_e43}^Z*XMZ3BJ3g5F<+V|H_f|ycun$# zNkGoT3=kV~-n^{jPC)Kyrz1M*I@aeyeRkH3tiX7BKf)hf*~J@$H|I(#{_OnGDZaX8 z(z)%6sLO1Rl=NHAeyO8seo)Gu*XMpL)8d`wr{)^B)MsI@@YlSE5S9sxQ|8N1#N9AZ 
zzCSrUJNonxV%BWVMmhb#X*;6@VZuRR9A}Z{R5lWmSc03MJo@6r!&U}mzOSfS10*Gf zuGWu>aO5%6+ z@Yh=zuw*7}z~1=CoY50u%{7zWBJ+JUV$&J0p+XcI)Cx z_u~G==a0{>jc zKmGCN-Cq~K{CNKJq4B~{H=)jZaIaaL+udhklEf?uzvJCx5^>u@b{|M6g>rDj;sB+5 z%42lNS&||)5J++ROb$7G_YyG~3?=G|L=MqpEMg>wVTyba%P{7#L=44zhGM2VO_6;b zPI-XB7)4=^C-PF;fWn79WHdlhfCi4ni5QDC)XlhTJTwA77W_I4(eMFv|<8RF`tQEPTYXT zi)x)N>@l_XDOotwUAWrZB<0Dql6u213jx)SzwzeeGmTXdz-qR$C|my+5j2K TP6*vs0{_q8zooas`r!W!s64qn diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_64.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_64.csv deleted file mode 100644 index 9050f01683ebf4f2e70e34fa4c5eeed7760abcc3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 261 zcmZQ%1OtYrKNVA2On#;KRN<*3`&Dsd@QGa5Q76h-PdaT)3u85o)98W`)UTW}e{IR=*c T5J!Pk!yO1T7XpChI>rG2cUC$Z diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_65.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_65.csv deleted file mode 100644 index a98344d21bb3..000000000000 --- a/data/csv/afl/20250211_csv_fuzz_crash/case_65.csv +++ /dev/null @@ -1,27 +0,0 @@ -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -11" -199;1000;"a "" -;a ""y^^z012 0^^^^^'0000 -,10C,1 -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" -199;1000;"a "" -;a "" -10" diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_66.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_66.csv deleted file mode 100644 index b7408c92520aa15dce7c3190e16ca05b96e749d2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 268 zcmXrCw6r!fFfg!IN>or%;<5&lh6YMphP07j)Mr*tQD>BAk!E0IU=U+sG+?na;4(DO zF*I-w2~uZhc*|%g%4le4U>ZxQz%oW_%z2A+&K BL3jWF diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_68.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_68.csv deleted file mode 100644 index 37a9ce79fbb141f91d187765647d099bb249da84..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 
literal 252 zcmXrCw6r!fFfg!IN>or%;<5&lh6YMphQvxR{%7=MWz=U@Pf=%-XOw1OWME)X0D}Ko zmb{iVuCBo$=1hzREOyn5>WS(p4CSc*J>b?pJaglLx`EelN z4kmrTWSn}LTaLPYtrC|KM5UEN?RyYg-Rgfm7Z)cN5b%IiWM0;T#5e(1!&^oG%S$`y diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_69.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_69.csv deleted file mode 100644 index bee400e35ba1bc4ed9609a9ecf2ac4c450e19af0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 233 zcmXpsGUhS_;}48jfT5uQx~M+0dWt%uJfrky#(IVZ28Q~5V4!ZLpzi5c?`l=8UY3{x zGvD0|9q1=>sO?)XUs*)a`4PxRk0H4ZU=nxD0iS42(>44UBaSEVzt-oM51> zh6YBeT!scZhGqs_FaR<%4rnL@01b7Fi}PWH@qx4e0|P^dt9!^@b%utwjE15hr^5^a E07D%zw*UYD diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_7.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_7.csv deleted file mode 100644 index e6b78bdd5fe06abf9b9d2552b2037835db408356..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 171 zcmXpsGUhUb;{3c+Ae)h&Sv^IaQJztngNcEGg^AIC#jcu5JyG3CLEY1@-qqbTNL@!g zN!_Yiy(}>YC@K&a83zUKVA==5Q!jJNQMa#E;!=XCoX%(fvkOF{*$39eWoV#eXkY-O P32*7ISfDv1%l434p1y6N#F?Fc&T98)W y3I!z1>Xn9t2&2?1)e(|8Ame}{VATxuAiW?iLV>!KIz$rUCUvM357>nY5IF#8Wj`1I diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_71.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_71.csv deleted file mode 100644 index 1f44de30dc3ca3c073e951ef0787b4d958261889..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 251 zcmXpsGUhTgGOktPQmR%jOUzLZQbz-f7(l&NfYGp8DLFrfkwvar9Z0KNDX0gB1cf?> zXaLPn2y+Z__HYc+RIh~^YE^--z>0wZ%3%QMgUZwbt+wK=P(V_rUTKKJQ%6YVfNiV< jnh$jLk5Z6c1@%fK1?pDn5J|9~fEBf0P(W7Tt!||bk%U;U4%N#8RSK2^0OW}{CjbBd diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_73.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_73.csv deleted file mode 100644 index 6e8fbb5507c2d718375126f1a053c1692f402353..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 810 
zcmb_aO-tNh5PpIeeMRUW5JFILFrk}Wzi%$0Qg=OcPgPX1Uw*|z_QQj+^mp~xqko|H z9;B6}P$-ce#Q4m_^{NL4-ppj?d1s!Pd4pof1bY5jZxwg&!3wUOMiL!{&T7F^t4?cR z_Tr>xhxTbViTBM5BoK&aE3V;KAIIG0!He{Bi^bi`FmhMzf#V0Rm06Y934hYJteJ7Q z-)sB9qMJR&2yu=UJnAFrDHwc0f-df0<2xc;GSXq>ikK1#im2j&k%-<7y+tgjcY}W+ zV*KL$6K1fn{#r5L|KX?U1Z9pVP19kTPQNPy^l?ExF?wV=|IDg(dwfjx=H1LPoMg{*^cQmzSMciOgrKSrq z-wXv5;}{dMa#JesZk4u7JS2CSc}Z_?;TC7gPqLpV>pD>#vXLF+HOlZ+sB-FZ>oV#ytEZ?l%J)k%*y?g(fCdJbPy<6* zVvc%{fGqX$D(ePBg&7*}wo7QZGx)Q4dm3 zh>MJi%Z~#AcQEM#Cgar0+;Y_IYn8Z^s=+F)6rk+J9Ca&&|CT5KYzRo-{|0131Q-|? fLR{TLtiXU#mes(}z)+sc&_Kt~z(7e!i3<(@`O7VJ diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_76.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_76.csv deleted file mode 100644 index db289c458b9c336b9a248463a9c19135c02de26b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 232 zcmY+5F$;oF6ov0i8@e>KH}r7mEJbdiL$#O$*&r@04~&Kep-qTK*q;qr`VHk1i#o%1 z50~>*+=lL-#(@oy^Z0)F2p;i}n8+pe0e4{p`8ZVA?5Ze^QcQ5b+@Q=0gi2|Canj|P zgCDcQe(f=GTK#*WnOZNYy}j0FZ4)=#HqSk#3FSa1@$rmxP$>1Bpc+!Cn{MP>L?}W>nJcN XFt7+z8yGO+0Fw0v`H4kkAS=`Xx~vdA diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_8.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_8.csv deleted file mode 100644 index 2ee586557b168beb83a0b72dc5c2dd9524fee86b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 175 zcmXpsGUhUb;{3c+Ae)h&Sv^IaQJztngNcEGg^AIC#jcu5JyG3CLEY1@-qqbTNL@!g zN!_Yiy(}>YC@K&a83zUKVA==5Q!jJNQMa#E;!>(s2dRLlhVUV51?)Px3=NbF4Ge&k PfdGR6BakxCfm#3n>K`lu diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_80.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_80.csv deleted file mode 100644 index ca8c67c89c03d488aab455b4f0c004ac5d89b3ce..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 85 zcmdOrVE_R^Wd(a)E(cyNM|GY0vc#g~jKnH+9R>9=0|-dY&#^=Tx|RwI3Jfd))gal# MqB5Wf2o{4n0M7Ce0ssI2 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_81.csv 
b/data/csv/afl/20250211_csv_fuzz_crash/case_81.csv deleted file mode 100644 index 7651212a6534a6397efc9b23f35795754ce6a0c2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 101 zcmdO500Ti~1^c614!m3`KyF!LQF2CNk-Cn8dYJ))GysF-{G9kiB`ZT6rQCcS1qKBM U7Ig7ssJK8iNG7qU3}m`G02k5}>Hq)$ diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_82.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_82.csv deleted file mode 100644 index 9b792cfb3b325c840c4ef2387a1242f9fd6acf6f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 90 zcmdO500Ti~1$$mD2VO2mb)EXMKm(pA(;`WM!zMl$@`lz@Wgu nB2aB$U}&JkWoT(>Z3t9kt(2&sq{L+nCczR028l&wAk)05lLN85g%54FDB` zfqI!+j=FuV5|>i7dYNmpy48J%aX?9k3b2dg;t=W-z@7pd3Q+>oY*np}2n;k9C~P1; zMG8Ddkn7@n@PasSsQlr|06Vk>M8<-drKP3rAwlX44R09@MHvkhjSLOA3=It#7#iw< L6cBK6#l-;tSX67Y diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_85.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_85.csv deleted file mode 100644 index c6030efb1e9a477b11fbc9f696eead2dcdb95796..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 449 zcmZQ%!~+MT|Y zwMtw{)#_!5IqFsl$R>l#R7%dziBD9rGSo3NG*VSc7RZTDQnLDA&!nCRRITpmSMTa( z2?ro&<;TU@#l^YD@$%}z#lZ}yB+yWIkjZgz>Sb;@>h@6EU7OXd?gQCizzrt#nblL& z8RZ$J8SbMv4`>4rfDHj^u&P!^1QLR!24W(@3@K0daFekcmbKOlkb*cS=m z5lydOZ+ifLQ1>!9=~Toj#Ujz?4d(<~W&mB_fl{r;H(LLSN)5P(tyj{(5WPMwmdV_o zlw=Y|<|%W)mNhKn#>tbiL-5%HWB-7&i>>Vw#`7zBH}PwXnz%#gL08?RAUuemIYJbQ zA=2QTusT@0cI{Q&U;7})wqZ6xPrWz8&c+%OMNzO`p-LY>ZJ@!JTunqur3sPkIF9W) Jt^`PH<~OUsNIU=l diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_87.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_87.csv deleted file mode 100644 index 7ab38f0bf7bc3bff1f0711631fcaf5476681efc4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 397 zcmZvYu?oUK42JKtivdSn9IRb3Nj*0!=%hsu2XS!;6dfD{r*=_k!3XW+DEI(^=3=db zenS#2cmJQ9)_*x-GPz)opFIGuHwMiwiOTTv%B3|Ggd1DTZTNJ{%qnjJt&~&wLH7`y 
zXIjAs0~cqDbTS>3Sc?PEh$y6by<473Q8d{!w9Z7BO!qkE!M(S;x=IGP*JK30id)oe z-BLMk8ST+`WeIj*MC9xZ|4@FwcmG0L4D2*S&y-CbuoJP9LK Nz!;-cmU3bMh+ZVgO`!k) diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_88.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_88.csv deleted file mode 100644 index df6772439a27db84637baff573059fb93c79eb8a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 412 zcmZQ%!~+nOfj71f&nW=1_l-;48UlhuFmqm zo=H7X-AX~-)34su%@PhkHs;60*~P`V$MN#&g2k1%l&aOs5_5pckzC{sv|GK*El1tH z7OWu7wOQTjK9G$B^qJLD)EVU&r5Wy{n2;a05~v$wLL5+oRkb?8|7a{ka3F;jBgk2C zabS!7aAkmuiK_vVV4}3N)IB6fouT0^qoF9Hp`wwY0hghnAp=80J&*zdF0QyZ0AIXL AWB>pF diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_89.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_89.csv deleted file mode 100644 index 1d9e2996dc5bb6a504e06eb73aefc34b2cd6325a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 393 zcmZQ%!~+di7_hx diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_90.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_90.csv deleted file mode 100644 index 09ac5761f6569eee79063051e37ebd8982cd64b5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 421 zcmZQ%!~+cG^oQgD8xm^L4kWRh~WbwV6t$U3kIwhaT%)4 z^1q%*JyG3CLEY1@-qp<#4nS7q$Hm#j#kt4v^6J9H!3?M*&{B7h&2e$+Wo|j@_O(h} zO4aIRuFXmREiHNBTC|O8TwQ}h%z*;h#`l2=5_5nSf;|DW0SLf`05w}xt0Mveja3Hs zKRnHXJ^AA@B*svNf2_{NQOWi|)fFbvm(NL7pP|?WHfXmR(kb$A09!LQJ I7gt;y0EBQ(Gynhq diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_91.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_91.csv deleted file mode 100644 index c56c566235daff6282e7edaf8460a7a91e20b024..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 446 zcmZ{gu?oU45I}S46zr&rgWBaLv2klbowNwzATBNkMF)q1Q@g0N;J5i5PLBS8pqJLz zqWFf~kxMQw2k?(NfVh2%ZnGd3DVjou*7U~TaB{#n2+`|g`+K6fE}FxiC=C}7qE`6< z9K_l)sbIjZaJEP$Q}>N`KsI?6Hci(ZvID<0BHJg+>Y1Z?)Ph`O{A?wRry-6{wzOGx 
zaz^jUd5L}%w+L-$iR<@oOcTD4dz3UvX)&PE1VQT%We+{%E^|5Zsx9=$5ObwmlA9aIBUcc2`ploBFMI51KYy#UdDRX6|u diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_92.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_92.csv deleted file mode 100644 index 5627769cf6a13456eeb7299ca9274a7dceda4f4c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 397 zcmZQ%L|TGbDe* zn23;JcnbF+Bgp!=IIuteaAkmuX<%Sr0MQ^&T3YHJ5~R-1@Rrd~l+jSp$k2ex(9n>9 Op`ji~0Rb0RTpR!wTuKrE diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_93.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_93.csv deleted file mode 100644 index c343ed6dcf8cf074a4667dadc9ffe705853a92c9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 442 zcmaFOhzBq{g-cqBAwU^JgE~w?AuckG9N?Y|(&YmpU`D}dE-qm%FtB1|U|?aw0E`Ce z>MZ~3nbZ^2trXNf{pwxaEa3p;p8U8tySO;_I9^^|xHy;rl>}Ovm;=;XtHh;LtzHIZ zBYDIdWt%uJfk$jeH4X28-M_)FfI;gh*h;ZG<4K(0!Bup&}0O;25QS6 wt_+YdaW!BPOq7$qG!$htR5UU);4(BcWMF8h2U0-5#T6F^0Mb}o761SM diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_94.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_94.csv deleted file mode 100644 index fca0d7564746f6f3f2f7f86ec759c38b867ed89b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 444 zcmZQ%!~+ ztF!#CXHrj8w^C5|^s9GuvxEbX75Q;-c5!j;alE{`mX>;8)0DWl7?PB%jC2%~bOqsx zfefg6pgY__j){v?FLTRLx35*=QmR%jb8S|)x({RnK?BrP`poJn>Ws3C(hT>33KDaG zZUFlWXekhY4DkkPu&P!^gn<~U7f{`gqzuS}1Q#eIfZ>T@KF|q&xH7;N)qqJbQCeE+ j9ulO^(D0VgP?XV7(a6w%%h1q}fuW%uNC5#CS6mzba+Ol9 diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_95.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_95.csv deleted file mode 100644 index f651e896146e2d1865c3cd1cd92aed63bfc2a7c9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 264 zcmZQ%!~+6IK@o3Cm4Ja1sVLASTA$W%4gf@Cp=+CcdHX z+SlH@S0VlYo`aaqMr2T-7A!hakLaw5y;`(7u+;DcfHC=ja1f2AoM4Q;g}Xx;gx<#M zp)69OEm1_KxzxuUpE{9EBjrqlS(r2pfV2zDf(QEb-PZ1kS5v&8Sco~)d?5xNs13mm 
zjJ63s@CzIRe^q1tj}H=R9HQ3gC9iF>TaMgH%@HRmSB{q6uU$vFt_x6=ObjWB-cgrG AAOHXW diff --git a/data/csv/afl/20250211_csv_fuzz_crash/case_97.csv b/data/csv/afl/20250211_csv_fuzz_crash/case_97.csv deleted file mode 100644 index 777ca34bdcff0af731fa9384ca0b517fe5401b8c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 373 zcmZQ%!~+WuP?(hT<@_E;%^-32rX z2;4yg&;XztU_QX05nh4^0vjVE$W3u^Knu9I{%~c0jESoOlVDN zaSRL$|9!yhIQ25O9Cd|SB`&3EbtIJ_NuVycI7n9Yf+^5LW?7pspf@WQMm4i3~_KiZTiq8Wk&ZDhmywRHjtMW<7XafL3K0MR diff --git a/data/csv/afl/20250211_csv_fuzz_hangs/case_1.csv b/data/csv/afl/20250211_csv_fuzz_hangs/case_1.csv deleted file mode 100644 index f685be9af1f97706e3dd0225bdce240b3aa935c4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 42 wcmb2|=HPgEi9dlQm4TO4ASFMyK<|H2R+3JoE|;N>k&ZDh7m&~~;pO@Q00LACPyhe` diff --git a/data/csv/afl/20250211_csv_fuzz_hangs/case_10.csv b/data/csv/afl/20250211_csv_fuzz_hangs/case_10.csv deleted file mode 100644 index fbd977e626f66e1b4358e2d45ba0db4ec129386e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4922 zcmY%8)Mc>M<)kKHU}Ru`nx|ft$oQYpNA-m+H&ld?kI^86YZQ)#zz7ZjM-b4lG&MH< z|DRzrGmM77Xb6mkz-R~z;}9?##(o>Y75Qkaj)uT!2n^v6h>J_k7%jg>Ltr!n=oSL} KbhC-JM<)kKHU}Ru`nx|ft$oQYpNA-m+H&ld?kI^86Yv{m)1S6fHrW`02Y^mPyhe` diff --git a/data/csv/afl/20250211_csv_fuzz_hangs/case_12.csv b/data/csv/afl/20250211_csv_fuzz_hangs/case_12.csv deleted file mode 100644 index 80bfdbe5ee3434edd31feccc610c2b9b75f64a70..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4922 zcmY%8)Mc>M<)kKHU}Ru`nx|ft$oQYpNA-m+H&ld?kI^86YZQ)#z-S1JhQMeDjE2By z2#kinXb6mkz-S1JhQMeDjE2B~h5)a!0h1tyYmmB*y0!6WH+3`wMnhmU1h9ty086WS A4FCWD diff --git a/data/csv/afl/20250211_csv_fuzz_hangs/case_13.csv b/data/csv/afl/20250211_csv_fuzz_hangs/case_13.csv deleted file mode 100644 index de4bb8a6bff7dabefc48c6d5a96f206c04ceeb11..0000000000000000000000000000000000000000 
GIT binary patch literal 0 HcmV?d00001 literal 4915 zcmY%8)Mc>M<)kKHU}Ru`nx|ft$oQYpNA-m+H&ld?kI^86YZOu?1QK)9trQeSLvu6) zMneE!2z;eVZo_B8sN84>jE2By2#kinXb6mk05Al?M?+yW1V%$(Gz3ONU^E2i5CWNW JumjqQ1OTRodV~M~ diff --git a/data/csv/afl/20250211_csv_fuzz_hangs/case_14.csv b/data/csv/afl/20250211_csv_fuzz_hangs/case_14.csv deleted file mode 100644 index 77660389cae4170b67750ccbc52c48b93feedb41..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4918 zcmY%8)Mc>M<)kKHU}Ru`nx|ft$oQYpNA-m+H&ld?kI^86Yw*I*LF~4{8)}rg%+SDq z3ms5u=BWPB5Eu=C(GVC7fzc2c4S~@R7!3jXgaD(Vp@E?Rm;Go%dNc$^Ltr!nMnhoO Sg#fsKH#E?pPlYlFZ2AB z6$|lviK&7%spoAqGjHB5KUlvN;v{S}l9{^9Y^a$!@R_p7l)YLG6i~H(&7PN>Cdn`< zbe0T`4IbZ=T~^jcHq!ln<>`4%J#^kX_iAOqOvdM-T{pAu91hn=5wz!ew(GOHVAY1k zK86_K6&{|RGfX+T<>U?lRuPqsLwq6QNjVA7#ah|jW#y3Lh5MIG@UVjkZdm0Q)7Ptf z+ql99y*V9Qh~qfNQ5iMjQeh5gtS&N(eSFeeSIXBqIJKmrftDi#ni42J$6soiCwoFb b2nYcoAOwVf5D)@FKnMr{As_^VK%Kx3Qon$J diff --git a/data/csv/afl/20250211_csv_fuzz_hangs/case_16.csv b/data/csv/afl/20250211_csv_fuzz_hangs/case_16.csv deleted file mode 100644 index 8c2175321d6744c5d2956ef1a31c28737719997f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4911 zcmeH@Jxc>Y6h-e8ikT2Q`ym*n*@c}LEdnkmk|JOVlSYeyDOOR|iR7zL1pk_UB`JbM zO2tAnFPp7|4QT!DRB!K@H_NG&LL7&se??DRs+*!Gi;n>E)oWNOdueUou5x927aA`M zVfKVEk%?rtFiLJq%5(L-`Xu&u^mud~nj`)5?cs2b1VLr1Vd^!SZfL4NqK!U=c!R?0 zEX9PH0X2^ZaDXTy&hdqmPC`w9Iu5h1Et(uW=5+s-HYl87j0c)LM#MF8?i6?UAl4Ic zf;f&nI@zirU0T5bf%!wa+{Guczmw44IykW_qGE^KumLu}1~wU(PB*pOu>m%~2G{@_ TU;}J`4X^<=zy{dBUkxk)`c#8$ diff --git a/data/csv/afl/20250211_csv_fuzz_hangs/case_2.csv b/data/csv/afl/20250211_csv_fuzz_hangs/case_2.csv deleted file mode 100644 index 3b6e26bb6ace46abb7a240b441c76e3ec9b4d698..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 42 xcmb2|=HPgEi9dlQm4TO4ASFMyK<|H2R)bE9E|-9gk&ZDhmjHvYjtMWk9z+ZxEUQ diff --git a/data/csv/afl/20250211_csv_fuzz_hangs/case_5.csv 
b/data/csv/afl/20250211_csv_fuzz_hangs/case_5.csv deleted file mode 100644 index 2709f85fe7d427752072bd85572b411066693fe8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 42 wcmb2|=HTE@czB5=g@Knk&ZDh7m&~~;pO@Q0MCpGCjbBd diff --git a/data/csv/afl/20250211_csv_fuzz_hangs/case_6.csv b/data/csv/afl/20250211_csv_fuzz_hangs/case_6.csv deleted file mode 100644 index 95986aca82e796b51a0dd27b2717e2e6a47f5b30..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 63 zcmb2|=HPh9z`zjV>K?+&`mymXqoF9Hp`n4H0hghHj-i3M<)kKHU}Ru`nx|ft$oQYpNA-m+H&ld?kI^86YZQ)#z-S1JhQMeDjE2By z2n?bS;2uP78EqxdCj>Z0<9sv(MnhmU1V%$(Gz3ONfNCM2MKzP?(+(I`wiM99Hvk-* BcuxQT diff --git a/data/csv/afl/20250211_csv_fuzz_hangs/case_8.csv b/data/csv/afl/20250211_csv_fuzz_hangs/case_8.csv deleted file mode 100644 index 768f41dcd7bb32a6e808b795f8f361087bbaab3b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4909 zcmY%8)Mc>M<)kKHU}Ru`nx|ft$oQYpNA-m+H&ld?kI^86iw;nV4t7w_9}M->Gi%hO z(GVC7fzc2c4S~@R7!85Z5Eu=C(GVC70dhj1fq@)dqk2X|U^E0qLx3tF5K9$PhSXTd H;Po*8A3S(( diff --git a/data/csv/afl/20250211_csv_fuzz_hangs/case_9.csv b/data/csv/afl/20250211_csv_fuzz_hangs/case_9.csv deleted file mode 100644 index 2897497b6cfff1bc9d4a7d5fb1055c2c4ec01fe7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4920 zcmY%8)Mc>M<)kKHU}Ru`nx|ft$oQYpNA-m+H&ld?kI^86YZQ)#z-S1JhQMeDjE2BS l3W3pzZ!`o(Ltr!nMnhmU1V%$(Gz3ONU^E0qLtrR|0037gdGi1O diff --git a/data/csv/afl/4172/case_1.csv b/data/csv/afl/4172/case_1.csv deleted file mode 100644 index 47200b6b4d2ec0e76ba53d1168e9d63a90edfebc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2398 zcmeHI-H+Nv5Z_dZbVkZ^+qV&Y@C%?_^KsxlUDE4^)TGUYM%Sv+X|NaAlkw8^`sn2p z+Jt;)lgjk!KN5?JqPK7B;@%5ewmZ<4q#_^Qh1X$RXFz^m*GUlp zS=^vWmT1D(tCRcTpZ2H}fvr*xcIgIHYC)4supfnsGdPGkct@l)R&zUr78gHUp4_8K 
z{_~bG@om%^GWqCy^7zS%;RSy-n4Vw$F}VEx^4DhL&V-iGPMNwCXZWI9* zG3T-1LBykemk+vp1Y#8P(>NLhj=$zR;<{^JUF*{1n5mcY%9NRG$IzY=1O-)J0_G_c z@N4u2tJ~O&pEG^8nu}dh|ItRN{1tdDRDwnwhgj{YQM>k*&PNce# zDX1JWHIv5PdLPthl~Dasu$mYotc_mlLU}8@MB>RZSWz5EQ6)xtUHMmuSr2l(fepn< z5}7Y>QPCqToEjwRMrE);dI@u-mX3N9H|L}QUTI3u~fO0$1$Pb zop!qakws$E+Lg+YIwd2ss+i4kZ^Lkzjz!;T?G6j}Hz1_DD}eTd1&aLfJZ;BaKn5`+$p4dt;Q9T|J-g zO+yqe-6Mq4$9{v@bMo3 zxBRprM@Qg8Ek1k6I;_t)d&pYs346mb`buU9jo6TF$L{DldCFSy<>KCM9zLSw_ZOI$$nWh?PQSUWtx^dWI3ug0jS-WQYN*|NpaQBo=`z llKP{jqn=@?lVQYVW}s8fz{J47rEjQHo)0w0%*3xe9{^6dIeh>C diff --git a/data/csv/afl/4172/case_3.csv b/data/csv/afl/4172/case_3.csv deleted file mode 100644 index 691f0c0e161588d33fb6b4be9499800244f0176c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 257 zcmY+8JqyA>42JKvl0k6Q<>=ro0nMG(a+`>#i$f{yR~0H$3&jsa{89craq#*fXdv*0 zym{XE$MfN~Yu$Z858nuSN+aYtvq+FYlG(o|PJZQwAOCbQi6jb{ycl~eSMh8v6_ek!>Li(Iy*kN@>7qgxHAv@#Ofaa<5%%v8e&0 pv6!T`D1$oh9?Fz3RS*pDnwO|PH6l=rP#Rrw-{?SpSHsmc{{U@iK(+t? 
diff --git a/data/csv/afl/4172/case_5.csv b/data/csv/afl/4172/case_5.csv deleted file mode 100644 index 708aee93cd765054732ddca101ee10bb800d8376..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 240 zcmXrE&dJP6HI6s3W|TJKGE7WDB_MnbMg|CAWnwJKN#QcH;4-t+DaQvAi;|M8ITKrr zb<{Hqbux^&%nWqO^TC=8VGJW(sE9F4#017L Date: Sat, 1 Feb 2025 14:18:37 -0500 Subject: [PATCH 137/142] Better comment in optimizer.cpp --- src/optimizer/optimizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/optimizer/optimizer.cpp b/src/optimizer/optimizer.cpp index dc1ddfa59224..8c16e83a6c62 100644 --- a/src/optimizer/optimizer.cpp +++ b/src/optimizer/optimizer.cpp @@ -118,7 +118,7 @@ void Optimizer::RunBuiltInOptimizers() { // this does not change the logical plan structure, but only simplifies the expression trees RunOptimizer(OptimizerType::EXPRESSION_REWRITER, [&]() { rewriter.VisitOperator(*plan); }); - // transform ORDER BY + LIMIT to TopN + // Rewrites SUM(x + C) into SUM(x) + C * COUNT(x) RunOptimizer(OptimizerType::SUM_REWRITER, [&]() { SumRewriterOptimizer optimizer(*this); optimizer.Optimize(plan); From aadd5438b4bf7897ca9ee8e1e02de06b5728a6b9 Mon Sep 17 00:00:00 2001 From: Richard Wesley <13156216+hawkfish@users.noreply.github.com> Date: Tue, 18 Feb 2025 16:46:09 +1300 Subject: [PATCH 138/142] Issue #16250: Window Range Performance * Check hints for equality and skip search. 
fixes: duckdb/duckdb#16250 fixes: duckdblabs/duckdb-internal#4229 --- .../window/window_boundaries_state.cpp | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/function/window/window_boundaries_state.cpp b/src/function/window/window_boundaries_state.cpp index 6ee3c105234d..a4b034441d38 100644 --- a/src/function/window/window_boundaries_state.cpp +++ b/src/function/window/window_boundaries_state.cpp @@ -211,15 +211,23 @@ static idx_t FindTypedRangeBound(WindowCursor &range_lo, WindowCursor &range_hi, if (prev.start < prev.end) { if (order_begin < prev.start && prev.start < order_end) { const auto first = range_lo.GetCell(0, prev.start); - if (!comp(val, first)) { - // prev.first <= val, so we can start further forward + if (FROM && !comp(val, first)) { + // If prev.start == val and we are looking for a lower bound, then we are done + if (!comp(first, val)) { + return prev.start; + } + // prev.start <= val, so we can start further forward begin += UnsafeNumericCast(prev.start - order_begin); } } if (order_begin < prev.end && prev.end < order_end) { const auto second = range_hi.GetCell(0, prev.end - 1); if (!comp(second, val)) { - // val <= prev.second, so we can end further back + // If val == prev.end and we are looking for an upper bound, then we are done + if (!FROM && !comp(val, second)) { + return prev.end; + } + // val <= prev.end, so we can end further back // (prev.second is the largest peer) end -= UnsafeNumericCast(order_end - prev.end - 1); } @@ -943,6 +951,11 @@ void WindowBoundariesState::FrameEnd(DataChunk &bounds, idx_t row_idx, const idx } else { const auto valid_start = valid_begin_data[chunk_idx]; prev.start = valid_start; + const auto cur_partition = partition_begin_data[chunk_idx]; + if (cur_partition != prev_partition) { + prev.end = valid_end; + prev_partition = cur_partition; + } window_end = FindOrderedRangeBound(*range_lo, *range_hi, range_sense, valid_start, row_idx + 1, end_boundary, boundary_end, 
chunk_idx, prev); prev.end = window_end; From aa82eb90336db14590c68eebb3b43fbdf3576953 Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Tue, 18 Feb 2025 10:13:16 +0100 Subject: [PATCH 139/142] change string hash function again, now inlined strings are hashed branchlessly --- src/common/types/hash.cpp | 45 ++++++++++++++++++------ test/sql/function/generic/hash_func.test | 44 +++++++++++------------ 2 files changed, 57 insertions(+), 32 deletions(-) diff --git a/src/common/types/hash.cpp b/src/common/types/hash.cpp index 9a9fd5daf9e8..de0233b0f224 100644 --- a/src/common/types/hash.cpp +++ b/src/common/types/hash.cpp @@ -78,6 +78,35 @@ hash_t Hash(const char *str) { template <> hash_t Hash(string_t val) { + // If the string is inlined, we can do a branchless hash + if (val.IsInlined()) { + // This seed slightly improves bit distribution, taken from here: + // https://github.com/martinus/robin-hood-hashing/blob/3.11.5/LICENSE + // MIT License Copyright (c) 2018-2021 Martin Ankerl + hash_t h = 0xe17a1465U ^ (val.GetSize() * 0xc6a4a7935bd1e995U); + + // Hash/combine the first 8-byte block + h ^= Load(const_data_ptr_cast(val.GetPrefix())); + h *= 0xd6e8feb86659fd93U; + + // Load remaining 4 bytes + hash_t hr = 0; + memcpy(&hr, const_data_ptr_cast(val.GetPrefix()) + sizeof(hash_t), 4U); + + // Process the remainder the same an 8-byte block + // This operation is a NOP if the string is <= 8 bytes + const bool not_a_nop = val.GetSize() > sizeof(hash_t); + h ^= hr; + h *= 0xd6e8feb86659fd93U * not_a_nop + (1 - not_a_nop); + + // This is just an optimization. 
It should not change the result + // This property is important for verification (e.g., DUCKDB_DEBUG_NO_INLINE) + // We achieved this with the NOP trick above (and in HashBytes) + h = Hash(h); + D_ASSERT(h == Hash(val.GetData(), val.GetSize())); + + return h; + } return Hash(val.GetData(), val.GetSize()); } @@ -98,22 +127,18 @@ hash_t HashBytes(const_data_ptr_t ptr, const idx_t len) noexcept { h *= 0xd6e8feb86659fd93U; } - // Load and process remaining (<8) bytes + // Load remaining (<8) bytes hash_t hr = 0; memcpy(&hr, ptr, len & 7U); - hr *= 0xd6e8feb86659fd93U; - hr ^= h >> 32; - // XOR with hash + // Process the remainder same as an 8-byte block + // This operation is a NOP if the number of remaining bytes is 0 + const bool not_a_nop = len & 7U; h ^= hr; + h *= 0xd6e8feb86659fd93U * not_a_nop + (1 - not_a_nop); // Finalize - h *= 0xd6e8feb86659fd93U; - h ^= h >> 32; - - return h; - - // return Hash(h); + return Hash(h); } hash_t Hash(const char *val, size_t size) { diff --git a/test/sql/function/generic/hash_func.test b/test/sql/function/generic/hash_func.test index 44ca5113eb87..0427e0d0ee6c 100644 --- a/test/sql/function/generic/hash_func.test +++ b/test/sql/function/generic/hash_func.test @@ -44,9 +44,9 @@ CREATE TABLE structs AS query II SELECT s, HASH(s) FROM structs ---- -{'i': 5, 's': string} 5041354121594313779 +{'i': 5, 's': string} 312378390946197788 {'i': -2, 's': NULL} 13311620765177879553 -{'i': NULL, 's': not null} 17669771151474316850 +{'i': NULL, 's': not null} 12187543307399756733 {'i': NULL, 's': NULL} 18212156630472451589 NULL 18212156630472451589 @@ -76,11 +76,11 @@ NULL 13787848793156543929 query II SELECT lg, HASH(lg) FROM lists ---- -[TGTA] 17595328716338797054 -[CGGT] 10306172129632853293 -[CCTC] 13297701768986389650 -[TCTA] 12532519228232631318 -[AGGG] 18327401687889337414 +[TGTA] 2473061308111828075 +[CGGT] 17252230290449032892 +[CCTC] 12469451733100292545 +[TCTA] 16441147910138644840 +[AGGG] 6734708784738468094 NULL 
13787848793156543929 # Maps @@ -98,11 +98,11 @@ CREATE TABLE maps AS query II SELECT m, HASH(m) FROM maps ---- -{1=TGTA} 12831981919938534237 -{1=CGGT, 2=CCTC} 13475482557019497469 +{1=TGTA} 7235425910004250312 +{1=CGGT, 2=CCTC} 1011047862598495049 {} 13787848793156543929 -{1=TCTA, 2=NULL, 3=CGGT} 6801514312074335687 -{1=TGTA, 2=CGGT, 3=CCTC, 4=TCTA, 5=AGGG} 1967491966533763128 +{1=TCTA, 2=NULL, 3=CGGT} 6001596667924474868 +{1=TGTA, 2=CGGT, 3=CCTC, 4=TCTA, 5=AGGG} 16287978232011168685 NULL 13787848793156543929 statement ok @@ -189,17 +189,17 @@ SELECT r, HASH() FROM enums; query II SELECT r, HASH(r, 'capacitor') FROM enums; ---- -black 10215506564763180114 -brown 14699666407584440049 -red 10435339440036763924 -orange 7449326894723801922 -yellow 7545557152300511399 -green 13515514493392674532 -blue 16730185616673645170 -violet 6167961171085770869 -grey 10019148715359395841 -white 8224352891729695362 -NULL 14853453776375799790 +black 16797622758688705282 +brown 12620868779234625953 +red 17584344400128560708 +orange 268160620305560594 +yellow 895888387990267895 +green 16089427619650030004 +blue 10156864916169405730 +violet 3549084991787980581 +grey 17281098274178594641 +white 1655957553588749778 +NULL 12320705626460735678 query II SELECT r, HASH('2022-02-12'::DATE, r) FROM enums; From 12e96f6ce5ef0f0c334cfa624f4d6854b38644b5 Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Tue, 18 Feb 2025 10:52:12 +0100 Subject: [PATCH 140/142] some more fast paths --- .../writer/templated_column_writer.hpp | 81 +++++++++++-------- .../types/column/column_data_collection.cpp | 19 +++-- 2 files changed, 61 insertions(+), 39 deletions(-) diff --git a/extension/parquet/include/writer/templated_column_writer.hpp b/extension/parquet/include/writer/templated_column_writer.hpp index 027af57fe6c5..1ace63726af5 100644 --- a/extension/parquet/include/writer/templated_column_writer.hpp +++ b/extension/parquet/include/writer/templated_column_writer.hpp @@ -263,6 +263,49 @@ class 
StandardColumnWriter : public PrimitiveColumnWriter { void WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats, ColumnWriterPageState *page_state_p, Vector &input_column, idx_t chunk_start, idx_t chunk_end) override { + const auto &mask = FlatVector::Validity(input_column); + if (mask.AllValid()) { + WriteVectorInternal(temp_writer, stats, page_state_p, input_column, chunk_start, chunk_end); + } else { + WriteVectorInternal(temp_writer, stats, page_state_p, input_column, chunk_start, chunk_end); + } + } + + void FlushDictionary(PrimitiveColumnWriterState &state_p, ColumnWriterStatistics *stats) override { + auto &state = state_p.Cast>(); + D_ASSERT(state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY); + + state.bloom_filter = + make_uniq(state.dictionary.GetSize(), writer.BloomFilterFalsePositiveRatio()); + + state.dictionary.IterateValues([&](const SRC &src_value, const TGT &tgt_value) { + // update the statistics + OP::template HandleStats(stats, tgt_value); + // update the bloom filter + auto hash = OP::template XXHash64(tgt_value); + state.bloom_filter->FilterInsert(hash); + }); + + // flush the dictionary page and add it to the to-be-written pages + WriteDictionary(state, state.dictionary.GetTargetMemoryStream(), state.dictionary.GetSize()); + // bloom filter will be queued for writing in ParquetWriter::BufferBloomFilter one level up + } + + idx_t GetRowSize(const Vector &vector, const idx_t index, + const PrimitiveColumnWriterState &state_p) const override { + auto &state = state_p.Cast>(); + if (state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY) { + return (state.key_bit_width + 7) / 8; + } else { + return OP::template GetRowSize(vector, index); + } + } + +private: + template + void WriteVectorInternal(WriteStream &temp_writer, ColumnWriterStatistics *stats, + ColumnWriterPageState *page_state_p, Vector &input_column, idx_t chunk_start, + idx_t chunk_end) { auto &page_state = page_state_p->Cast>(); const auto &mask = 
FlatVector::Validity(input_column); @@ -286,7 +329,7 @@ class StandardColumnWriter : public PrimitiveColumnWriter { } for (; r < chunk_end; r++) { - if (!mask.RowIsValid(r)) { + if (!ALL_VALID && !mask.RowIsValid(r)) { continue; } const auto &src_value = data_ptr[r]; @@ -313,7 +356,7 @@ class StandardColumnWriter : public PrimitiveColumnWriter { } for (; r < chunk_end; r++) { - if (!mask.RowIsValid(r)) { + if (!ALL_VALID && !mask.RowIsValid(r)) { continue; } const TGT target_value = OP::template Operation(data_ptr[r]); @@ -340,7 +383,7 @@ class StandardColumnWriter : public PrimitiveColumnWriter { } for (; r < chunk_end; r++) { - if (!mask.RowIsValid(r)) { + if (!ALL_VALID && !mask.RowIsValid(r)) { continue; } const TGT target_value = OP::template Operation(data_ptr[r]); @@ -351,7 +394,7 @@ class StandardColumnWriter : public PrimitiveColumnWriter { } case duckdb_parquet::Encoding::BYTE_STREAM_SPLIT: { for (idx_t r = chunk_start; r < chunk_end; r++) { - if (!mask.RowIsValid(r)) { + if (!ALL_VALID && !mask.RowIsValid(r)) { continue; } const TGT target_value = OP::template Operation(data_ptr[r]); @@ -374,36 +417,6 @@ class StandardColumnWriter : public PrimitiveColumnWriter { throw InternalException("Unknown encoding"); } } - - void FlushDictionary(PrimitiveColumnWriterState &state_p, ColumnWriterStatistics *stats) override { - auto &state = state_p.Cast>(); - D_ASSERT(state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY); - - state.bloom_filter = - make_uniq(state.dictionary.GetSize(), writer.BloomFilterFalsePositiveRatio()); - - state.dictionary.IterateValues([&](const SRC &src_value, const TGT &tgt_value) { - // update the statistics - OP::template HandleStats(stats, tgt_value); - // update the bloom filter - auto hash = OP::template XXHash64(tgt_value); - state.bloom_filter->FilterInsert(hash); - }); - - // flush the dictionary page and add it to the to-be-written pages - WriteDictionary(state, state.dictionary.GetTargetMemoryStream(), 
state.dictionary.GetSize()); - // bloom filter will be queued for writing in ParquetWriter::BufferBloomFilter one level up - } - - idx_t GetRowSize(const Vector &vector, const idx_t index, - const PrimitiveColumnWriterState &state_p) const override { - auto &state = state_p.Cast>(); - if (state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY) { - return (state.key_bit_width + 7) / 8; - } else { - return OP::template GetRowSize(vector, index); - } - } }; } // namespace duckdb diff --git a/src/common/types/column/column_data_collection.cpp b/src/common/types/column/column_data_collection.cpp index a2480f44d624..e45228b8c219 100644 --- a/src/common/types/column/column_data_collection.cpp +++ b/src/common/types/column/column_data_collection.cpp @@ -417,12 +417,21 @@ static void TemplatedColumnDataCopy(ColumnDataMetaData &meta_data, const Unified // initialize the validity mask to set all to valid result_validity.SetAllValid(STANDARD_VECTOR_SIZE); } - for (idx_t i = 0; i < append_count; i++) { - auto source_idx = source_data.sel->get_index(offset + i); - if (source_data.validity.RowIsValid(source_idx)) { + if (source_data.validity.AllValid()) { + // Fast path: all valid + for (idx_t i = 0; i < append_count; i++) { + auto source_idx = source_data.sel->get_index(offset + i); OP::template Assign(meta_data, base_ptr, source_data.data, current_segment.count + i, source_idx); - } else { - result_validity.SetInvalid(current_segment.count + i); + } + } else { + for (idx_t i = 0; i < append_count; i++) { + auto source_idx = source_data.sel->get_index(offset + i); + if (source_data.validity.RowIsValid(source_idx)) { + OP::template Assign(meta_data, base_ptr, source_data.data, current_segment.count + i, + source_idx); + } else { + result_validity.SetInvalid(current_segment.count + i); + } } } current_segment.count += append_count; From b2bf61759c3da8d48eda777f8640b90e24435655 Mon Sep 17 00:00:00 2001 From: Laurens Kuiper Date: Tue, 18 Feb 2025 12:34:28 +0100 Subject: [PATCH 
141/142] fix hash function for empty strings and fix test output now that hash function is changed again --- src/common/types/hash.cpp | 7 +++++-- test/api/adbc/test_adbc.cpp | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/common/types/hash.cpp b/src/common/types/hash.cpp index de0233b0f224..604a6249f8b3 100644 --- a/src/common/types/hash.cpp +++ b/src/common/types/hash.cpp @@ -86,8 +86,9 @@ hash_t Hash(string_t val) { hash_t h = 0xe17a1465U ^ (val.GetSize() * 0xc6a4a7935bd1e995U); // Hash/combine the first 8-byte block + const bool not_an_empty_string = !val.Empty(); h ^= Load(const_data_ptr_cast(val.GetPrefix())); - h *= 0xd6e8feb86659fd93U; + h *= 0xd6e8feb86659fd93U * not_an_empty_string + (1 - not_an_empty_string); // Load remaining 4 bytes hash_t hr = 0; @@ -99,10 +100,12 @@ hash_t Hash(string_t val) { h ^= hr; h *= 0xd6e8feb86659fd93U * not_a_nop + (1 - not_a_nop); + // Finalize + h = Hash(h); + // This is just an optimization. It should not change the result // This property is important for verification (e.g., DUCKDB_DEBUG_NO_INLINE) // We achieved this with the NOP trick above (and in HashBytes) - h = Hash(h); D_ASSERT(h == Hash(val.GetData(), val.GetSize())); return h; diff --git a/test/api/adbc/test_adbc.cpp b/test/api/adbc/test_adbc.cpp index a624a66a7857..91a86b93b6dc 100644 --- a/test/api/adbc/test_adbc.cpp +++ b/test/api/adbc/test_adbc.cpp @@ -1364,8 +1364,8 @@ TEST_CASE("Test AdbcConnectionGetObjects", "[adbc]") { REQUIRE((res->ColumnCount() == 2)); REQUIRE((res->RowCount() == 3)); REQUIRE((res->GetValue(1, 0).ToString() == - "[{'db_schema_name': main, 'db_schema_tables': NULL}, {'db_schema_name': pg_catalog, " - "'db_schema_tables': NULL}, {'db_schema_name': information_schema, 'db_schema_tables': NULL}]")); + "[{'db_schema_name': pg_catalog, 'db_schema_tables': NULL}, {'db_schema_name': information_schema, " + "'db_schema_tables': NULL}, {'db_schema_name': main, 'db_schema_tables': NULL}]")); db.Query("Drop table 
result;"); AdbcConnectionGetObjects(&db.adbc_connection, ADBC_OBJECT_DEPTH_COLUMNS, nullptr, nullptr, nullptr, nullptr, From 6682a5272655e68a66be66303b3c847d8729b2bc Mon Sep 17 00:00:00 2001 From: xuke-hat Date: Wed, 19 Feb 2025 00:57:17 +0800 Subject: [PATCH 142/142] make ValidityMask::RowIsValidUnsafe really unsafe --- src/include/duckdb/common/types/validity_mask.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/duckdb/common/types/validity_mask.hpp b/src/include/duckdb/common/types/validity_mask.hpp index 05583cddd82f..89c39750d326 100644 --- a/src/include/duckdb/common/types/validity_mask.hpp +++ b/src/include/duckdb/common/types/validity_mask.hpp @@ -188,7 +188,7 @@ struct TemplatedValidityMask { D_ASSERT(validity_mask); idx_t entry_idx, idx_in_entry; GetEntryIndex(row_idx, entry_idx, idx_in_entry); - auto entry = GetValidityEntry(entry_idx); + auto entry = GetValidityEntryUnsafe(entry_idx); return RowIsValid(entry, idx_in_entry); }